{
"best_metric": 0.10675784200429916,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.5089058524173028,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002544529262086514,
"grad_norm": 2.0976202487945557,
"learning_rate": 1.0017e-05,
"loss": 0.4494,
"step": 1
},
{
"epoch": 0.002544529262086514,
"eval_loss": 0.6885223388671875,
"eval_runtime": 10.1676,
"eval_samples_per_second": 16.326,
"eval_steps_per_second": 4.131,
"step": 1
},
{
"epoch": 0.005089058524173028,
"grad_norm": 2.4291188716888428,
"learning_rate": 2.0034e-05,
"loss": 0.4714,
"step": 2
},
{
"epoch": 0.007633587786259542,
"grad_norm": 2.6177380084991455,
"learning_rate": 3.0050999999999997e-05,
"loss": 0.4572,
"step": 3
},
{
"epoch": 0.010178117048346057,
"grad_norm": 2.227078676223755,
"learning_rate": 4.0068e-05,
"loss": 0.3751,
"step": 4
},
{
"epoch": 0.01272264631043257,
"grad_norm": 1.521238923072815,
"learning_rate": 5.0085e-05,
"loss": 0.3044,
"step": 5
},
{
"epoch": 0.015267175572519083,
"grad_norm": 1.3237571716308594,
"learning_rate": 6.0101999999999995e-05,
"loss": 0.2047,
"step": 6
},
{
"epoch": 0.017811704834605598,
"grad_norm": 1.6091803312301636,
"learning_rate": 7.0119e-05,
"loss": 0.1256,
"step": 7
},
{
"epoch": 0.020356234096692113,
"grad_norm": 1.4948714971542358,
"learning_rate": 8.0136e-05,
"loss": 0.1606,
"step": 8
},
{
"epoch": 0.022900763358778626,
"grad_norm": 1.0482193231582642,
"learning_rate": 9.0153e-05,
"loss": 0.1225,
"step": 9
},
{
"epoch": 0.02544529262086514,
"grad_norm": 1.3892583847045898,
"learning_rate": 0.00010017,
"loss": 0.1892,
"step": 10
},
{
"epoch": 0.027989821882951654,
"grad_norm": 1.202789306640625,
"learning_rate": 9.964278947368421e-05,
"loss": 0.1744,
"step": 11
},
{
"epoch": 0.030534351145038167,
"grad_norm": 1.1245602369308472,
"learning_rate": 9.911557894736841e-05,
"loss": 0.0761,
"step": 12
},
{
"epoch": 0.03307888040712468,
"grad_norm": 1.1420965194702148,
"learning_rate": 9.858836842105263e-05,
"loss": 0.0954,
"step": 13
},
{
"epoch": 0.035623409669211195,
"grad_norm": 0.8061597943305969,
"learning_rate": 9.806115789473684e-05,
"loss": 0.1012,
"step": 14
},
{
"epoch": 0.03816793893129771,
"grad_norm": 1.2784687280654907,
"learning_rate": 9.753394736842106e-05,
"loss": 0.1408,
"step": 15
},
{
"epoch": 0.04071246819338423,
"grad_norm": 1.0279990434646606,
"learning_rate": 9.700673684210526e-05,
"loss": 0.1174,
"step": 16
},
{
"epoch": 0.043256997455470736,
"grad_norm": 1.0930790901184082,
"learning_rate": 9.647952631578948e-05,
"loss": 0.0984,
"step": 17
},
{
"epoch": 0.04580152671755725,
"grad_norm": 0.4375361502170563,
"learning_rate": 9.595231578947368e-05,
"loss": 0.0476,
"step": 18
},
{
"epoch": 0.04834605597964377,
"grad_norm": 0.5908359289169312,
"learning_rate": 9.542510526315789e-05,
"loss": 0.0612,
"step": 19
},
{
"epoch": 0.05089058524173028,
"grad_norm": 0.4914228916168213,
"learning_rate": 9.48978947368421e-05,
"loss": 0.0473,
"step": 20
},
{
"epoch": 0.05343511450381679,
"grad_norm": 1.2252209186553955,
"learning_rate": 9.437068421052632e-05,
"loss": 0.1821,
"step": 21
},
{
"epoch": 0.05597964376590331,
"grad_norm": 0.7145554423332214,
"learning_rate": 9.384347368421052e-05,
"loss": 0.0709,
"step": 22
},
{
"epoch": 0.058524173027989825,
"grad_norm": 0.2932605445384979,
"learning_rate": 9.331626315789474e-05,
"loss": 0.021,
"step": 23
},
{
"epoch": 0.061068702290076333,
"grad_norm": 0.628063440322876,
"learning_rate": 9.278905263157894e-05,
"loss": 0.0734,
"step": 24
},
{
"epoch": 0.06361323155216285,
"grad_norm": 0.48500677943229675,
"learning_rate": 9.226184210526316e-05,
"loss": 0.0503,
"step": 25
},
{
"epoch": 0.06615776081424936,
"grad_norm": 0.5622182488441467,
"learning_rate": 9.173463157894736e-05,
"loss": 0.0558,
"step": 26
},
{
"epoch": 0.06870229007633588,
"grad_norm": 0.5120857357978821,
"learning_rate": 9.120742105263159e-05,
"loss": 0.045,
"step": 27
},
{
"epoch": 0.07124681933842239,
"grad_norm": 0.12333207577466965,
"learning_rate": 9.068021052631579e-05,
"loss": 0.0056,
"step": 28
},
{
"epoch": 0.0737913486005089,
"grad_norm": 0.8002417087554932,
"learning_rate": 9.0153e-05,
"loss": 0.1115,
"step": 29
},
{
"epoch": 0.07633587786259542,
"grad_norm": 3.1628360748291016,
"learning_rate": 8.96257894736842e-05,
"loss": 0.661,
"step": 30
},
{
"epoch": 0.07888040712468193,
"grad_norm": 1.9972189664840698,
"learning_rate": 8.909857894736842e-05,
"loss": 0.3344,
"step": 31
},
{
"epoch": 0.08142493638676845,
"grad_norm": 1.6704767942428589,
"learning_rate": 8.857136842105263e-05,
"loss": 0.3315,
"step": 32
},
{
"epoch": 0.08396946564885496,
"grad_norm": 1.8845316171646118,
"learning_rate": 8.804415789473684e-05,
"loss": 0.3487,
"step": 33
},
{
"epoch": 0.08651399491094147,
"grad_norm": 2.4319205284118652,
"learning_rate": 8.751694736842105e-05,
"loss": 0.3297,
"step": 34
},
{
"epoch": 0.089058524173028,
"grad_norm": 2.7092981338500977,
"learning_rate": 8.698973684210527e-05,
"loss": 0.5175,
"step": 35
},
{
"epoch": 0.0916030534351145,
"grad_norm": 1.65862238407135,
"learning_rate": 8.646252631578948e-05,
"loss": 0.3395,
"step": 36
},
{
"epoch": 0.09414758269720101,
"grad_norm": 1.9453610181808472,
"learning_rate": 8.593531578947368e-05,
"loss": 0.3279,
"step": 37
},
{
"epoch": 0.09669211195928754,
"grad_norm": 1.8823449611663818,
"learning_rate": 8.54081052631579e-05,
"loss": 0.3059,
"step": 38
},
{
"epoch": 0.09923664122137404,
"grad_norm": 1.427621603012085,
"learning_rate": 8.48808947368421e-05,
"loss": 0.2756,
"step": 39
},
{
"epoch": 0.10178117048346055,
"grad_norm": 1.9383624792099,
"learning_rate": 8.435368421052631e-05,
"loss": 0.3652,
"step": 40
},
{
"epoch": 0.10432569974554708,
"grad_norm": 0.5723309516906738,
"learning_rate": 8.382647368421053e-05,
"loss": 0.0681,
"step": 41
},
{
"epoch": 0.10687022900763359,
"grad_norm": 1.692962884902954,
"learning_rate": 8.329926315789474e-05,
"loss": 0.2737,
"step": 42
},
{
"epoch": 0.10941475826972011,
"grad_norm": 5.488102436065674,
"learning_rate": 8.277205263157894e-05,
"loss": 0.3806,
"step": 43
},
{
"epoch": 0.11195928753180662,
"grad_norm": 2.793001890182495,
"learning_rate": 8.224484210526316e-05,
"loss": 0.3236,
"step": 44
},
{
"epoch": 0.11450381679389313,
"grad_norm": 0.8564298152923584,
"learning_rate": 8.171763157894736e-05,
"loss": 0.0697,
"step": 45
},
{
"epoch": 0.11704834605597965,
"grad_norm": 1.465958595275879,
"learning_rate": 8.119042105263158e-05,
"loss": 0.1509,
"step": 46
},
{
"epoch": 0.11959287531806616,
"grad_norm": 1.0620710849761963,
"learning_rate": 8.066321052631578e-05,
"loss": 0.0998,
"step": 47
},
{
"epoch": 0.12213740458015267,
"grad_norm": 0.551638126373291,
"learning_rate": 8.0136e-05,
"loss": 0.0336,
"step": 48
},
{
"epoch": 0.12468193384223919,
"grad_norm": 1.185890555381775,
"learning_rate": 7.960878947368421e-05,
"loss": 0.0431,
"step": 49
},
{
"epoch": 0.1272264631043257,
"grad_norm": 0.4987131953239441,
"learning_rate": 7.908157894736842e-05,
"loss": 0.017,
"step": 50
},
{
"epoch": 0.1272264631043257,
"eval_loss": 0.20289373397827148,
"eval_runtime": 10.1481,
"eval_samples_per_second": 16.358,
"eval_steps_per_second": 4.139,
"step": 50
},
{
"epoch": 0.1297709923664122,
"grad_norm": 1.5103840827941895,
"learning_rate": 7.855436842105262e-05,
"loss": 0.4364,
"step": 51
},
{
"epoch": 0.13231552162849872,
"grad_norm": 1.3223518133163452,
"learning_rate": 7.802715789473684e-05,
"loss": 0.3782,
"step": 52
},
{
"epoch": 0.13486005089058525,
"grad_norm": 1.2647677659988403,
"learning_rate": 7.749994736842104e-05,
"loss": 0.401,
"step": 53
},
{
"epoch": 0.13740458015267176,
"grad_norm": 0.7807660102844238,
"learning_rate": 7.697273684210526e-05,
"loss": 0.1596,
"step": 54
},
{
"epoch": 0.13994910941475827,
"grad_norm": 0.8354874849319458,
"learning_rate": 7.644552631578947e-05,
"loss": 0.165,
"step": 55
},
{
"epoch": 0.14249363867684478,
"grad_norm": 0.6091985702514648,
"learning_rate": 7.591831578947369e-05,
"loss": 0.0962,
"step": 56
},
{
"epoch": 0.1450381679389313,
"grad_norm": 0.7430208325386047,
"learning_rate": 7.539110526315789e-05,
"loss": 0.1676,
"step": 57
},
{
"epoch": 0.1475826972010178,
"grad_norm": 0.6381292343139648,
"learning_rate": 7.48638947368421e-05,
"loss": 0.1042,
"step": 58
},
{
"epoch": 0.15012722646310434,
"grad_norm": 0.8650558590888977,
"learning_rate": 7.433668421052632e-05,
"loss": 0.1513,
"step": 59
},
{
"epoch": 0.15267175572519084,
"grad_norm": 0.7318075895309448,
"learning_rate": 7.380947368421052e-05,
"loss": 0.1286,
"step": 60
},
{
"epoch": 0.15521628498727735,
"grad_norm": 0.5076261758804321,
"learning_rate": 7.328226315789473e-05,
"loss": 0.0677,
"step": 61
},
{
"epoch": 0.15776081424936386,
"grad_norm": 0.5992767214775085,
"learning_rate": 7.275505263157895e-05,
"loss": 0.0658,
"step": 62
},
{
"epoch": 0.16030534351145037,
"grad_norm": 1.0940337181091309,
"learning_rate": 7.222784210526316e-05,
"loss": 0.1328,
"step": 63
},
{
"epoch": 0.1628498727735369,
"grad_norm": 0.41800355911254883,
"learning_rate": 7.170063157894737e-05,
"loss": 0.0529,
"step": 64
},
{
"epoch": 0.16539440203562342,
"grad_norm": 0.410457968711853,
"learning_rate": 7.117342105263158e-05,
"loss": 0.0427,
"step": 65
},
{
"epoch": 0.16793893129770993,
"grad_norm": 0.928383469581604,
"learning_rate": 7.064621052631578e-05,
"loss": 0.0959,
"step": 66
},
{
"epoch": 0.17048346055979643,
"grad_norm": 0.39881715178489685,
"learning_rate": 7.0119e-05,
"loss": 0.0456,
"step": 67
},
{
"epoch": 0.17302798982188294,
"grad_norm": 0.6098618507385254,
"learning_rate": 6.959178947368421e-05,
"loss": 0.0535,
"step": 68
},
{
"epoch": 0.17557251908396945,
"grad_norm": 0.6409094333648682,
"learning_rate": 6.906457894736843e-05,
"loss": 0.0848,
"step": 69
},
{
"epoch": 0.178117048346056,
"grad_norm": 0.3413279950618744,
"learning_rate": 6.853736842105263e-05,
"loss": 0.0257,
"step": 70
},
{
"epoch": 0.1806615776081425,
"grad_norm": 0.41074299812316895,
"learning_rate": 6.801015789473684e-05,
"loss": 0.0237,
"step": 71
},
{
"epoch": 0.183206106870229,
"grad_norm": 0.6176720857620239,
"learning_rate": 6.748294736842105e-05,
"loss": 0.0628,
"step": 72
},
{
"epoch": 0.18575063613231552,
"grad_norm": 0.24736110866069794,
"learning_rate": 6.695573684210526e-05,
"loss": 0.0158,
"step": 73
},
{
"epoch": 0.18829516539440203,
"grad_norm": 0.8030320405960083,
"learning_rate": 6.642852631578946e-05,
"loss": 0.0547,
"step": 74
},
{
"epoch": 0.19083969465648856,
"grad_norm": 0.46492913365364075,
"learning_rate": 6.590131578947369e-05,
"loss": 0.0397,
"step": 75
},
{
"epoch": 0.19338422391857507,
"grad_norm": 0.570913553237915,
"learning_rate": 6.537410526315789e-05,
"loss": 0.0731,
"step": 76
},
{
"epoch": 0.19592875318066158,
"grad_norm": 0.8975678086280823,
"learning_rate": 6.484689473684211e-05,
"loss": 0.1668,
"step": 77
},
{
"epoch": 0.1984732824427481,
"grad_norm": 0.10023163259029388,
"learning_rate": 6.431968421052631e-05,
"loss": 0.0064,
"step": 78
},
{
"epoch": 0.2010178117048346,
"grad_norm": 1.0112429857254028,
"learning_rate": 6.379247368421052e-05,
"loss": 0.1961,
"step": 79
},
{
"epoch": 0.2035623409669211,
"grad_norm": 1.7674541473388672,
"learning_rate": 6.326526315789474e-05,
"loss": 0.4854,
"step": 80
},
{
"epoch": 0.20610687022900764,
"grad_norm": 2.2243638038635254,
"learning_rate": 6.273805263157894e-05,
"loss": 0.515,
"step": 81
},
{
"epoch": 0.20865139949109415,
"grad_norm": 1.8203999996185303,
"learning_rate": 6.221084210526315e-05,
"loss": 0.4717,
"step": 82
},
{
"epoch": 0.21119592875318066,
"grad_norm": 1.2786808013916016,
"learning_rate": 6.168363157894737e-05,
"loss": 0.2411,
"step": 83
},
{
"epoch": 0.21374045801526717,
"grad_norm": 1.3873907327651978,
"learning_rate": 6.115642105263159e-05,
"loss": 0.2586,
"step": 84
},
{
"epoch": 0.21628498727735368,
"grad_norm": 1.169811725616455,
"learning_rate": 6.0629210526315787e-05,
"loss": 0.2859,
"step": 85
},
{
"epoch": 0.21882951653944022,
"grad_norm": 1.7699614763259888,
"learning_rate": 6.0101999999999995e-05,
"loss": 0.3636,
"step": 86
},
{
"epoch": 0.22137404580152673,
"grad_norm": 1.0457572937011719,
"learning_rate": 5.95747894736842e-05,
"loss": 0.2371,
"step": 87
},
{
"epoch": 0.22391857506361323,
"grad_norm": 1.2649692296981812,
"learning_rate": 5.904757894736841e-05,
"loss": 0.2149,
"step": 88
},
{
"epoch": 0.22646310432569974,
"grad_norm": 0.8703845143318176,
"learning_rate": 5.852036842105263e-05,
"loss": 0.1582,
"step": 89
},
{
"epoch": 0.22900763358778625,
"grad_norm": 1.6556470394134521,
"learning_rate": 5.799315789473684e-05,
"loss": 0.2751,
"step": 90
},
{
"epoch": 0.23155216284987276,
"grad_norm": 1.0715538263320923,
"learning_rate": 5.746594736842105e-05,
"loss": 0.2189,
"step": 91
},
{
"epoch": 0.2340966921119593,
"grad_norm": 1.5337820053100586,
"learning_rate": 5.693873684210526e-05,
"loss": 0.3324,
"step": 92
},
{
"epoch": 0.2366412213740458,
"grad_norm": 0.6989188194274902,
"learning_rate": 5.641152631578947e-05,
"loss": 0.0784,
"step": 93
},
{
"epoch": 0.23918575063613232,
"grad_norm": 1.108068585395813,
"learning_rate": 5.588431578947368e-05,
"loss": 0.1732,
"step": 94
},
{
"epoch": 0.24173027989821882,
"grad_norm": 0.7206950783729553,
"learning_rate": 5.5357105263157896e-05,
"loss": 0.08,
"step": 95
},
{
"epoch": 0.24427480916030533,
"grad_norm": 1.3309029340744019,
"learning_rate": 5.482989473684211e-05,
"loss": 0.1309,
"step": 96
},
{
"epoch": 0.24681933842239187,
"grad_norm": 1.4102177619934082,
"learning_rate": 5.430268421052632e-05,
"loss": 0.1174,
"step": 97
},
{
"epoch": 0.24936386768447838,
"grad_norm": 0.6907632350921631,
"learning_rate": 5.377547368421053e-05,
"loss": 0.0753,
"step": 98
},
{
"epoch": 0.25190839694656486,
"grad_norm": 0.5914320945739746,
"learning_rate": 5.3248263157894736e-05,
"loss": 0.0487,
"step": 99
},
{
"epoch": 0.2544529262086514,
"grad_norm": 0.410552054643631,
"learning_rate": 5.2721052631578944e-05,
"loss": 0.028,
"step": 100
},
{
"epoch": 0.2544529262086514,
"eval_loss": 0.1450011283159256,
"eval_runtime": 10.1692,
"eval_samples_per_second": 16.324,
"eval_steps_per_second": 4.13,
"step": 100
},
{
"epoch": 0.25699745547073793,
"grad_norm": 0.9659016728401184,
"learning_rate": 5.219384210526315e-05,
"loss": 0.2955,
"step": 101
},
{
"epoch": 0.2595419847328244,
"grad_norm": 0.5893524885177612,
"learning_rate": 5.1666631578947374e-05,
"loss": 0.158,
"step": 102
},
{
"epoch": 0.26208651399491095,
"grad_norm": 0.561215877532959,
"learning_rate": 5.113942105263158e-05,
"loss": 0.1543,
"step": 103
},
{
"epoch": 0.26463104325699743,
"grad_norm": 0.5634675621986389,
"learning_rate": 5.061221052631579e-05,
"loss": 0.1578,
"step": 104
},
{
"epoch": 0.26717557251908397,
"grad_norm": 0.7371407747268677,
"learning_rate": 5.0085e-05,
"loss": 0.1396,
"step": 105
},
{
"epoch": 0.2697201017811705,
"grad_norm": 0.62689608335495,
"learning_rate": 4.955778947368421e-05,
"loss": 0.1542,
"step": 106
},
{
"epoch": 0.272264631043257,
"grad_norm": 0.5553103685379028,
"learning_rate": 4.903057894736842e-05,
"loss": 0.0909,
"step": 107
},
{
"epoch": 0.2748091603053435,
"grad_norm": 0.5198187828063965,
"learning_rate": 4.850336842105263e-05,
"loss": 0.0785,
"step": 108
},
{
"epoch": 0.27735368956743,
"grad_norm": 0.7179524898529053,
"learning_rate": 4.797615789473684e-05,
"loss": 0.1036,
"step": 109
},
{
"epoch": 0.27989821882951654,
"grad_norm": 0.44508594274520874,
"learning_rate": 4.744894736842105e-05,
"loss": 0.1104,
"step": 110
},
{
"epoch": 0.2824427480916031,
"grad_norm": 0.7336511015892029,
"learning_rate": 4.692173684210526e-05,
"loss": 0.1067,
"step": 111
},
{
"epoch": 0.28498727735368956,
"grad_norm": 0.9355735778808594,
"learning_rate": 4.639452631578947e-05,
"loss": 0.1675,
"step": 112
},
{
"epoch": 0.2875318066157761,
"grad_norm": 0.46843382716178894,
"learning_rate": 4.586731578947368e-05,
"loss": 0.0723,
"step": 113
},
{
"epoch": 0.2900763358778626,
"grad_norm": 0.5565648078918457,
"learning_rate": 4.5340105263157894e-05,
"loss": 0.0579,
"step": 114
},
{
"epoch": 0.2926208651399491,
"grad_norm": 0.35323649644851685,
"learning_rate": 4.48128947368421e-05,
"loss": 0.0532,
"step": 115
},
{
"epoch": 0.2951653944020356,
"grad_norm": 0.41509339213371277,
"learning_rate": 4.428568421052632e-05,
"loss": 0.0788,
"step": 116
},
{
"epoch": 0.29770992366412213,
"grad_norm": 0.4781738817691803,
"learning_rate": 4.3758473684210525e-05,
"loss": 0.0939,
"step": 117
},
{
"epoch": 0.30025445292620867,
"grad_norm": 0.5751485824584961,
"learning_rate": 4.323126315789474e-05,
"loss": 0.0883,
"step": 118
},
{
"epoch": 0.30279898218829515,
"grad_norm": 0.2596683204174042,
"learning_rate": 4.270405263157895e-05,
"loss": 0.0365,
"step": 119
},
{
"epoch": 0.3053435114503817,
"grad_norm": 0.572528600692749,
"learning_rate": 4.217684210526316e-05,
"loss": 0.0871,
"step": 120
},
{
"epoch": 0.30788804071246817,
"grad_norm": 0.5119253396987915,
"learning_rate": 4.164963157894737e-05,
"loss": 0.0973,
"step": 121
},
{
"epoch": 0.3104325699745547,
"grad_norm": 0.5054477453231812,
"learning_rate": 4.112242105263158e-05,
"loss": 0.0494,
"step": 122
},
{
"epoch": 0.31297709923664124,
"grad_norm": 0.3897090256214142,
"learning_rate": 4.059521052631579e-05,
"loss": 0.0333,
"step": 123
},
{
"epoch": 0.3155216284987277,
"grad_norm": 0.2573760747909546,
"learning_rate": 4.0068e-05,
"loss": 0.0229,
"step": 124
},
{
"epoch": 0.31806615776081426,
"grad_norm": 0.28332197666168213,
"learning_rate": 3.954078947368421e-05,
"loss": 0.0257,
"step": 125
},
{
"epoch": 0.32061068702290074,
"grad_norm": 0.446418434381485,
"learning_rate": 3.901357894736842e-05,
"loss": 0.0825,
"step": 126
},
{
"epoch": 0.3231552162849873,
"grad_norm": 0.29756420850753784,
"learning_rate": 3.848636842105263e-05,
"loss": 0.0242,
"step": 127
},
{
"epoch": 0.3256997455470738,
"grad_norm": 0.5935866236686707,
"learning_rate": 3.795915789473684e-05,
"loss": 0.0925,
"step": 128
},
{
"epoch": 0.3282442748091603,
"grad_norm": 0.2986157536506653,
"learning_rate": 3.743194736842105e-05,
"loss": 0.0154,
"step": 129
},
{
"epoch": 0.33078880407124683,
"grad_norm": 0.03564433753490448,
"learning_rate": 3.690473684210526e-05,
"loss": 0.0024,
"step": 130
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.021472949534654617,
"learning_rate": 3.6377526315789475e-05,
"loss": 0.0011,
"step": 131
},
{
"epoch": 0.33587786259541985,
"grad_norm": 1.2343424558639526,
"learning_rate": 3.585031578947368e-05,
"loss": 0.3226,
"step": 132
},
{
"epoch": 0.3384223918575064,
"grad_norm": 1.3323383331298828,
"learning_rate": 3.532310526315789e-05,
"loss": 0.2967,
"step": 133
},
{
"epoch": 0.34096692111959287,
"grad_norm": 1.58578360080719,
"learning_rate": 3.4795894736842106e-05,
"loss": 0.2687,
"step": 134
},
{
"epoch": 0.3435114503816794,
"grad_norm": 1.3783105611801147,
"learning_rate": 3.4268684210526314e-05,
"loss": 0.2798,
"step": 135
},
{
"epoch": 0.3460559796437659,
"grad_norm": 1.470922827720642,
"learning_rate": 3.374147368421052e-05,
"loss": 0.3177,
"step": 136
},
{
"epoch": 0.3486005089058524,
"grad_norm": 1.449453592300415,
"learning_rate": 3.321426315789473e-05,
"loss": 0.1875,
"step": 137
},
{
"epoch": 0.3511450381679389,
"grad_norm": 1.273271083831787,
"learning_rate": 3.2687052631578946e-05,
"loss": 0.2517,
"step": 138
},
{
"epoch": 0.35368956743002544,
"grad_norm": 1.2989132404327393,
"learning_rate": 3.2159842105263154e-05,
"loss": 0.1741,
"step": 139
},
{
"epoch": 0.356234096692112,
"grad_norm": 1.1349838972091675,
"learning_rate": 3.163263157894737e-05,
"loss": 0.2073,
"step": 140
},
{
"epoch": 0.35877862595419846,
"grad_norm": 1.2873899936676025,
"learning_rate": 3.110542105263158e-05,
"loss": 0.1692,
"step": 141
},
{
"epoch": 0.361323155216285,
"grad_norm": 1.297892689704895,
"learning_rate": 3.057821052631579e-05,
"loss": 0.1529,
"step": 142
},
{
"epoch": 0.3638676844783715,
"grad_norm": 1.0262969732284546,
"learning_rate": 3.0050999999999997e-05,
"loss": 0.1269,
"step": 143
},
{
"epoch": 0.366412213740458,
"grad_norm": 1.489499807357788,
"learning_rate": 2.9523789473684206e-05,
"loss": 0.2182,
"step": 144
},
{
"epoch": 0.36895674300254455,
"grad_norm": 2.6656413078308105,
"learning_rate": 2.899657894736842e-05,
"loss": 0.1811,
"step": 145
},
{
"epoch": 0.37150127226463103,
"grad_norm": 1.5800155401229858,
"learning_rate": 2.846936842105263e-05,
"loss": 0.1364,
"step": 146
},
{
"epoch": 0.37404580152671757,
"grad_norm": 0.6563022136688232,
"learning_rate": 2.794215789473684e-05,
"loss": 0.0965,
"step": 147
},
{
"epoch": 0.37659033078880405,
"grad_norm": 1.1012194156646729,
"learning_rate": 2.7414947368421056e-05,
"loss": 0.097,
"step": 148
},
{
"epoch": 0.3791348600508906,
"grad_norm": 1.3474540710449219,
"learning_rate": 2.6887736842105264e-05,
"loss": 0.1278,
"step": 149
},
{
"epoch": 0.3816793893129771,
"grad_norm": 1.2162439823150635,
"learning_rate": 2.6360526315789472e-05,
"loss": 0.1464,
"step": 150
},
{
"epoch": 0.3816793893129771,
"eval_loss": 0.1360974758863449,
"eval_runtime": 10.1762,
"eval_samples_per_second": 16.313,
"eval_steps_per_second": 4.127,
"step": 150
},
{
"epoch": 0.3842239185750636,
"grad_norm": 0.6443613767623901,
"learning_rate": 2.5833315789473687e-05,
"loss": 0.2032,
"step": 151
},
{
"epoch": 0.38676844783715014,
"grad_norm": 0.7321302890777588,
"learning_rate": 2.5306105263157895e-05,
"loss": 0.1961,
"step": 152
},
{
"epoch": 0.3893129770992366,
"grad_norm": 0.7189200520515442,
"learning_rate": 2.4778894736842104e-05,
"loss": 0.1966,
"step": 153
},
{
"epoch": 0.39185750636132316,
"grad_norm": 0.6960674524307251,
"learning_rate": 2.4251684210526315e-05,
"loss": 0.2118,
"step": 154
},
{
"epoch": 0.3944020356234097,
"grad_norm": 0.7753060460090637,
"learning_rate": 2.3724473684210524e-05,
"loss": 0.2101,
"step": 155
},
{
"epoch": 0.3969465648854962,
"grad_norm": 0.5562716126441956,
"learning_rate": 2.3197263157894735e-05,
"loss": 0.1283,
"step": 156
},
{
"epoch": 0.3994910941475827,
"grad_norm": 0.37449532747268677,
"learning_rate": 2.2670052631578947e-05,
"loss": 0.1007,
"step": 157
},
{
"epoch": 0.4020356234096692,
"grad_norm": 0.5224964022636414,
"learning_rate": 2.214284210526316e-05,
"loss": 0.1294,
"step": 158
},
{
"epoch": 0.40458015267175573,
"grad_norm": 0.3669807016849518,
"learning_rate": 2.161563157894737e-05,
"loss": 0.0846,
"step": 159
},
{
"epoch": 0.4071246819338422,
"grad_norm": 0.5417796969413757,
"learning_rate": 2.108842105263158e-05,
"loss": 0.12,
"step": 160
},
{
"epoch": 0.40966921119592875,
"grad_norm": 0.506889820098877,
"learning_rate": 2.056121052631579e-05,
"loss": 0.0786,
"step": 161
},
{
"epoch": 0.4122137404580153,
"grad_norm": 0.49759092926979065,
"learning_rate": 2.0034e-05,
"loss": 0.093,
"step": 162
},
{
"epoch": 0.41475826972010177,
"grad_norm": 0.29034364223480225,
"learning_rate": 1.950678947368421e-05,
"loss": 0.0639,
"step": 163
},
{
"epoch": 0.4173027989821883,
"grad_norm": 0.6314502358436584,
"learning_rate": 1.897957894736842e-05,
"loss": 0.1211,
"step": 164
},
{
"epoch": 0.4198473282442748,
"grad_norm": 0.23841609060764313,
"learning_rate": 1.845236842105263e-05,
"loss": 0.0461,
"step": 165
},
{
"epoch": 0.4223918575063613,
"grad_norm": 0.35829946398735046,
"learning_rate": 1.792515789473684e-05,
"loss": 0.0773,
"step": 166
},
{
"epoch": 0.42493638676844786,
"grad_norm": 0.43481776118278503,
"learning_rate": 1.7397947368421053e-05,
"loss": 0.0921,
"step": 167
},
{
"epoch": 0.42748091603053434,
"grad_norm": 0.35226166248321533,
"learning_rate": 1.687073684210526e-05,
"loss": 0.0594,
"step": 168
},
{
"epoch": 0.4300254452926209,
"grad_norm": 0.5202860832214355,
"learning_rate": 1.6343526315789473e-05,
"loss": 0.0986,
"step": 169
},
{
"epoch": 0.43256997455470736,
"grad_norm": 0.23757660388946533,
"learning_rate": 1.5816315789473685e-05,
"loss": 0.032,
"step": 170
},
{
"epoch": 0.4351145038167939,
"grad_norm": 0.27789339423179626,
"learning_rate": 1.5289105263157896e-05,
"loss": 0.0438,
"step": 171
},
{
"epoch": 0.43765903307888043,
"grad_norm": 0.41914746165275574,
"learning_rate": 1.4761894736842103e-05,
"loss": 0.0462,
"step": 172
},
{
"epoch": 0.4402035623409669,
"grad_norm": 0.2738276720046997,
"learning_rate": 1.4234684210526314e-05,
"loss": 0.0293,
"step": 173
},
{
"epoch": 0.44274809160305345,
"grad_norm": 0.4610910713672638,
"learning_rate": 1.3707473684210528e-05,
"loss": 0.0522,
"step": 174
},
{
"epoch": 0.44529262086513993,
"grad_norm": 0.18284475803375244,
"learning_rate": 1.3180263157894736e-05,
"loss": 0.0267,
"step": 175
},
{
"epoch": 0.44783715012722647,
"grad_norm": 0.2613477110862732,
"learning_rate": 1.2653052631578948e-05,
"loss": 0.0191,
"step": 176
},
{
"epoch": 0.45038167938931295,
"grad_norm": 0.21745839715003967,
"learning_rate": 1.2125842105263158e-05,
"loss": 0.0267,
"step": 177
},
{
"epoch": 0.4529262086513995,
"grad_norm": 0.6353086829185486,
"learning_rate": 1.1598631578947368e-05,
"loss": 0.0828,
"step": 178
},
{
"epoch": 0.455470737913486,
"grad_norm": 1.0890721082687378,
"learning_rate": 1.107142105263158e-05,
"loss": 0.3399,
"step": 179
},
{
"epoch": 0.4580152671755725,
"grad_norm": 0.9117040038108826,
"learning_rate": 1.054421052631579e-05,
"loss": 0.2736,
"step": 180
},
{
"epoch": 0.46055979643765904,
"grad_norm": 1.2923952341079712,
"learning_rate": 1.0017e-05,
"loss": 0.267,
"step": 181
},
{
"epoch": 0.4631043256997455,
"grad_norm": 0.9573558568954468,
"learning_rate": 9.48978947368421e-06,
"loss": 0.2016,
"step": 182
},
{
"epoch": 0.46564885496183206,
"grad_norm": 0.787228524684906,
"learning_rate": 8.96257894736842e-06,
"loss": 0.1423,
"step": 183
},
{
"epoch": 0.4681933842239186,
"grad_norm": 1.1528656482696533,
"learning_rate": 8.43536842105263e-06,
"loss": 0.2141,
"step": 184
},
{
"epoch": 0.4707379134860051,
"grad_norm": 2.192894220352173,
"learning_rate": 7.908157894736842e-06,
"loss": 0.3117,
"step": 185
},
{
"epoch": 0.4732824427480916,
"grad_norm": 1.0140397548675537,
"learning_rate": 7.380947368421051e-06,
"loss": 0.1539,
"step": 186
},
{
"epoch": 0.4758269720101781,
"grad_norm": 1.4267032146453857,
"learning_rate": 6.853736842105264e-06,
"loss": 0.2976,
"step": 187
},
{
"epoch": 0.47837150127226463,
"grad_norm": 1.1065343618392944,
"learning_rate": 6.326526315789474e-06,
"loss": 0.2073,
"step": 188
},
{
"epoch": 0.48091603053435117,
"grad_norm": 1.1713448762893677,
"learning_rate": 5.799315789473684e-06,
"loss": 0.1977,
"step": 189
},
{
"epoch": 0.48346055979643765,
"grad_norm": 0.6917346119880676,
"learning_rate": 5.272105263157895e-06,
"loss": 0.0885,
"step": 190
},
{
"epoch": 0.4860050890585242,
"grad_norm": 1.0129892826080322,
"learning_rate": 4.744894736842105e-06,
"loss": 0.1472,
"step": 191
},
{
"epoch": 0.48854961832061067,
"grad_norm": 1.470230221748352,
"learning_rate": 4.217684210526315e-06,
"loss": 0.22,
"step": 192
},
{
"epoch": 0.4910941475826972,
"grad_norm": 0.406305730342865,
"learning_rate": 3.6904736842105257e-06,
"loss": 0.0418,
"step": 193
},
{
"epoch": 0.49363867684478374,
"grad_norm": 1.7621744871139526,
"learning_rate": 3.163263157894737e-06,
"loss": 0.2434,
"step": 194
},
{
"epoch": 0.4961832061068702,
"grad_norm": 1.2661513090133667,
"learning_rate": 2.6360526315789473e-06,
"loss": 0.1978,
"step": 195
},
{
"epoch": 0.49872773536895676,
"grad_norm": 1.6431432962417603,
"learning_rate": 2.1088421052631577e-06,
"loss": 0.2335,
"step": 196
},
{
"epoch": 0.5012722646310432,
"grad_norm": 1.3212987184524536,
"learning_rate": 1.5816315789473685e-06,
"loss": 0.0987,
"step": 197
},
{
"epoch": 0.5038167938931297,
"grad_norm": 0.8376440405845642,
"learning_rate": 1.0544210526315788e-06,
"loss": 0.0621,
"step": 198
},
{
"epoch": 0.5063613231552163,
"grad_norm": 0.5887414216995239,
"learning_rate": 5.272105263157894e-07,
"loss": 0.0602,
"step": 199
},
{
"epoch": 0.5089058524173028,
"grad_norm": 0.6141554117202759,
"learning_rate": 0.0,
"loss": 0.0582,
"step": 200
},
{
"epoch": 0.5089058524173028,
"eval_loss": 0.10675784200429916,
"eval_runtime": 10.1526,
"eval_samples_per_second": 16.35,
"eval_steps_per_second": 4.137,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.3676760973312e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
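
For reference, a minimal sketch (not part of the checkpoint itself) of how the state above could be inspected once saved locally; the filename "trainer_state.json" and local path are assumptions, and the field names used are exactly those appearing in the JSON.

import json

# Load the checkpoint state saved by the Hugging Face Trainer (path is illustrative).
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes per-step training entries ("loss") with periodic eval entries ("eval_loss").
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss: {state['best_metric']:.6f} at {state['best_model_checkpoint']}")
for e in eval_log:
    print(f"step {e['step']:>4}  epoch {e['epoch']:.4f}  eval_loss {e['eval_loss']:.4f}")
print(f"final training loss at step {train_log[-1]['step']}: {train_log[-1]['loss']}")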