{
"best_metric": 2.7491917610168457,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.06791171477079797,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00033955857385398983,
"grad_norm": 21.100221633911133,
"learning_rate": 1.0100000000000002e-05,
"loss": 5.7028,
"step": 1
},
{
"epoch": 0.00033955857385398983,
"eval_loss": 3.8351783752441406,
"eval_runtime": 134.0344,
"eval_samples_per_second": 9.251,
"eval_steps_per_second": 2.313,
"step": 1
},
{
"epoch": 0.0006791171477079797,
"grad_norm": 32.934295654296875,
"learning_rate": 2.0200000000000003e-05,
"loss": 6.2743,
"step": 2
},
{
"epoch": 0.0010186757215619694,
"grad_norm": 11.293846130371094,
"learning_rate": 3.0299999999999998e-05,
"loss": 5.5001,
"step": 3
},
{
"epoch": 0.0013582342954159593,
"grad_norm": 12.974593162536621,
"learning_rate": 4.0400000000000006e-05,
"loss": 5.5615,
"step": 4
},
{
"epoch": 0.001697792869269949,
"grad_norm": 11.986504554748535,
"learning_rate": 5.05e-05,
"loss": 6.0771,
"step": 5
},
{
"epoch": 0.0020373514431239388,
"grad_norm": 11.836446762084961,
"learning_rate": 6.0599999999999996e-05,
"loss": 5.833,
"step": 6
},
{
"epoch": 0.0023769100169779285,
"grad_norm": 12.108397483825684,
"learning_rate": 7.07e-05,
"loss": 6.3409,
"step": 7
},
{
"epoch": 0.0027164685908319186,
"grad_norm": 13.400388717651367,
"learning_rate": 8.080000000000001e-05,
"loss": 6.262,
"step": 8
},
{
"epoch": 0.0030560271646859084,
"grad_norm": 12.49435806274414,
"learning_rate": 9.09e-05,
"loss": 5.619,
"step": 9
},
{
"epoch": 0.003395585738539898,
"grad_norm": 12.52596378326416,
"learning_rate": 0.000101,
"loss": 5.1372,
"step": 10
},
{
"epoch": 0.003735144312393888,
"grad_norm": 12.6868314743042,
"learning_rate": 0.00010046842105263158,
"loss": 5.2604,
"step": 11
},
{
"epoch": 0.0040747028862478775,
"grad_norm": 14.099748611450195,
"learning_rate": 9.993684210526315e-05,
"loss": 5.58,
"step": 12
},
{
"epoch": 0.004414261460101867,
"grad_norm": 12.148128509521484,
"learning_rate": 9.940526315789473e-05,
"loss": 5.5032,
"step": 13
},
{
"epoch": 0.004753820033955857,
"grad_norm": 14.858428001403809,
"learning_rate": 9.887368421052632e-05,
"loss": 6.4015,
"step": 14
},
{
"epoch": 0.0050933786078098476,
"grad_norm": 14.074804306030273,
"learning_rate": 9.83421052631579e-05,
"loss": 5.938,
"step": 15
},
{
"epoch": 0.005432937181663837,
"grad_norm": 13.298508644104004,
"learning_rate": 9.781052631578948e-05,
"loss": 5.9138,
"step": 16
},
{
"epoch": 0.005772495755517827,
"grad_norm": 13.888445854187012,
"learning_rate": 9.727894736842106e-05,
"loss": 5.5768,
"step": 17
},
{
"epoch": 0.006112054329371817,
"grad_norm": 12.229842185974121,
"learning_rate": 9.674736842105263e-05,
"loss": 5.4015,
"step": 18
},
{
"epoch": 0.0064516129032258064,
"grad_norm": 17.1883487701416,
"learning_rate": 9.621578947368421e-05,
"loss": 6.27,
"step": 19
},
{
"epoch": 0.006791171477079796,
"grad_norm": 13.889158248901367,
"learning_rate": 9.568421052631578e-05,
"loss": 5.3761,
"step": 20
},
{
"epoch": 0.007130730050933786,
"grad_norm": 18.03790283203125,
"learning_rate": 9.515263157894737e-05,
"loss": 5.8293,
"step": 21
},
{
"epoch": 0.007470288624787776,
"grad_norm": 15.712692260742188,
"learning_rate": 9.462105263157895e-05,
"loss": 6.1728,
"step": 22
},
{
"epoch": 0.007809847198641765,
"grad_norm": 15.400139808654785,
"learning_rate": 9.408947368421054e-05,
"loss": 6.7802,
"step": 23
},
{
"epoch": 0.008149405772495755,
"grad_norm": 14.763883590698242,
"learning_rate": 9.355789473684211e-05,
"loss": 5.7079,
"step": 24
},
{
"epoch": 0.008488964346349746,
"grad_norm": 12.111189842224121,
"learning_rate": 9.302631578947369e-05,
"loss": 5.5747,
"step": 25
},
{
"epoch": 0.008828522920203734,
"grad_norm": 14.711851119995117,
"learning_rate": 9.249473684210526e-05,
"loss": 6.0876,
"step": 26
},
{
"epoch": 0.009168081494057725,
"grad_norm": 15.358607292175293,
"learning_rate": 9.196315789473685e-05,
"loss": 5.6319,
"step": 27
},
{
"epoch": 0.009507640067911714,
"grad_norm": 13.506231307983398,
"learning_rate": 9.143157894736843e-05,
"loss": 5.8136,
"step": 28
},
{
"epoch": 0.009847198641765705,
"grad_norm": 14.769192695617676,
"learning_rate": 9.09e-05,
"loss": 5.8328,
"step": 29
},
{
"epoch": 0.010186757215619695,
"grad_norm": 14.589241027832031,
"learning_rate": 9.036842105263158e-05,
"loss": 5.5436,
"step": 30
},
{
"epoch": 0.010526315789473684,
"grad_norm": 16.822694778442383,
"learning_rate": 8.983684210526316e-05,
"loss": 6.7274,
"step": 31
},
{
"epoch": 0.010865874363327675,
"grad_norm": 17.69041633605957,
"learning_rate": 8.930526315789474e-05,
"loss": 6.3115,
"step": 32
},
{
"epoch": 0.011205432937181663,
"grad_norm": 16.131786346435547,
"learning_rate": 8.877368421052632e-05,
"loss": 5.8964,
"step": 33
},
{
"epoch": 0.011544991511035654,
"grad_norm": 17.694215774536133,
"learning_rate": 8.82421052631579e-05,
"loss": 6.6347,
"step": 34
},
{
"epoch": 0.011884550084889643,
"grad_norm": 16.214025497436523,
"learning_rate": 8.771052631578948e-05,
"loss": 5.3229,
"step": 35
},
{
"epoch": 0.012224108658743633,
"grad_norm": 17.1286678314209,
"learning_rate": 8.717894736842105e-05,
"loss": 6.4127,
"step": 36
},
{
"epoch": 0.012563667232597622,
"grad_norm": 17.29891014099121,
"learning_rate": 8.664736842105263e-05,
"loss": 5.6328,
"step": 37
},
{
"epoch": 0.012903225806451613,
"grad_norm": 22.239364624023438,
"learning_rate": 8.61157894736842e-05,
"loss": 6.787,
"step": 38
},
{
"epoch": 0.013242784380305603,
"grad_norm": 17.34881591796875,
"learning_rate": 8.55842105263158e-05,
"loss": 5.9971,
"step": 39
},
{
"epoch": 0.013582342954159592,
"grad_norm": 31.443096160888672,
"learning_rate": 8.505263157894737e-05,
"loss": 7.1717,
"step": 40
},
{
"epoch": 0.013921901528013583,
"grad_norm": 20.705217361450195,
"learning_rate": 8.452105263157896e-05,
"loss": 6.672,
"step": 41
},
{
"epoch": 0.014261460101867572,
"grad_norm": 22.882652282714844,
"learning_rate": 8.398947368421053e-05,
"loss": 6.7516,
"step": 42
},
{
"epoch": 0.014601018675721562,
"grad_norm": 25.97607421875,
"learning_rate": 8.345789473684211e-05,
"loss": 7.0149,
"step": 43
},
{
"epoch": 0.014940577249575551,
"grad_norm": 29.19485855102539,
"learning_rate": 8.292631578947368e-05,
"loss": 7.1149,
"step": 44
},
{
"epoch": 0.015280135823429542,
"grad_norm": 26.053762435913086,
"learning_rate": 8.239473684210526e-05,
"loss": 6.5975,
"step": 45
},
{
"epoch": 0.01561969439728353,
"grad_norm": 28.610328674316406,
"learning_rate": 8.186315789473683e-05,
"loss": 6.2714,
"step": 46
},
{
"epoch": 0.01595925297113752,
"grad_norm": 35.02290344238281,
"learning_rate": 8.133157894736842e-05,
"loss": 7.4225,
"step": 47
},
{
"epoch": 0.01629881154499151,
"grad_norm": 51.69056701660156,
"learning_rate": 8.080000000000001e-05,
"loss": 6.8981,
"step": 48
},
{
"epoch": 0.016638370118845502,
"grad_norm": 50.08887481689453,
"learning_rate": 8.026842105263159e-05,
"loss": 7.0015,
"step": 49
},
{
"epoch": 0.01697792869269949,
"grad_norm": 40.95683288574219,
"learning_rate": 7.973684210526316e-05,
"loss": 6.7858,
"step": 50
},
{
"epoch": 0.01697792869269949,
"eval_loss": 3.2727584838867188,
"eval_runtime": 136.3007,
"eval_samples_per_second": 9.098,
"eval_steps_per_second": 2.274,
"step": 50
},
{
"epoch": 0.01731748726655348,
"grad_norm": 8.816231727600098,
"learning_rate": 7.920526315789474e-05,
"loss": 6.4331,
"step": 51
},
{
"epoch": 0.01765704584040747,
"grad_norm": 7.692835807800293,
"learning_rate": 7.867368421052631e-05,
"loss": 6.1655,
"step": 52
},
{
"epoch": 0.01799660441426146,
"grad_norm": 6.719763278961182,
"learning_rate": 7.814210526315789e-05,
"loss": 5.9821,
"step": 53
},
{
"epoch": 0.01833616298811545,
"grad_norm": 6.741839408874512,
"learning_rate": 7.761052631578946e-05,
"loss": 5.5779,
"step": 54
},
{
"epoch": 0.01867572156196944,
"grad_norm": 7.938393592834473,
"learning_rate": 7.707894736842105e-05,
"loss": 6.1207,
"step": 55
},
{
"epoch": 0.019015280135823428,
"grad_norm": 7.241247177124023,
"learning_rate": 7.654736842105264e-05,
"loss": 5.5408,
"step": 56
},
{
"epoch": 0.01935483870967742,
"grad_norm": 7.55157470703125,
"learning_rate": 7.601578947368422e-05,
"loss": 5.5802,
"step": 57
},
{
"epoch": 0.01969439728353141,
"grad_norm": 7.256725311279297,
"learning_rate": 7.548421052631579e-05,
"loss": 5.7427,
"step": 58
},
{
"epoch": 0.020033955857385398,
"grad_norm": 8.38663101196289,
"learning_rate": 7.495263157894737e-05,
"loss": 5.8768,
"step": 59
},
{
"epoch": 0.02037351443123939,
"grad_norm": 7.507662296295166,
"learning_rate": 7.442105263157894e-05,
"loss": 5.8298,
"step": 60
},
{
"epoch": 0.02071307300509338,
"grad_norm": 8.123747825622559,
"learning_rate": 7.388947368421053e-05,
"loss": 5.6901,
"step": 61
},
{
"epoch": 0.021052631578947368,
"grad_norm": 7.707481384277344,
"learning_rate": 7.335789473684211e-05,
"loss": 5.4775,
"step": 62
},
{
"epoch": 0.021392190152801357,
"grad_norm": 8.684199333190918,
"learning_rate": 7.282631578947368e-05,
"loss": 4.7097,
"step": 63
},
{
"epoch": 0.02173174872665535,
"grad_norm": 9.479657173156738,
"learning_rate": 7.229473684210527e-05,
"loss": 6.1553,
"step": 64
},
{
"epoch": 0.022071307300509338,
"grad_norm": 7.97694206237793,
"learning_rate": 7.176315789473685e-05,
"loss": 5.1793,
"step": 65
},
{
"epoch": 0.022410865874363327,
"grad_norm": 8.905004501342773,
"learning_rate": 7.123157894736842e-05,
"loss": 5.1063,
"step": 66
},
{
"epoch": 0.02275042444821732,
"grad_norm": 9.290450096130371,
"learning_rate": 7.07e-05,
"loss": 5.6181,
"step": 67
},
{
"epoch": 0.023089983022071308,
"grad_norm": 9.563346862792969,
"learning_rate": 7.016842105263159e-05,
"loss": 5.3551,
"step": 68
},
{
"epoch": 0.023429541595925297,
"grad_norm": 9.26034927368164,
"learning_rate": 6.963684210526316e-05,
"loss": 5.5433,
"step": 69
},
{
"epoch": 0.023769100169779286,
"grad_norm": 9.813597679138184,
"learning_rate": 6.910526315789474e-05,
"loss": 5.3692,
"step": 70
},
{
"epoch": 0.024108658743633278,
"grad_norm": 9.946206092834473,
"learning_rate": 6.857368421052631e-05,
"loss": 5.4733,
"step": 71
},
{
"epoch": 0.024448217317487267,
"grad_norm": 11.333394050598145,
"learning_rate": 6.80421052631579e-05,
"loss": 6.3697,
"step": 72
},
{
"epoch": 0.024787775891341256,
"grad_norm": 10.807990074157715,
"learning_rate": 6.751052631578948e-05,
"loss": 5.7684,
"step": 73
},
{
"epoch": 0.025127334465195245,
"grad_norm": 10.34673023223877,
"learning_rate": 6.697894736842105e-05,
"loss": 5.5579,
"step": 74
},
{
"epoch": 0.025466893039049237,
"grad_norm": 10.379812240600586,
"learning_rate": 6.644736842105264e-05,
"loss": 6.0875,
"step": 75
},
{
"epoch": 0.025806451612903226,
"grad_norm": 10.365467071533203,
"learning_rate": 6.591578947368422e-05,
"loss": 5.4606,
"step": 76
},
{
"epoch": 0.026146010186757215,
"grad_norm": 12.18764877319336,
"learning_rate": 6.538421052631579e-05,
"loss": 5.7223,
"step": 77
},
{
"epoch": 0.026485568760611207,
"grad_norm": 10.989704132080078,
"learning_rate": 6.485263157894737e-05,
"loss": 5.3804,
"step": 78
},
{
"epoch": 0.026825127334465196,
"grad_norm": 11.694632530212402,
"learning_rate": 6.432105263157894e-05,
"loss": 5.822,
"step": 79
},
{
"epoch": 0.027164685908319185,
"grad_norm": 12.42897891998291,
"learning_rate": 6.378947368421053e-05,
"loss": 5.6253,
"step": 80
},
{
"epoch": 0.027504244482173174,
"grad_norm": 12.49673080444336,
"learning_rate": 6.32578947368421e-05,
"loss": 5.885,
"step": 81
},
{
"epoch": 0.027843803056027166,
"grad_norm": 12.554586410522461,
"learning_rate": 6.27263157894737e-05,
"loss": 5.4057,
"step": 82
},
{
"epoch": 0.028183361629881155,
"grad_norm": 12.602128028869629,
"learning_rate": 6.219473684210527e-05,
"loss": 5.8701,
"step": 83
},
{
"epoch": 0.028522920203735144,
"grad_norm": 14.500311851501465,
"learning_rate": 6.166315789473685e-05,
"loss": 5.7579,
"step": 84
},
{
"epoch": 0.028862478777589132,
"grad_norm": 12.415670394897461,
"learning_rate": 6.113157894736842e-05,
"loss": 5.589,
"step": 85
},
{
"epoch": 0.029202037351443125,
"grad_norm": 12.579917907714844,
"learning_rate": 6.0599999999999996e-05,
"loss": 5.7132,
"step": 86
},
{
"epoch": 0.029541595925297114,
"grad_norm": 14.4943208694458,
"learning_rate": 6.006842105263158e-05,
"loss": 6.1524,
"step": 87
},
{
"epoch": 0.029881154499151102,
"grad_norm": 13.979001998901367,
"learning_rate": 5.953684210526315e-05,
"loss": 5.4524,
"step": 88
},
{
"epoch": 0.030220713073005095,
"grad_norm": 12.837852478027344,
"learning_rate": 5.900526315789474e-05,
"loss": 5.0063,
"step": 89
},
{
"epoch": 0.030560271646859084,
"grad_norm": 15.69062614440918,
"learning_rate": 5.847368421052632e-05,
"loss": 5.7173,
"step": 90
},
{
"epoch": 0.030899830220713072,
"grad_norm": 18.907155990600586,
"learning_rate": 5.79421052631579e-05,
"loss": 5.5478,
"step": 91
},
{
"epoch": 0.03123938879456706,
"grad_norm": 13.907947540283203,
"learning_rate": 5.7410526315789475e-05,
"loss": 5.1368,
"step": 92
},
{
"epoch": 0.031578947368421054,
"grad_norm": 21.56955337524414,
"learning_rate": 5.687894736842105e-05,
"loss": 5.4158,
"step": 93
},
{
"epoch": 0.03191850594227504,
"grad_norm": 19.460166931152344,
"learning_rate": 5.6347368421052625e-05,
"loss": 6.2592,
"step": 94
},
{
"epoch": 0.03225806451612903,
"grad_norm": 22.000574111938477,
"learning_rate": 5.5815789473684214e-05,
"loss": 5.9925,
"step": 95
},
{
"epoch": 0.03259762308998302,
"grad_norm": 21.29176139831543,
"learning_rate": 5.5284210526315796e-05,
"loss": 6.2012,
"step": 96
},
{
"epoch": 0.03293718166383701,
"grad_norm": 24.593799591064453,
"learning_rate": 5.475263157894737e-05,
"loss": 6.3382,
"step": 97
},
{
"epoch": 0.033276740237691005,
"grad_norm": 25.268535614013672,
"learning_rate": 5.422105263157895e-05,
"loss": 6.17,
"step": 98
},
{
"epoch": 0.033616298811544994,
"grad_norm": 37.28253173828125,
"learning_rate": 5.368947368421053e-05,
"loss": 7.309,
"step": 99
},
{
"epoch": 0.03395585738539898,
"grad_norm": 61.91118240356445,
"learning_rate": 5.3157894736842104e-05,
"loss": 9.4064,
"step": 100
},
{
"epoch": 0.03395585738539898,
"eval_loss": 3.108811855316162,
"eval_runtime": 133.8842,
"eval_samples_per_second": 9.262,
"eval_steps_per_second": 2.315,
"step": 100
},
{
"epoch": 0.03429541595925297,
"grad_norm": 8.512744903564453,
"learning_rate": 5.262631578947368e-05,
"loss": 6.0228,
"step": 101
},
{
"epoch": 0.03463497453310696,
"grad_norm": 7.949807643890381,
"learning_rate": 5.209473684210527e-05,
"loss": 5.8142,
"step": 102
},
{
"epoch": 0.03497453310696095,
"grad_norm": 6.752256870269775,
"learning_rate": 5.1563157894736844e-05,
"loss": 5.552,
"step": 103
},
{
"epoch": 0.03531409168081494,
"grad_norm": 6.490177631378174,
"learning_rate": 5.1031578947368426e-05,
"loss": 5.8519,
"step": 104
},
{
"epoch": 0.035653650254668934,
"grad_norm": 5.849376678466797,
"learning_rate": 5.05e-05,
"loss": 5.6065,
"step": 105
},
{
"epoch": 0.03599320882852292,
"grad_norm": 6.028791427612305,
"learning_rate": 4.9968421052631576e-05,
"loss": 5.6156,
"step": 106
},
{
"epoch": 0.03633276740237691,
"grad_norm": 5.619626522064209,
"learning_rate": 4.943684210526316e-05,
"loss": 5.2623,
"step": 107
},
{
"epoch": 0.0366723259762309,
"grad_norm": 5.889388084411621,
"learning_rate": 4.890526315789474e-05,
"loss": 5.3881,
"step": 108
},
{
"epoch": 0.03701188455008489,
"grad_norm": 6.1949615478515625,
"learning_rate": 4.8373684210526316e-05,
"loss": 5.1539,
"step": 109
},
{
"epoch": 0.03735144312393888,
"grad_norm": 6.2401442527771,
"learning_rate": 4.784210526315789e-05,
"loss": 5.1559,
"step": 110
},
{
"epoch": 0.03769100169779287,
"grad_norm": 6.51352071762085,
"learning_rate": 4.731052631578947e-05,
"loss": 4.9546,
"step": 111
},
{
"epoch": 0.038030560271646856,
"grad_norm": 7.465339660644531,
"learning_rate": 4.6778947368421055e-05,
"loss": 5.389,
"step": 112
},
{
"epoch": 0.03837011884550085,
"grad_norm": 8.441889762878418,
"learning_rate": 4.624736842105263e-05,
"loss": 5.6321,
"step": 113
},
{
"epoch": 0.03870967741935484,
"grad_norm": 8.055974006652832,
"learning_rate": 4.571578947368421e-05,
"loss": 5.5059,
"step": 114
},
{
"epoch": 0.03904923599320883,
"grad_norm": 7.581737041473389,
"learning_rate": 4.518421052631579e-05,
"loss": 5.1159,
"step": 115
},
{
"epoch": 0.03938879456706282,
"grad_norm": 8.991089820861816,
"learning_rate": 4.465263157894737e-05,
"loss": 5.9822,
"step": 116
},
{
"epoch": 0.03972835314091681,
"grad_norm": 8.726984024047852,
"learning_rate": 4.412105263157895e-05,
"loss": 5.4402,
"step": 117
},
{
"epoch": 0.040067911714770796,
"grad_norm": 8.529667854309082,
"learning_rate": 4.358947368421053e-05,
"loss": 5.2337,
"step": 118
},
{
"epoch": 0.040407470288624785,
"grad_norm": 8.97454833984375,
"learning_rate": 4.30578947368421e-05,
"loss": 5.6533,
"step": 119
},
{
"epoch": 0.04074702886247878,
"grad_norm": 8.54892349243164,
"learning_rate": 4.2526315789473685e-05,
"loss": 5.4748,
"step": 120
},
{
"epoch": 0.04108658743633277,
"grad_norm": 8.859085083007812,
"learning_rate": 4.199473684210527e-05,
"loss": 5.733,
"step": 121
},
{
"epoch": 0.04142614601018676,
"grad_norm": 8.932308197021484,
"learning_rate": 4.146315789473684e-05,
"loss": 5.3355,
"step": 122
},
{
"epoch": 0.04176570458404075,
"grad_norm": 9.009238243103027,
"learning_rate": 4.093157894736842e-05,
"loss": 5.2937,
"step": 123
},
{
"epoch": 0.042105263157894736,
"grad_norm": 8.809886932373047,
"learning_rate": 4.0400000000000006e-05,
"loss": 5.6176,
"step": 124
},
{
"epoch": 0.042444821731748725,
"grad_norm": 10.109439849853516,
"learning_rate": 3.986842105263158e-05,
"loss": 5.5681,
"step": 125
},
{
"epoch": 0.042784380305602714,
"grad_norm": 9.18508243560791,
"learning_rate": 3.933684210526316e-05,
"loss": 5.263,
"step": 126
},
{
"epoch": 0.04312393887945671,
"grad_norm": 10.614432334899902,
"learning_rate": 3.880526315789473e-05,
"loss": 5.6346,
"step": 127
},
{
"epoch": 0.0434634974533107,
"grad_norm": 11.10940933227539,
"learning_rate": 3.827368421052632e-05,
"loss": 5.6721,
"step": 128
},
{
"epoch": 0.04380305602716469,
"grad_norm": 9.783493041992188,
"learning_rate": 3.7742105263157896e-05,
"loss": 5.2759,
"step": 129
},
{
"epoch": 0.044142614601018676,
"grad_norm": 10.094010353088379,
"learning_rate": 3.721052631578947e-05,
"loss": 5.0748,
"step": 130
},
{
"epoch": 0.044482173174872665,
"grad_norm": 11.673230171203613,
"learning_rate": 3.6678947368421054e-05,
"loss": 6.0693,
"step": 131
},
{
"epoch": 0.044821731748726654,
"grad_norm": 13.237796783447266,
"learning_rate": 3.6147368421052636e-05,
"loss": 5.8695,
"step": 132
},
{
"epoch": 0.04516129032258064,
"grad_norm": 11.816963195800781,
"learning_rate": 3.561578947368421e-05,
"loss": 4.9874,
"step": 133
},
{
"epoch": 0.04550084889643464,
"grad_norm": 11.55286979675293,
"learning_rate": 3.508421052631579e-05,
"loss": 5.6631,
"step": 134
},
{
"epoch": 0.04584040747028863,
"grad_norm": 14.232548713684082,
"learning_rate": 3.455263157894737e-05,
"loss": 5.5924,
"step": 135
},
{
"epoch": 0.046179966044142616,
"grad_norm": 14.204998970031738,
"learning_rate": 3.402105263157895e-05,
"loss": 6.4456,
"step": 136
},
{
"epoch": 0.046519524617996605,
"grad_norm": 16.168073654174805,
"learning_rate": 3.3489473684210526e-05,
"loss": 6.0166,
"step": 137
},
{
"epoch": 0.046859083191850594,
"grad_norm": 13.623854637145996,
"learning_rate": 3.295789473684211e-05,
"loss": 4.8727,
"step": 138
},
{
"epoch": 0.04719864176570458,
"grad_norm": 14.182967185974121,
"learning_rate": 3.242631578947368e-05,
"loss": 5.2488,
"step": 139
},
{
"epoch": 0.04753820033955857,
"grad_norm": 15.692301750183105,
"learning_rate": 3.1894736842105265e-05,
"loss": 5.7354,
"step": 140
},
{
"epoch": 0.04787775891341256,
"grad_norm": 13.835912704467773,
"learning_rate": 3.136315789473685e-05,
"loss": 4.7608,
"step": 141
},
{
"epoch": 0.048217317487266556,
"grad_norm": 23.79447364807129,
"learning_rate": 3.083157894736842e-05,
"loss": 5.9428,
"step": 142
},
{
"epoch": 0.048556876061120545,
"grad_norm": 20.368270874023438,
"learning_rate": 3.0299999999999998e-05,
"loss": 5.8896,
"step": 143
},
{
"epoch": 0.048896434634974534,
"grad_norm": 26.974061965942383,
"learning_rate": 2.9768421052631577e-05,
"loss": 6.8198,
"step": 144
},
{
"epoch": 0.04923599320882852,
"grad_norm": 21.44305419921875,
"learning_rate": 2.923684210526316e-05,
"loss": 6.8624,
"step": 145
},
{
"epoch": 0.04957555178268251,
"grad_norm": 22.52785301208496,
"learning_rate": 2.8705263157894737e-05,
"loss": 5.6798,
"step": 146
},
{
"epoch": 0.0499151103565365,
"grad_norm": 27.52121353149414,
"learning_rate": 2.8173684210526313e-05,
"loss": 6.4979,
"step": 147
},
{
"epoch": 0.05025466893039049,
"grad_norm": 34.942691802978516,
"learning_rate": 2.7642105263157898e-05,
"loss": 7.2382,
"step": 148
},
{
"epoch": 0.050594227504244485,
"grad_norm": 44.1684684753418,
"learning_rate": 2.7110526315789473e-05,
"loss": 7.3026,
"step": 149
},
{
"epoch": 0.050933786078098474,
"grad_norm": 29.29156494140625,
"learning_rate": 2.6578947368421052e-05,
"loss": 5.9047,
"step": 150
},
{
"epoch": 0.050933786078098474,
"eval_loss": 2.8410627841949463,
"eval_runtime": 134.1501,
"eval_samples_per_second": 9.243,
"eval_steps_per_second": 2.311,
"step": 150
},
{
"epoch": 0.05127334465195246,
"grad_norm": 4.4673285484313965,
"learning_rate": 2.6047368421052634e-05,
"loss": 5.1138,
"step": 151
},
{
"epoch": 0.05161290322580645,
"grad_norm": 5.355312347412109,
"learning_rate": 2.5515789473684213e-05,
"loss": 5.4617,
"step": 152
},
{
"epoch": 0.05195246179966044,
"grad_norm": 5.351836681365967,
"learning_rate": 2.4984210526315788e-05,
"loss": 5.3663,
"step": 153
},
{
"epoch": 0.05229202037351443,
"grad_norm": 6.066408634185791,
"learning_rate": 2.445263157894737e-05,
"loss": 5.3583,
"step": 154
},
{
"epoch": 0.05263157894736842,
"grad_norm": 5.661759376525879,
"learning_rate": 2.3921052631578946e-05,
"loss": 5.3968,
"step": 155
},
{
"epoch": 0.052971137521222414,
"grad_norm": 5.76517391204834,
"learning_rate": 2.3389473684210528e-05,
"loss": 5.0868,
"step": 156
},
{
"epoch": 0.0533106960950764,
"grad_norm": 6.425754070281982,
"learning_rate": 2.2857894736842106e-05,
"loss": 5.7214,
"step": 157
},
{
"epoch": 0.05365025466893039,
"grad_norm": 6.3002753257751465,
"learning_rate": 2.2326315789473685e-05,
"loss": 4.8879,
"step": 158
},
{
"epoch": 0.05398981324278438,
"grad_norm": 5.956075191497803,
"learning_rate": 2.1794736842105264e-05,
"loss": 5.3174,
"step": 159
},
{
"epoch": 0.05432937181663837,
"grad_norm": 6.668689727783203,
"learning_rate": 2.1263157894736842e-05,
"loss": 5.1679,
"step": 160
},
{
"epoch": 0.05466893039049236,
"grad_norm": 6.492646217346191,
"learning_rate": 2.073157894736842e-05,
"loss": 5.1203,
"step": 161
},
{
"epoch": 0.05500848896434635,
"grad_norm": 6.742474555969238,
"learning_rate": 2.0200000000000003e-05,
"loss": 5.3391,
"step": 162
},
{
"epoch": 0.05534804753820034,
"grad_norm": 7.066228866577148,
"learning_rate": 1.966842105263158e-05,
"loss": 5.2756,
"step": 163
},
{
"epoch": 0.05568760611205433,
"grad_norm": 7.766740798950195,
"learning_rate": 1.913684210526316e-05,
"loss": 5.0874,
"step": 164
},
{
"epoch": 0.05602716468590832,
"grad_norm": 7.978595733642578,
"learning_rate": 1.8605263157894736e-05,
"loss": 5.2179,
"step": 165
},
{
"epoch": 0.05636672325976231,
"grad_norm": 7.731940269470215,
"learning_rate": 1.8073684210526318e-05,
"loss": 5.0202,
"step": 166
},
{
"epoch": 0.0567062818336163,
"grad_norm": 7.676203727722168,
"learning_rate": 1.7542105263157897e-05,
"loss": 5.3613,
"step": 167
},
{
"epoch": 0.05704584040747029,
"grad_norm": 8.548066139221191,
"learning_rate": 1.7010526315789475e-05,
"loss": 5.5421,
"step": 168
},
{
"epoch": 0.057385398981324276,
"grad_norm": 7.955386638641357,
"learning_rate": 1.6478947368421054e-05,
"loss": 5.2485,
"step": 169
},
{
"epoch": 0.057724957555178265,
"grad_norm": 8.47363567352295,
"learning_rate": 1.5947368421052633e-05,
"loss": 5.5402,
"step": 170
},
{
"epoch": 0.05806451612903226,
"grad_norm": 8.166704177856445,
"learning_rate": 1.541578947368421e-05,
"loss": 4.6554,
"step": 171
},
{
"epoch": 0.05840407470288625,
"grad_norm": 9.387300491333008,
"learning_rate": 1.4884210526315788e-05,
"loss": 5.4705,
"step": 172
},
{
"epoch": 0.05874363327674024,
"grad_norm": 11.64377212524414,
"learning_rate": 1.4352631578947369e-05,
"loss": 5.0644,
"step": 173
},
{
"epoch": 0.05908319185059423,
"grad_norm": 9.732513427734375,
"learning_rate": 1.3821052631578949e-05,
"loss": 5.5656,
"step": 174
},
{
"epoch": 0.059422750424448216,
"grad_norm": 9.858539581298828,
"learning_rate": 1.3289473684210526e-05,
"loss": 5.3905,
"step": 175
},
{
"epoch": 0.059762308998302205,
"grad_norm": 11.403061866760254,
"learning_rate": 1.2757894736842106e-05,
"loss": 5.5977,
"step": 176
},
{
"epoch": 0.060101867572156194,
"grad_norm": 9.382144927978516,
"learning_rate": 1.2226315789473685e-05,
"loss": 5.2198,
"step": 177
},
{
"epoch": 0.06044142614601019,
"grad_norm": 12.708952903747559,
"learning_rate": 1.1694736842105264e-05,
"loss": 5.279,
"step": 178
},
{
"epoch": 0.06078098471986418,
"grad_norm": 11.602399826049805,
"learning_rate": 1.1163157894736842e-05,
"loss": 5.6348,
"step": 179
},
{
"epoch": 0.06112054329371817,
"grad_norm": 11.256779670715332,
"learning_rate": 1.0631578947368421e-05,
"loss": 5.3102,
"step": 180
},
{
"epoch": 0.061460101867572156,
"grad_norm": 13.216877937316895,
"learning_rate": 1.0100000000000002e-05,
"loss": 5.7058,
"step": 181
},
{
"epoch": 0.061799660441426145,
"grad_norm": 11.540813446044922,
"learning_rate": 9.56842105263158e-06,
"loss": 5.7871,
"step": 182
},
{
"epoch": 0.062139219015280134,
"grad_norm": 11.003501892089844,
"learning_rate": 9.036842105263159e-06,
"loss": 4.9942,
"step": 183
},
{
"epoch": 0.06247877758913412,
"grad_norm": 12.439997673034668,
"learning_rate": 8.505263157894738e-06,
"loss": 5.6214,
"step": 184
},
{
"epoch": 0.06281833616298811,
"grad_norm": 13.413476943969727,
"learning_rate": 7.973684210526316e-06,
"loss": 5.2558,
"step": 185
},
{
"epoch": 0.06315789473684211,
"grad_norm": 12.21358585357666,
"learning_rate": 7.442105263157894e-06,
"loss": 5.3534,
"step": 186
},
{
"epoch": 0.06349745331069609,
"grad_norm": 17.38026237487793,
"learning_rate": 6.9105263157894745e-06,
"loss": 5.9743,
"step": 187
},
{
"epoch": 0.06383701188455009,
"grad_norm": 14.174423217773438,
"learning_rate": 6.378947368421053e-06,
"loss": 6.3298,
"step": 188
},
{
"epoch": 0.06417657045840408,
"grad_norm": 15.029065132141113,
"learning_rate": 5.847368421052632e-06,
"loss": 6.079,
"step": 189
},
{
"epoch": 0.06451612903225806,
"grad_norm": 18.191574096679688,
"learning_rate": 5.315789473684211e-06,
"loss": 6.2973,
"step": 190
},
{
"epoch": 0.06485568760611206,
"grad_norm": 20.552350997924805,
"learning_rate": 4.78421052631579e-06,
"loss": 5.2437,
"step": 191
},
{
"epoch": 0.06519524617996604,
"grad_norm": 15.131223678588867,
"learning_rate": 4.252631578947369e-06,
"loss": 4.9799,
"step": 192
},
{
"epoch": 0.06553480475382004,
"grad_norm": 19.161144256591797,
"learning_rate": 3.721052631578947e-06,
"loss": 5.4162,
"step": 193
},
{
"epoch": 0.06587436332767402,
"grad_norm": 17.99496841430664,
"learning_rate": 3.1894736842105266e-06,
"loss": 6.2559,
"step": 194
},
{
"epoch": 0.06621392190152801,
"grad_norm": 21.827606201171875,
"learning_rate": 2.6578947368421053e-06,
"loss": 6.9087,
"step": 195
},
{
"epoch": 0.06655348047538201,
"grad_norm": 22.512189865112305,
"learning_rate": 2.1263157894736844e-06,
"loss": 5.9861,
"step": 196
},
{
"epoch": 0.06689303904923599,
"grad_norm": 20.787433624267578,
"learning_rate": 1.5947368421052633e-06,
"loss": 6.6073,
"step": 197
},
{
"epoch": 0.06723259762308999,
"grad_norm": 34.3043098449707,
"learning_rate": 1.0631578947368422e-06,
"loss": 6.9233,
"step": 198
},
{
"epoch": 0.06757215619694397,
"grad_norm": 39.098968505859375,
"learning_rate": 5.315789473684211e-07,
"loss": 7.7742,
"step": 199
},
{
"epoch": 0.06791171477079797,
"grad_norm": 41.64736557006836,
"learning_rate": 0.0,
"loss": 7.8041,
"step": 200
},
{
"epoch": 0.06791171477079797,
"eval_loss": 2.7491917610168457,
"eval_runtime": 134.4966,
"eval_samples_per_second": 9.22,
"eval_steps_per_second": 2.305,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.229663245605274e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}