{
"best_metric": 0.0009016587864607573,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 1.0021598272138228,
"eval_steps": 50,
"global_step": 116,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008639308855291577,
"grad_norm": 0.5806592106819153,
"learning_rate": 1e-05,
"loss": 0.0374,
"step": 1
},
{
"epoch": 0.008639308855291577,
"eval_loss": 0.15225130319595337,
"eval_runtime": 20.0655,
"eval_samples_per_second": 9.718,
"eval_steps_per_second": 2.442,
"step": 1
},
{
"epoch": 0.017278617710583154,
"grad_norm": 0.6535840630531311,
"learning_rate": 2e-05,
"loss": 0.0344,
"step": 2
},
{
"epoch": 0.02591792656587473,
"grad_norm": 0.4030662775039673,
"learning_rate": 3e-05,
"loss": 0.0354,
"step": 3
},
{
"epoch": 0.03455723542116631,
"grad_norm": 0.1255313754081726,
"learning_rate": 4e-05,
"loss": 0.0286,
"step": 4
},
{
"epoch": 0.04319654427645788,
"grad_norm": 0.15026195347309113,
"learning_rate": 5e-05,
"loss": 0.0283,
"step": 5
},
{
"epoch": 0.05183585313174946,
"grad_norm": 0.34920528531074524,
"learning_rate": 6e-05,
"loss": 0.0299,
"step": 6
},
{
"epoch": 0.06047516198704104,
"grad_norm": 0.16989673674106598,
"learning_rate": 7e-05,
"loss": 0.0313,
"step": 7
},
{
"epoch": 0.06911447084233262,
"grad_norm": 0.1835932433605194,
"learning_rate": 8e-05,
"loss": 0.0256,
"step": 8
},
{
"epoch": 0.07775377969762419,
"grad_norm": 0.2971993088722229,
"learning_rate": 9e-05,
"loss": 0.0199,
"step": 9
},
{
"epoch": 0.08639308855291576,
"grad_norm": 0.1404445469379425,
"learning_rate": 0.0001,
"loss": 0.005,
"step": 10
},
{
"epoch": 0.09503239740820735,
"grad_norm": 0.36361753940582275,
"learning_rate": 9.997804182543973e-05,
"loss": 0.01,
"step": 11
},
{
"epoch": 0.10367170626349892,
"grad_norm": 0.4649268388748169,
"learning_rate": 9.991218658821608e-05,
"loss": 0.0044,
"step": 12
},
{
"epoch": 0.11231101511879049,
"grad_norm": 0.08656516671180725,
"learning_rate": 9.980249213076084e-05,
"loss": 0.0023,
"step": 13
},
{
"epoch": 0.12095032397408208,
"grad_norm": 0.1461178958415985,
"learning_rate": 9.964905480067586e-05,
"loss": 0.0051,
"step": 14
},
{
"epoch": 0.12958963282937366,
"grad_norm": 0.10659543424844742,
"learning_rate": 9.94520093661082e-05,
"loss": 0.0028,
"step": 15
},
{
"epoch": 0.13822894168466524,
"grad_norm": 0.06367552280426025,
"learning_rate": 9.921152889737984e-05,
"loss": 0.001,
"step": 16
},
{
"epoch": 0.1468682505399568,
"grad_norm": 0.07228197157382965,
"learning_rate": 9.89278246149752e-05,
"loss": 0.0008,
"step": 17
},
{
"epoch": 0.15550755939524838,
"grad_norm": 0.14873471856117249,
"learning_rate": 9.860114570402054e-05,
"loss": 0.0028,
"step": 18
},
{
"epoch": 0.16414686825053995,
"grad_norm": 0.20919160544872284,
"learning_rate": 9.823177909541794e-05,
"loss": 0.0062,
"step": 19
},
{
"epoch": 0.17278617710583152,
"grad_norm": 0.11298267543315887,
"learning_rate": 9.782004921382612e-05,
"loss": 0.0006,
"step": 20
},
{
"epoch": 0.18142548596112312,
"grad_norm": 0.02028828300535679,
"learning_rate": 9.736631769270957e-05,
"loss": 0.0008,
"step": 21
},
{
"epoch": 0.1900647948164147,
"grad_norm": 0.17469586431980133,
"learning_rate": 9.687098305670605e-05,
"loss": 0.0036,
"step": 22
},
{
"epoch": 0.19870410367170627,
"grad_norm": 0.06677763164043427,
"learning_rate": 9.633448037159167e-05,
"loss": 0.0011,
"step": 23
},
{
"epoch": 0.20734341252699784,
"grad_norm": 0.08885731548070908,
"learning_rate": 9.575728086215092e-05,
"loss": 0.002,
"step": 24
},
{
"epoch": 0.2159827213822894,
"grad_norm": 0.07344262301921844,
"learning_rate": 9.513989149828718e-05,
"loss": 0.0016,
"step": 25
},
{
"epoch": 0.22462203023758098,
"grad_norm": 0.012290475890040398,
"learning_rate": 9.448285454973738e-05,
"loss": 0.0006,
"step": 26
},
{
"epoch": 0.23326133909287258,
"grad_norm": 0.2666710913181305,
"learning_rate": 9.378674710978185e-05,
"loss": 0.001,
"step": 27
},
{
"epoch": 0.24190064794816415,
"grad_norm": 0.15356595814228058,
"learning_rate": 9.305218058836778e-05,
"loss": 0.0012,
"step": 28
},
{
"epoch": 0.2505399568034557,
"grad_norm": 0.05029755458235741,
"learning_rate": 9.22798001750913e-05,
"loss": 0.0158,
"step": 29
},
{
"epoch": 0.2591792656587473,
"grad_norm": 0.07061488926410675,
"learning_rate": 9.14702842725101e-05,
"loss": 0.0148,
"step": 30
},
{
"epoch": 0.2678185745140389,
"grad_norm": 0.05950339511036873,
"learning_rate": 9.062434390028407e-05,
"loss": 0.0154,
"step": 31
},
{
"epoch": 0.27645788336933047,
"grad_norm": 0.045820388942956924,
"learning_rate": 8.974272207066767e-05,
"loss": 0.0116,
"step": 32
},
{
"epoch": 0.28509719222462204,
"grad_norm": 0.03941355645656586,
"learning_rate": 8.882619313590212e-05,
"loss": 0.0072,
"step": 33
},
{
"epoch": 0.2937365010799136,
"grad_norm": 0.02324046567082405,
"learning_rate": 8.787556210808101e-05,
"loss": 0.0012,
"step": 34
},
{
"epoch": 0.3023758099352052,
"grad_norm": 0.053608641028404236,
"learning_rate": 8.689166395208636e-05,
"loss": 0.0023,
"step": 35
},
{
"epoch": 0.31101511879049676,
"grad_norm": 0.062077559530735016,
"learning_rate": 8.587536285221656e-05,
"loss": 0.0017,
"step": 36
},
{
"epoch": 0.31965442764578833,
"grad_norm": 0.09231416881084442,
"learning_rate": 8.482755145314986e-05,
"loss": 0.0018,
"step": 37
},
{
"epoch": 0.3282937365010799,
"grad_norm": 0.08266697824001312,
"learning_rate": 8.374915007591053e-05,
"loss": 0.0021,
"step": 38
},
{
"epoch": 0.3369330453563715,
"grad_norm": 0.019569700583815575,
"learning_rate": 8.264110590952609e-05,
"loss": 0.0005,
"step": 39
},
{
"epoch": 0.34557235421166305,
"grad_norm": 0.07023479044437408,
"learning_rate": 8.150439217908556e-05,
"loss": 0.0019,
"step": 40
},
{
"epoch": 0.3542116630669546,
"grad_norm": 0.029872030019760132,
"learning_rate": 8.034000729092968e-05,
"loss": 0.0011,
"step": 41
},
{
"epoch": 0.36285097192224625,
"grad_norm": 0.18075761198997498,
"learning_rate": 7.91489739557236e-05,
"loss": 0.0011,
"step": 42
},
{
"epoch": 0.3714902807775378,
"grad_norm": 0.010835711844265461,
"learning_rate": 7.793233829018262e-05,
"loss": 0.0003,
"step": 43
},
{
"epoch": 0.3801295896328294,
"grad_norm": 0.010719070211052895,
"learning_rate": 7.669116889823955e-05,
"loss": 0.0004,
"step": 44
},
{
"epoch": 0.38876889848812096,
"grad_norm": 0.015587416477501392,
"learning_rate": 7.542655593246103e-05,
"loss": 0.0004,
"step": 45
},
{
"epoch": 0.39740820734341253,
"grad_norm": 0.0032320828177034855,
"learning_rate": 7.413961013653726e-05,
"loss": 0.0002,
"step": 46
},
{
"epoch": 0.4060475161987041,
"grad_norm": 0.24344860017299652,
"learning_rate": 7.283146186968565e-05,
"loss": 0.0009,
"step": 47
},
{
"epoch": 0.4146868250539957,
"grad_norm": 0.1430875062942505,
"learning_rate": 7.150326011382604e-05,
"loss": 0.0021,
"step": 48
},
{
"epoch": 0.42332613390928725,
"grad_norm": 0.01567676290869713,
"learning_rate": 7.015617146439863e-05,
"loss": 0.0002,
"step": 49
},
{
"epoch": 0.4319654427645788,
"grad_norm": 0.11232331395149231,
"learning_rate": 6.879137910571191e-05,
"loss": 0.004,
"step": 50
},
{
"epoch": 0.4319654427645788,
"eval_loss": 0.0022863983176648617,
"eval_runtime": 20.6121,
"eval_samples_per_second": 9.46,
"eval_steps_per_second": 2.377,
"step": 50
},
{
"epoch": 0.4406047516198704,
"grad_norm": 0.023258408531546593,
"learning_rate": 6.741008177171995e-05,
"loss": 0.0002,
"step": 51
},
{
"epoch": 0.44924406047516197,
"grad_norm": 0.015541836619377136,
"learning_rate": 6.601349269314188e-05,
"loss": 0.0004,
"step": 52
},
{
"epoch": 0.45788336933045354,
"grad_norm": 0.004774713423103094,
"learning_rate": 6.460283853184879e-05,
"loss": 0.0003,
"step": 53
},
{
"epoch": 0.46652267818574517,
"grad_norm": 0.1771538257598877,
"learning_rate": 6.317935830345338e-05,
"loss": 0.0037,
"step": 54
},
{
"epoch": 0.47516198704103674,
"grad_norm": 0.10851258039474487,
"learning_rate": 6.174430228904919e-05,
"loss": 0.0021,
"step": 55
},
{
"epoch": 0.4838012958963283,
"grad_norm": 0.029805807396769524,
"learning_rate": 6.029893093705492e-05,
"loss": 0.0004,
"step": 56
},
{
"epoch": 0.4924406047516199,
"grad_norm": 0.05265142768621445,
"learning_rate": 5.884451375612865e-05,
"loss": 0.0072,
"step": 57
},
{
"epoch": 0.5010799136069114,
"grad_norm": 0.06926386058330536,
"learning_rate": 5.738232820012407e-05,
"loss": 0.0062,
"step": 58
},
{
"epoch": 0.509719222462203,
"grad_norm": 0.08675476908683777,
"learning_rate": 5.5913658546068295e-05,
"loss": 0.0054,
"step": 59
},
{
"epoch": 0.5183585313174947,
"grad_norm": 0.03324931487441063,
"learning_rate": 5.4439794766146746e-05,
"loss": 0.0035,
"step": 60
},
{
"epoch": 0.5269978401727862,
"grad_norm": 0.07478857040405273,
"learning_rate": 5.296203139468572e-05,
"loss": 0.0024,
"step": 61
},
{
"epoch": 0.5356371490280778,
"grad_norm": 0.08285272121429443,
"learning_rate": 5.148166639112799e-05,
"loss": 0.0018,
"step": 62
},
{
"epoch": 0.5442764578833693,
"grad_norm": 0.023794766515493393,
"learning_rate": 5e-05,
"loss": 0.0007,
"step": 63
},
{
"epoch": 0.5529157667386609,
"grad_norm": 0.025584915652871132,
"learning_rate": 4.851833360887201e-05,
"loss": 0.0005,
"step": 64
},
{
"epoch": 0.5615550755939525,
"grad_norm": 0.04154638200998306,
"learning_rate": 4.703796860531429e-05,
"loss": 0.0015,
"step": 65
},
{
"epoch": 0.5701943844492441,
"grad_norm": 0.017531629651784897,
"learning_rate": 4.5560205233853266e-05,
"loss": 0.0004,
"step": 66
},
{
"epoch": 0.5788336933045356,
"grad_norm": 0.046652115881443024,
"learning_rate": 4.4086341453931716e-05,
"loss": 0.0005,
"step": 67
},
{
"epoch": 0.5874730021598272,
"grad_norm": 0.21531158685684204,
"learning_rate": 4.2617671799875944e-05,
"loss": 0.0018,
"step": 68
},
{
"epoch": 0.5961123110151187,
"grad_norm": 0.11707913130521774,
"learning_rate": 4.115548624387137e-05,
"loss": 0.0013,
"step": 69
},
{
"epoch": 0.6047516198704104,
"grad_norm": 0.008791811764240265,
"learning_rate": 3.970106906294509e-05,
"loss": 0.0003,
"step": 70
},
{
"epoch": 0.6133909287257019,
"grad_norm": 0.017406433820724487,
"learning_rate": 3.825569771095082e-05,
"loss": 0.0003,
"step": 71
},
{
"epoch": 0.6220302375809935,
"grad_norm": 0.03273649513721466,
"learning_rate": 3.682064169654663e-05,
"loss": 0.0008,
"step": 72
},
{
"epoch": 0.6306695464362851,
"grad_norm": 0.014443274587392807,
"learning_rate": 3.539716146815122e-05,
"loss": 0.0003,
"step": 73
},
{
"epoch": 0.6393088552915767,
"grad_norm": 0.0037852220702916384,
"learning_rate": 3.3986507306858125e-05,
"loss": 0.0002,
"step": 74
},
{
"epoch": 0.6479481641468683,
"grad_norm": 0.006565776187926531,
"learning_rate": 3.258991822828007e-05,
"loss": 0.0002,
"step": 75
},
{
"epoch": 0.6565874730021598,
"grad_norm": 0.0011975999223068357,
"learning_rate": 3.12086208942881e-05,
"loss": 0.0001,
"step": 76
},
{
"epoch": 0.6652267818574514,
"grad_norm": 0.021672353148460388,
"learning_rate": 2.98438285356014e-05,
"loss": 0.0003,
"step": 77
},
{
"epoch": 0.673866090712743,
"grad_norm": 0.010406700894236565,
"learning_rate": 2.8496739886173995e-05,
"loss": 0.0004,
"step": 78
},
{
"epoch": 0.6825053995680346,
"grad_norm": 0.13345706462860107,
"learning_rate": 2.716853813031435e-05,
"loss": 0.0008,
"step": 79
},
{
"epoch": 0.6911447084233261,
"grad_norm": 0.0036847260780632496,
"learning_rate": 2.5860389863462765e-05,
"loss": 0.0002,
"step": 80
},
{
"epoch": 0.6997840172786177,
"grad_norm": 0.031118186190724373,
"learning_rate": 2.4573444067538986e-05,
"loss": 0.0002,
"step": 81
},
{
"epoch": 0.7084233261339092,
"grad_norm": 0.1681869924068451,
"learning_rate": 2.3308831101760486e-05,
"loss": 0.0022,
"step": 82
},
{
"epoch": 0.7170626349892009,
"grad_norm": 0.06865206360816956,
"learning_rate": 2.2067661709817383e-05,
"loss": 0.0009,
"step": 83
},
{
"epoch": 0.7257019438444925,
"grad_norm": 0.15822599828243256,
"learning_rate": 2.0851026044276406e-05,
"loss": 0.0014,
"step": 84
},
{
"epoch": 0.734341252699784,
"grad_norm": 0.0194676723331213,
"learning_rate": 1.9659992709070345e-05,
"loss": 0.002,
"step": 85
},
{
"epoch": 0.7429805615550756,
"grad_norm": 0.031017431989312172,
"learning_rate": 1.849560782091445e-05,
"loss": 0.0021,
"step": 86
},
{
"epoch": 0.7516198704103672,
"grad_norm": 0.03133060783147812,
"learning_rate": 1.7358894090473925e-05,
"loss": 0.0023,
"step": 87
},
{
"epoch": 0.7602591792656588,
"grad_norm": 0.03962257131934166,
"learning_rate": 1.6250849924089484e-05,
"loss": 0.0018,
"step": 88
},
{
"epoch": 0.7688984881209503,
"grad_norm": 0.027627507224678993,
"learning_rate": 1.5172448546850165e-05,
"loss": 0.0015,
"step": 89
},
{
"epoch": 0.7775377969762419,
"grad_norm": 0.021091526374220848,
"learning_rate": 1.4124637147783432e-05,
"loss": 0.0005,
"step": 90
},
{
"epoch": 0.7861771058315334,
"grad_norm": 0.01683308742940426,
"learning_rate": 1.3108336047913633e-05,
"loss": 0.0005,
"step": 91
},
{
"epoch": 0.7948164146868251,
"grad_norm": 0.033081360161304474,
"learning_rate": 1.2124437891918993e-05,
"loss": 0.0004,
"step": 92
},
{
"epoch": 0.8034557235421166,
"grad_norm": 0.023177431896328926,
"learning_rate": 1.1173806864097886e-05,
"loss": 0.0008,
"step": 93
},
{
"epoch": 0.8120950323974082,
"grad_norm": 0.03851751610636711,
"learning_rate": 1.0257277929332332e-05,
"loss": 0.0005,
"step": 94
},
{
"epoch": 0.8207343412526998,
"grad_norm": 0.07173438370227814,
"learning_rate": 9.375656099715934e-06,
"loss": 0.0004,
"step": 95
},
{
"epoch": 0.8293736501079914,
"grad_norm": 0.015922777354717255,
"learning_rate": 8.529715727489912e-06,
"loss": 0.0006,
"step": 96
},
{
"epoch": 0.838012958963283,
"grad_norm": 0.02879387140274048,
"learning_rate": 7.720199824908692e-06,
"loss": 0.0004,
"step": 97
},
{
"epoch": 0.8466522678185745,
"grad_norm": 0.0059148469008505344,
"learning_rate": 6.947819411632223e-06,
"loss": 0.0007,
"step": 98
},
{
"epoch": 0.8552915766738661,
"grad_norm": 0.006823898293077946,
"learning_rate": 6.213252890218163e-06,
"loss": 0.0002,
"step": 99
},
{
"epoch": 0.8639308855291576,
"grad_norm": 0.05277214199304581,
"learning_rate": 5.51714545026264e-06,
"loss": 0.0005,
"step": 100
},
{
"epoch": 0.8639308855291576,
"eval_loss": 0.0009016587864607573,
"eval_runtime": 20.3415,
"eval_samples_per_second": 9.586,
"eval_steps_per_second": 2.409,
"step": 100
},
{
"epoch": 0.8725701943844493,
"grad_norm": 0.012060822919011116,
"learning_rate": 4.860108501712824e-06,
"loss": 0.0003,
"step": 101
},
{
"epoch": 0.8812095032397408,
"grad_norm": 0.060871824622154236,
"learning_rate": 4.242719137849077e-06,
"loss": 0.0007,
"step": 102
},
{
"epoch": 0.8898488120950324,
"grad_norm": 0.04358503967523575,
"learning_rate": 3.6655196284083317e-06,
"loss": 0.0006,
"step": 103
},
{
"epoch": 0.8984881209503239,
"grad_norm": 0.015056795440614223,
"learning_rate": 3.1290169432939553e-06,
"loss": 0.0003,
"step": 104
},
{
"epoch": 0.9071274298056156,
"grad_norm": 0.06826309114694595,
"learning_rate": 2.6336823072904304e-06,
"loss": 0.0019,
"step": 105
},
{
"epoch": 0.9157667386609071,
"grad_norm": 0.004253576509654522,
"learning_rate": 2.179950786173879e-06,
"loss": 0.0002,
"step": 106
},
{
"epoch": 0.9244060475161987,
"grad_norm": 0.027383577078580856,
"learning_rate": 1.7682209045820686e-06,
"loss": 0.0004,
"step": 107
},
{
"epoch": 0.9330453563714903,
"grad_norm": 0.01728072762489319,
"learning_rate": 1.3988542959794627e-06,
"loss": 0.0003,
"step": 108
},
{
"epoch": 0.9416846652267818,
"grad_norm": 0.11028740555047989,
"learning_rate": 1.0721753850247984e-06,
"loss": 0.0021,
"step": 109
},
{
"epoch": 0.9503239740820735,
"grad_norm": 0.007546401582658291,
"learning_rate": 7.884711026201585e-07,
"loss": 0.0002,
"step": 110
},
{
"epoch": 0.958963282937365,
"grad_norm": 0.011820383369922638,
"learning_rate": 5.479906338917984e-07,
"loss": 0.0002,
"step": 111
},
{
"epoch": 0.9676025917926566,
"grad_norm": 0.027329521253705025,
"learning_rate": 3.5094519932415417e-07,
"loss": 0.0002,
"step": 112
},
{
"epoch": 0.9762419006479481,
"grad_norm": 0.027211442589759827,
"learning_rate": 1.975078692391552e-07,
"loss": 0.0012,
"step": 113
},
{
"epoch": 0.9848812095032398,
"grad_norm": 0.013164684176445007,
"learning_rate": 8.781341178393244e-08,
"loss": 0.0004,
"step": 114
},
{
"epoch": 0.9935205183585313,
"grad_norm": 0.0681779533624649,
"learning_rate": 2.1958174560282595e-08,
"loss": 0.0021,
"step": 115
},
{
"epoch": 1.0021598272138228,
"grad_norm": 0.34717699885368347,
"learning_rate": 0.0,
"loss": 0.0036,
"step": 116
}
],
"logging_steps": 1,
"max_steps": 116,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.3075543649878016e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}