{
"best_metric": 0.21266202628612518,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.16359918200409,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008179959100204499,
"grad_norm": 0.44888365268707275,
"learning_rate": 1.012e-05,
"loss": 0.8848,
"step": 1
},
{
"epoch": 0.0008179959100204499,
"eval_loss": 0.5526403784751892,
"eval_runtime": 164.606,
"eval_samples_per_second": 3.129,
"eval_steps_per_second": 0.784,
"step": 1
},
{
"epoch": 0.0016359918200408998,
"grad_norm": 0.3820202052593231,
"learning_rate": 2.024e-05,
"loss": 0.6338,
"step": 2
},
{
"epoch": 0.00245398773006135,
"grad_norm": 1.0369555950164795,
"learning_rate": 3.0359999999999997e-05,
"loss": 1.3722,
"step": 3
},
{
"epoch": 0.0032719836400817996,
"grad_norm": 0.5409644842147827,
"learning_rate": 4.048e-05,
"loss": 1.027,
"step": 4
},
{
"epoch": 0.00408997955010225,
"grad_norm": 0.5290982127189636,
"learning_rate": 5.06e-05,
"loss": 0.7956,
"step": 5
},
{
"epoch": 0.0049079754601227,
"grad_norm": 0.4899097979068756,
"learning_rate": 6.0719999999999995e-05,
"loss": 0.8705,
"step": 6
},
{
"epoch": 0.0057259713701431495,
"grad_norm": 0.7995485663414001,
"learning_rate": 7.083999999999999e-05,
"loss": 1.2093,
"step": 7
},
{
"epoch": 0.006543967280163599,
"grad_norm": 0.5504206418991089,
"learning_rate": 8.096e-05,
"loss": 0.9514,
"step": 8
},
{
"epoch": 0.007361963190184049,
"grad_norm": 0.5944662690162659,
"learning_rate": 9.108e-05,
"loss": 0.955,
"step": 9
},
{
"epoch": 0.0081799591002045,
"grad_norm": 0.6484851837158203,
"learning_rate": 0.0001012,
"loss": 0.9926,
"step": 10
},
{
"epoch": 0.00899795501022495,
"grad_norm": 0.6986644864082336,
"learning_rate": 0.00010066736842105262,
"loss": 0.8318,
"step": 11
},
{
"epoch": 0.0098159509202454,
"grad_norm": 0.6467517018318176,
"learning_rate": 0.00010013473684210525,
"loss": 0.7294,
"step": 12
},
{
"epoch": 0.01063394683026585,
"grad_norm": 0.7561603784561157,
"learning_rate": 9.960210526315788e-05,
"loss": 0.8644,
"step": 13
},
{
"epoch": 0.011451942740286299,
"grad_norm": 0.7822588086128235,
"learning_rate": 9.906947368421052e-05,
"loss": 0.9813,
"step": 14
},
{
"epoch": 0.012269938650306749,
"grad_norm": 0.6852607727050781,
"learning_rate": 9.853684210526316e-05,
"loss": 0.9142,
"step": 15
},
{
"epoch": 0.013087934560327199,
"grad_norm": 0.6556634902954102,
"learning_rate": 9.800421052631579e-05,
"loss": 0.5068,
"step": 16
},
{
"epoch": 0.013905930470347648,
"grad_norm": 0.954055905342102,
"learning_rate": 9.747157894736841e-05,
"loss": 1.126,
"step": 17
},
{
"epoch": 0.014723926380368098,
"grad_norm": 0.8079794645309448,
"learning_rate": 9.693894736842104e-05,
"loss": 0.963,
"step": 18
},
{
"epoch": 0.015541922290388548,
"grad_norm": 0.8518968820571899,
"learning_rate": 9.640631578947367e-05,
"loss": 0.9903,
"step": 19
},
{
"epoch": 0.016359918200409,
"grad_norm": 0.5757415890693665,
"learning_rate": 9.58736842105263e-05,
"loss": 0.4356,
"step": 20
},
{
"epoch": 0.01717791411042945,
"grad_norm": 1.0395272970199585,
"learning_rate": 9.534105263157894e-05,
"loss": 1.2191,
"step": 21
},
{
"epoch": 0.0179959100204499,
"grad_norm": 1.0517971515655518,
"learning_rate": 9.480842105263158e-05,
"loss": 1.1783,
"step": 22
},
{
"epoch": 0.01881390593047035,
"grad_norm": 0.9631999135017395,
"learning_rate": 9.427578947368421e-05,
"loss": 0.9933,
"step": 23
},
{
"epoch": 0.0196319018404908,
"grad_norm": 1.0935978889465332,
"learning_rate": 9.374315789473684e-05,
"loss": 0.8218,
"step": 24
},
{
"epoch": 0.02044989775051125,
"grad_norm": 0.8494012951850891,
"learning_rate": 9.321052631578946e-05,
"loss": 0.953,
"step": 25
},
{
"epoch": 0.0212678936605317,
"grad_norm": 0.6680853962898254,
"learning_rate": 9.267789473684209e-05,
"loss": 0.4734,
"step": 26
},
{
"epoch": 0.022085889570552148,
"grad_norm": 0.7398406267166138,
"learning_rate": 9.214526315789473e-05,
"loss": 0.6016,
"step": 27
},
{
"epoch": 0.022903885480572598,
"grad_norm": 0.9350616335868835,
"learning_rate": 9.161263157894736e-05,
"loss": 0.7371,
"step": 28
},
{
"epoch": 0.023721881390593048,
"grad_norm": 0.886838436126709,
"learning_rate": 9.108e-05,
"loss": 0.8018,
"step": 29
},
{
"epoch": 0.024539877300613498,
"grad_norm": 0.6444780826568604,
"learning_rate": 9.054736842105263e-05,
"loss": 0.551,
"step": 30
},
{
"epoch": 0.025357873210633947,
"grad_norm": 0.8645835518836975,
"learning_rate": 9.001473684210526e-05,
"loss": 0.8206,
"step": 31
},
{
"epoch": 0.026175869120654397,
"grad_norm": 1.1317236423492432,
"learning_rate": 8.948210526315789e-05,
"loss": 0.4924,
"step": 32
},
{
"epoch": 0.026993865030674847,
"grad_norm": 0.7311250567436218,
"learning_rate": 8.894947368421051e-05,
"loss": 0.4553,
"step": 33
},
{
"epoch": 0.027811860940695297,
"grad_norm": 0.5183742046356201,
"learning_rate": 8.841684210526315e-05,
"loss": 0.3488,
"step": 34
},
{
"epoch": 0.028629856850715747,
"grad_norm": 0.7674701809883118,
"learning_rate": 8.788421052631578e-05,
"loss": 0.617,
"step": 35
},
{
"epoch": 0.029447852760736196,
"grad_norm": 0.36555930972099304,
"learning_rate": 8.735157894736842e-05,
"loss": 0.1531,
"step": 36
},
{
"epoch": 0.030265848670756646,
"grad_norm": 0.23393642902374268,
"learning_rate": 8.681894736842105e-05,
"loss": 0.0187,
"step": 37
},
{
"epoch": 0.031083844580777096,
"grad_norm": 0.09792309999465942,
"learning_rate": 8.628631578947368e-05,
"loss": 0.0083,
"step": 38
},
{
"epoch": 0.03190184049079755,
"grad_norm": 0.07559319585561752,
"learning_rate": 8.575368421052631e-05,
"loss": 0.0057,
"step": 39
},
{
"epoch": 0.032719836400818,
"grad_norm": 0.05586693063378334,
"learning_rate": 8.522105263157893e-05,
"loss": 0.0034,
"step": 40
},
{
"epoch": 0.03353783231083845,
"grad_norm": 0.08591938763856888,
"learning_rate": 8.468842105263158e-05,
"loss": 0.0034,
"step": 41
},
{
"epoch": 0.0343558282208589,
"grad_norm": 0.06896223872900009,
"learning_rate": 8.41557894736842e-05,
"loss": 0.0034,
"step": 42
},
{
"epoch": 0.03517382413087935,
"grad_norm": 0.11430584639310837,
"learning_rate": 8.362315789473683e-05,
"loss": 0.0046,
"step": 43
},
{
"epoch": 0.0359918200408998,
"grad_norm": 0.1037866622209549,
"learning_rate": 8.309052631578947e-05,
"loss": 0.0026,
"step": 44
},
{
"epoch": 0.03680981595092025,
"grad_norm": 0.29823267459869385,
"learning_rate": 8.25578947368421e-05,
"loss": 0.0045,
"step": 45
},
{
"epoch": 0.0376278118609407,
"grad_norm": 0.017684394493699074,
"learning_rate": 8.202526315789473e-05,
"loss": 0.0005,
"step": 46
},
{
"epoch": 0.03844580777096115,
"grad_norm": 0.021542565897107124,
"learning_rate": 8.149263157894736e-05,
"loss": 0.0007,
"step": 47
},
{
"epoch": 0.0392638036809816,
"grad_norm": 0.01819503679871559,
"learning_rate": 8.096e-05,
"loss": 0.0003,
"step": 48
},
{
"epoch": 0.04008179959100205,
"grad_norm": 0.31022578477859497,
"learning_rate": 8.042736842105263e-05,
"loss": 0.0054,
"step": 49
},
{
"epoch": 0.0408997955010225,
"grad_norm": 0.00996240321546793,
"learning_rate": 7.989473684210525e-05,
"loss": 0.0003,
"step": 50
},
{
"epoch": 0.0408997955010225,
"eval_loss": 0.3140620291233063,
"eval_runtime": 164.9522,
"eval_samples_per_second": 3.122,
"eval_steps_per_second": 0.782,
"step": 50
},
{
"epoch": 0.04171779141104295,
"grad_norm": 0.9700417518615723,
"learning_rate": 7.93621052631579e-05,
"loss": 0.975,
"step": 51
},
{
"epoch": 0.0425357873210634,
"grad_norm": 0.7017274498939514,
"learning_rate": 7.882947368421052e-05,
"loss": 0.801,
"step": 52
},
{
"epoch": 0.043353783231083846,
"grad_norm": 0.5045299530029297,
"learning_rate": 7.829684210526315e-05,
"loss": 0.7498,
"step": 53
},
{
"epoch": 0.044171779141104296,
"grad_norm": 0.5897574424743652,
"learning_rate": 7.776421052631578e-05,
"loss": 1.052,
"step": 54
},
{
"epoch": 0.044989775051124746,
"grad_norm": 0.6167125105857849,
"learning_rate": 7.723157894736842e-05,
"loss": 1.0903,
"step": 55
},
{
"epoch": 0.045807770961145196,
"grad_norm": 0.488067090511322,
"learning_rate": 7.669894736842105e-05,
"loss": 0.9046,
"step": 56
},
{
"epoch": 0.046625766871165646,
"grad_norm": 0.4243190884590149,
"learning_rate": 7.616631578947367e-05,
"loss": 0.6169,
"step": 57
},
{
"epoch": 0.047443762781186095,
"grad_norm": 0.4573240876197815,
"learning_rate": 7.563368421052632e-05,
"loss": 0.5286,
"step": 58
},
{
"epoch": 0.048261758691206545,
"grad_norm": 0.35533568263053894,
"learning_rate": 7.510105263157894e-05,
"loss": 0.3841,
"step": 59
},
{
"epoch": 0.049079754601226995,
"grad_norm": 3.222236394882202,
"learning_rate": 7.456842105263157e-05,
"loss": 0.4826,
"step": 60
},
{
"epoch": 0.049897750511247445,
"grad_norm": 0.6317524909973145,
"learning_rate": 7.403578947368421e-05,
"loss": 0.6264,
"step": 61
},
{
"epoch": 0.050715746421267895,
"grad_norm": 0.9670488238334656,
"learning_rate": 7.350315789473684e-05,
"loss": 0.3956,
"step": 62
},
{
"epoch": 0.051533742331288344,
"grad_norm": 0.6406692266464233,
"learning_rate": 7.297052631578947e-05,
"loss": 0.471,
"step": 63
},
{
"epoch": 0.052351738241308794,
"grad_norm": 0.5081653594970703,
"learning_rate": 7.24378947368421e-05,
"loss": 0.6125,
"step": 64
},
{
"epoch": 0.053169734151329244,
"grad_norm": 0.5987796783447266,
"learning_rate": 7.190526315789474e-05,
"loss": 0.6309,
"step": 65
},
{
"epoch": 0.053987730061349694,
"grad_norm": 0.6003880500793457,
"learning_rate": 7.137263157894736e-05,
"loss": 0.7208,
"step": 66
},
{
"epoch": 0.054805725971370144,
"grad_norm": 0.8808057904243469,
"learning_rate": 7.083999999999999e-05,
"loss": 0.9311,
"step": 67
},
{
"epoch": 0.05562372188139059,
"grad_norm": 0.8602137565612793,
"learning_rate": 7.030736842105263e-05,
"loss": 0.9426,
"step": 68
},
{
"epoch": 0.05644171779141104,
"grad_norm": 0.5142946243286133,
"learning_rate": 6.977473684210526e-05,
"loss": 0.5403,
"step": 69
},
{
"epoch": 0.05725971370143149,
"grad_norm": 0.5149444937705994,
"learning_rate": 6.924210526315789e-05,
"loss": 0.597,
"step": 70
},
{
"epoch": 0.05807770961145194,
"grad_norm": 0.5740354061126709,
"learning_rate": 6.870947368421052e-05,
"loss": 0.5234,
"step": 71
},
{
"epoch": 0.05889570552147239,
"grad_norm": 3.2240636348724365,
"learning_rate": 6.817684210526316e-05,
"loss": 0.9182,
"step": 72
},
{
"epoch": 0.05971370143149284,
"grad_norm": 4.175367832183838,
"learning_rate": 6.764421052631579e-05,
"loss": 0.8576,
"step": 73
},
{
"epoch": 0.06053169734151329,
"grad_norm": 0.5963009595870972,
"learning_rate": 6.711157894736841e-05,
"loss": 0.6773,
"step": 74
},
{
"epoch": 0.06134969325153374,
"grad_norm": 0.9091707468032837,
"learning_rate": 6.657894736842106e-05,
"loss": 0.793,
"step": 75
},
{
"epoch": 0.06216768916155419,
"grad_norm": 0.706394374370575,
"learning_rate": 6.604631578947368e-05,
"loss": 0.6347,
"step": 76
},
{
"epoch": 0.06298568507157465,
"grad_norm": 0.711338222026825,
"learning_rate": 6.551368421052631e-05,
"loss": 0.7409,
"step": 77
},
{
"epoch": 0.0638036809815951,
"grad_norm": 0.7073589563369751,
"learning_rate": 6.498105263157894e-05,
"loss": 0.5461,
"step": 78
},
{
"epoch": 0.06462167689161555,
"grad_norm": 0.6385952234268188,
"learning_rate": 6.444842105263157e-05,
"loss": 0.5371,
"step": 79
},
{
"epoch": 0.065439672801636,
"grad_norm": 0.671297013759613,
"learning_rate": 6.391578947368421e-05,
"loss": 0.4195,
"step": 80
},
{
"epoch": 0.06625766871165645,
"grad_norm": 0.7944509387016296,
"learning_rate": 6.338315789473684e-05,
"loss": 0.2961,
"step": 81
},
{
"epoch": 0.0670756646216769,
"grad_norm": 0.4690554440021515,
"learning_rate": 6.285052631578948e-05,
"loss": 0.3195,
"step": 82
},
{
"epoch": 0.06789366053169735,
"grad_norm": 0.40818992257118225,
"learning_rate": 6.23178947368421e-05,
"loss": 0.324,
"step": 83
},
{
"epoch": 0.0687116564417178,
"grad_norm": 0.19155316054821014,
"learning_rate": 6.178526315789473e-05,
"loss": 0.076,
"step": 84
},
{
"epoch": 0.06952965235173825,
"grad_norm": 0.26963499188423157,
"learning_rate": 6.125263157894736e-05,
"loss": 0.1005,
"step": 85
},
{
"epoch": 0.0703476482617587,
"grad_norm": 0.46540749073028564,
"learning_rate": 6.0719999999999995e-05,
"loss": 0.1703,
"step": 86
},
{
"epoch": 0.07116564417177915,
"grad_norm": 0.026622101664543152,
"learning_rate": 6.018736842105262e-05,
"loss": 0.001,
"step": 87
},
{
"epoch": 0.0719836400817996,
"grad_norm": 0.2342638075351715,
"learning_rate": 5.965473684210526e-05,
"loss": 0.0101,
"step": 88
},
{
"epoch": 0.07280163599182005,
"grad_norm": 0.37686291337013245,
"learning_rate": 5.912210526315789e-05,
"loss": 0.0078,
"step": 89
},
{
"epoch": 0.0736196319018405,
"grad_norm": 0.10179778933525085,
"learning_rate": 5.8589473684210526e-05,
"loss": 0.0014,
"step": 90
},
{
"epoch": 0.07443762781186095,
"grad_norm": 0.1271056979894638,
"learning_rate": 5.8056842105263154e-05,
"loss": 0.0045,
"step": 91
},
{
"epoch": 0.0752556237218814,
"grad_norm": 0.03991863876581192,
"learning_rate": 5.752421052631578e-05,
"loss": 0.002,
"step": 92
},
{
"epoch": 0.07607361963190185,
"grad_norm": 0.3088296055793762,
"learning_rate": 5.6991578947368416e-05,
"loss": 0.0097,
"step": 93
},
{
"epoch": 0.0768916155419223,
"grad_norm": 0.01231884490698576,
"learning_rate": 5.6458947368421044e-05,
"loss": 0.0005,
"step": 94
},
{
"epoch": 0.07770961145194274,
"grad_norm": 0.0380236841738224,
"learning_rate": 5.5926315789473685e-05,
"loss": 0.0017,
"step": 95
},
{
"epoch": 0.0785276073619632,
"grad_norm": 0.018580930307507515,
"learning_rate": 5.539368421052631e-05,
"loss": 0.0007,
"step": 96
},
{
"epoch": 0.07934560327198364,
"grad_norm": 0.3009152114391327,
"learning_rate": 5.486105263157895e-05,
"loss": 0.0008,
"step": 97
},
{
"epoch": 0.0801635991820041,
"grad_norm": 0.14345374703407288,
"learning_rate": 5.4328421052631575e-05,
"loss": 0.0096,
"step": 98
},
{
"epoch": 0.08098159509202454,
"grad_norm": 0.0597989596426487,
"learning_rate": 5.37957894736842e-05,
"loss": 0.0013,
"step": 99
},
{
"epoch": 0.081799591002045,
"grad_norm": 0.025775019079446793,
"learning_rate": 5.326315789473684e-05,
"loss": 0.0011,
"step": 100
},
{
"epoch": 0.081799591002045,
"eval_loss": 0.2703871726989746,
"eval_runtime": 165.4305,
"eval_samples_per_second": 3.113,
"eval_steps_per_second": 0.78,
"step": 100
},
{
"epoch": 0.08261758691206544,
"grad_norm": 3.3351776599884033,
"learning_rate": 5.2730526315789465e-05,
"loss": 0.8922,
"step": 101
},
{
"epoch": 0.0834355828220859,
"grad_norm": 0.621583878993988,
"learning_rate": 5.2197894736842107e-05,
"loss": 0.8219,
"step": 102
},
{
"epoch": 0.08425357873210634,
"grad_norm": 0.4286845624446869,
"learning_rate": 5.1665263157894734e-05,
"loss": 0.6638,
"step": 103
},
{
"epoch": 0.0850715746421268,
"grad_norm": 0.5275766253471375,
"learning_rate": 5.113263157894737e-05,
"loss": 0.9051,
"step": 104
},
{
"epoch": 0.08588957055214724,
"grad_norm": 0.5137267112731934,
"learning_rate": 5.06e-05,
"loss": 0.7615,
"step": 105
},
{
"epoch": 0.08670756646216769,
"grad_norm": 0.4253179430961609,
"learning_rate": 5.0067368421052624e-05,
"loss": 0.6455,
"step": 106
},
{
"epoch": 0.08752556237218814,
"grad_norm": 0.4956965148448944,
"learning_rate": 4.953473684210526e-05,
"loss": 0.8425,
"step": 107
},
{
"epoch": 0.08834355828220859,
"grad_norm": 0.4571160674095154,
"learning_rate": 4.9002105263157893e-05,
"loss": 0.6951,
"step": 108
},
{
"epoch": 0.08916155419222904,
"grad_norm": 0.48802193999290466,
"learning_rate": 4.846947368421052e-05,
"loss": 0.7291,
"step": 109
},
{
"epoch": 0.08997955010224949,
"grad_norm": 0.5465656518936157,
"learning_rate": 4.793684210526315e-05,
"loss": 0.8595,
"step": 110
},
{
"epoch": 0.09079754601226994,
"grad_norm": 0.4221843183040619,
"learning_rate": 4.740421052631579e-05,
"loss": 0.5227,
"step": 111
},
{
"epoch": 0.09161554192229039,
"grad_norm": 0.40702882409095764,
"learning_rate": 4.687157894736842e-05,
"loss": 0.4758,
"step": 112
},
{
"epoch": 0.09243353783231084,
"grad_norm": 0.4591318964958191,
"learning_rate": 4.6338947368421046e-05,
"loss": 0.5884,
"step": 113
},
{
"epoch": 0.09325153374233129,
"grad_norm": 0.3259945809841156,
"learning_rate": 4.580631578947368e-05,
"loss": 0.3064,
"step": 114
},
{
"epoch": 0.09406952965235174,
"grad_norm": 0.41009268164634705,
"learning_rate": 4.5273684210526315e-05,
"loss": 0.4763,
"step": 115
},
{
"epoch": 0.09488752556237219,
"grad_norm": 0.49340561032295227,
"learning_rate": 4.474105263157894e-05,
"loss": 0.5706,
"step": 116
},
{
"epoch": 0.09570552147239264,
"grad_norm": 0.41743770241737366,
"learning_rate": 4.420842105263158e-05,
"loss": 0.3968,
"step": 117
},
{
"epoch": 0.09652351738241309,
"grad_norm": 0.5831127166748047,
"learning_rate": 4.367578947368421e-05,
"loss": 0.789,
"step": 118
},
{
"epoch": 0.09734151329243354,
"grad_norm": 0.540946900844574,
"learning_rate": 4.314315789473684e-05,
"loss": 0.5618,
"step": 119
},
{
"epoch": 0.09815950920245399,
"grad_norm": 0.5608387589454651,
"learning_rate": 4.261052631578947e-05,
"loss": 0.7326,
"step": 120
},
{
"epoch": 0.09897750511247444,
"grad_norm": 0.5865150690078735,
"learning_rate": 4.20778947368421e-05,
"loss": 0.6434,
"step": 121
},
{
"epoch": 0.09979550102249489,
"grad_norm": 0.4052663743495941,
"learning_rate": 4.1545263157894736e-05,
"loss": 0.4434,
"step": 122
},
{
"epoch": 0.10061349693251534,
"grad_norm": 0.5830983519554138,
"learning_rate": 4.1012631578947364e-05,
"loss": 0.533,
"step": 123
},
{
"epoch": 0.10143149284253579,
"grad_norm": 0.5231256484985352,
"learning_rate": 4.048e-05,
"loss": 0.54,
"step": 124
},
{
"epoch": 0.10224948875255624,
"grad_norm": 0.655725359916687,
"learning_rate": 3.9947368421052626e-05,
"loss": 0.7891,
"step": 125
},
{
"epoch": 0.10306748466257669,
"grad_norm": 0.6883142590522766,
"learning_rate": 3.941473684210526e-05,
"loss": 0.848,
"step": 126
},
{
"epoch": 0.10388548057259714,
"grad_norm": 0.5699670314788818,
"learning_rate": 3.888210526315789e-05,
"loss": 0.5417,
"step": 127
},
{
"epoch": 0.10470347648261759,
"grad_norm": 0.6029432415962219,
"learning_rate": 3.834947368421052e-05,
"loss": 0.5477,
"step": 128
},
{
"epoch": 0.10552147239263804,
"grad_norm": 0.5479352474212646,
"learning_rate": 3.781684210526316e-05,
"loss": 0.5703,
"step": 129
},
{
"epoch": 0.10633946830265849,
"grad_norm": 0.6330269575119019,
"learning_rate": 3.7284210526315786e-05,
"loss": 0.7119,
"step": 130
},
{
"epoch": 0.10715746421267894,
"grad_norm": 0.3221192955970764,
"learning_rate": 3.675157894736842e-05,
"loss": 0.2668,
"step": 131
},
{
"epoch": 0.10797546012269939,
"grad_norm": 0.4486640989780426,
"learning_rate": 3.621894736842105e-05,
"loss": 0.2938,
"step": 132
},
{
"epoch": 0.10879345603271984,
"grad_norm": 0.6219035983085632,
"learning_rate": 3.568631578947368e-05,
"loss": 0.7169,
"step": 133
},
{
"epoch": 0.10961145194274029,
"grad_norm": 0.5056197047233582,
"learning_rate": 3.515368421052632e-05,
"loss": 0.5306,
"step": 134
},
{
"epoch": 0.11042944785276074,
"grad_norm": 0.3415873646736145,
"learning_rate": 3.4621052631578945e-05,
"loss": 0.2147,
"step": 135
},
{
"epoch": 0.11124744376278119,
"grad_norm": 0.7372704744338989,
"learning_rate": 3.408842105263158e-05,
"loss": 0.5961,
"step": 136
},
{
"epoch": 0.11206543967280164,
"grad_norm": 0.356452614068985,
"learning_rate": 3.355578947368421e-05,
"loss": 0.2583,
"step": 137
},
{
"epoch": 0.11288343558282209,
"grad_norm": 0.3617746829986572,
"learning_rate": 3.302315789473684e-05,
"loss": 0.1477,
"step": 138
},
{
"epoch": 0.11370143149284254,
"grad_norm": 0.18670551478862762,
"learning_rate": 3.249052631578947e-05,
"loss": 0.0075,
"step": 139
},
{
"epoch": 0.11451942740286299,
"grad_norm": 0.05176525190472603,
"learning_rate": 3.1957894736842104e-05,
"loss": 0.0016,
"step": 140
},
{
"epoch": 0.11533742331288344,
"grad_norm": 0.04952479153871536,
"learning_rate": 3.142526315789474e-05,
"loss": 0.0018,
"step": 141
},
{
"epoch": 0.11615541922290389,
"grad_norm": 0.07239986956119537,
"learning_rate": 3.0892631578947366e-05,
"loss": 0.001,
"step": 142
},
{
"epoch": 0.11697341513292434,
"grad_norm": 0.021206321194767952,
"learning_rate": 3.0359999999999997e-05,
"loss": 0.0005,
"step": 143
},
{
"epoch": 0.11779141104294479,
"grad_norm": 0.00947723537683487,
"learning_rate": 2.982736842105263e-05,
"loss": 0.0005,
"step": 144
},
{
"epoch": 0.11860940695296524,
"grad_norm": 0.17608602344989777,
"learning_rate": 2.9294736842105263e-05,
"loss": 0.0033,
"step": 145
},
{
"epoch": 0.11942740286298568,
"grad_norm": 0.02094121463596821,
"learning_rate": 2.876210526315789e-05,
"loss": 0.0006,
"step": 146
},
{
"epoch": 0.12024539877300613,
"grad_norm": 0.07817188650369644,
"learning_rate": 2.8229473684210522e-05,
"loss": 0.0015,
"step": 147
},
{
"epoch": 0.12106339468302658,
"grad_norm": 0.42592841386795044,
"learning_rate": 2.7696842105263156e-05,
"loss": 0.0054,
"step": 148
},
{
"epoch": 0.12188139059304703,
"grad_norm": 0.007265524938702583,
"learning_rate": 2.7164210526315788e-05,
"loss": 0.0002,
"step": 149
},
{
"epoch": 0.12269938650306748,
"grad_norm": 0.008094916120171547,
"learning_rate": 2.663157894736842e-05,
"loss": 0.0004,
"step": 150
},
{
"epoch": 0.12269938650306748,
"eval_loss": 0.2351786196231842,
"eval_runtime": 165.3057,
"eval_samples_per_second": 3.115,
"eval_steps_per_second": 0.78,
"step": 150
},
{
"epoch": 0.12351738241308793,
"grad_norm": 0.430698424577713,
"learning_rate": 2.6098947368421053e-05,
"loss": 0.838,
"step": 151
},
{
"epoch": 0.12433537832310838,
"grad_norm": 0.4114360213279724,
"learning_rate": 2.5566315789473684e-05,
"loss": 0.6347,
"step": 152
},
{
"epoch": 0.12515337423312883,
"grad_norm": 5.368963241577148,
"learning_rate": 2.5033684210526312e-05,
"loss": 1.7455,
"step": 153
},
{
"epoch": 0.1259713701431493,
"grad_norm": 0.36379197239875793,
"learning_rate": 2.4501052631578947e-05,
"loss": 0.5644,
"step": 154
},
{
"epoch": 0.12678936605316973,
"grad_norm": 0.3544858694076538,
"learning_rate": 2.3968421052631575e-05,
"loss": 0.5105,
"step": 155
},
{
"epoch": 0.1276073619631902,
"grad_norm": 0.3365378975868225,
"learning_rate": 2.343578947368421e-05,
"loss": 0.4261,
"step": 156
},
{
"epoch": 0.12842535787321063,
"grad_norm": 0.4293052852153778,
"learning_rate": 2.290315789473684e-05,
"loss": 0.695,
"step": 157
},
{
"epoch": 0.1292433537832311,
"grad_norm": 0.5024716854095459,
"learning_rate": 2.237052631578947e-05,
"loss": 0.7874,
"step": 158
},
{
"epoch": 0.13006134969325153,
"grad_norm": 0.4503779113292694,
"learning_rate": 2.1837894736842106e-05,
"loss": 0.6787,
"step": 159
},
{
"epoch": 0.130879345603272,
"grad_norm": 0.5354055166244507,
"learning_rate": 2.1305263157894734e-05,
"loss": 0.8901,
"step": 160
},
{
"epoch": 0.13169734151329243,
"grad_norm": 0.6013686656951904,
"learning_rate": 2.0772631578947368e-05,
"loss": 0.6101,
"step": 161
},
{
"epoch": 0.1325153374233129,
"grad_norm": 0.5253039002418518,
"learning_rate": 2.024e-05,
"loss": 0.6207,
"step": 162
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.5484157800674438,
"learning_rate": 1.970736842105263e-05,
"loss": 0.63,
"step": 163
},
{
"epoch": 0.1341513292433538,
"grad_norm": 0.376302570104599,
"learning_rate": 1.917473684210526e-05,
"loss": 0.3893,
"step": 164
},
{
"epoch": 0.13496932515337423,
"grad_norm": 0.41201335191726685,
"learning_rate": 1.8642105263157893e-05,
"loss": 0.4192,
"step": 165
},
{
"epoch": 0.1357873210633947,
"grad_norm": 0.69189453125,
"learning_rate": 1.8109473684210524e-05,
"loss": 0.8196,
"step": 166
},
{
"epoch": 0.13660531697341513,
"grad_norm": 0.3967001140117645,
"learning_rate": 1.757684210526316e-05,
"loss": 0.4478,
"step": 167
},
{
"epoch": 0.1374233128834356,
"grad_norm": 0.40037596225738525,
"learning_rate": 1.704421052631579e-05,
"loss": 0.437,
"step": 168
},
{
"epoch": 0.13824130879345603,
"grad_norm": 0.4589173197746277,
"learning_rate": 1.651157894736842e-05,
"loss": 0.5041,
"step": 169
},
{
"epoch": 0.1390593047034765,
"grad_norm": 0.5317126512527466,
"learning_rate": 1.5978947368421052e-05,
"loss": 0.5844,
"step": 170
},
{
"epoch": 0.13987730061349693,
"grad_norm": 0.6097099184989929,
"learning_rate": 1.5446315789473683e-05,
"loss": 0.8981,
"step": 171
},
{
"epoch": 0.1406952965235174,
"grad_norm": 0.5526396632194519,
"learning_rate": 1.4913684210526314e-05,
"loss": 0.5214,
"step": 172
},
{
"epoch": 0.14151329243353783,
"grad_norm": 0.49050372838974,
"learning_rate": 1.4381052631578945e-05,
"loss": 0.3992,
"step": 173
},
{
"epoch": 0.1423312883435583,
"grad_norm": 0.6631816029548645,
"learning_rate": 1.3848421052631578e-05,
"loss": 0.7089,
"step": 174
},
{
"epoch": 0.14314928425357873,
"grad_norm": 0.6019887328147888,
"learning_rate": 1.331578947368421e-05,
"loss": 0.6868,
"step": 175
},
{
"epoch": 0.1439672801635992,
"grad_norm": 0.46158432960510254,
"learning_rate": 1.2783157894736842e-05,
"loss": 0.4053,
"step": 176
},
{
"epoch": 0.14478527607361963,
"grad_norm": 3.7597968578338623,
"learning_rate": 1.2250526315789473e-05,
"loss": 0.6116,
"step": 177
},
{
"epoch": 0.1456032719836401,
"grad_norm": 0.3607610762119293,
"learning_rate": 1.1717894736842105e-05,
"loss": 0.2041,
"step": 178
},
{
"epoch": 0.14642126789366053,
"grad_norm": 0.6213468313217163,
"learning_rate": 1.1185263157894736e-05,
"loss": 0.6546,
"step": 179
},
{
"epoch": 0.147239263803681,
"grad_norm": 0.4181584119796753,
"learning_rate": 1.0652631578947367e-05,
"loss": 0.3367,
"step": 180
},
{
"epoch": 0.14805725971370143,
"grad_norm": 0.7935755848884583,
"learning_rate": 1.012e-05,
"loss": 0.7382,
"step": 181
},
{
"epoch": 0.1488752556237219,
"grad_norm": 0.7149289846420288,
"learning_rate": 9.58736842105263e-06,
"loss": 0.6995,
"step": 182
},
{
"epoch": 0.14969325153374233,
"grad_norm": 0.6175360083580017,
"learning_rate": 9.054736842105262e-06,
"loss": 0.515,
"step": 183
},
{
"epoch": 0.1505112474437628,
"grad_norm": 0.432099848985672,
"learning_rate": 8.522105263157895e-06,
"loss": 0.222,
"step": 184
},
{
"epoch": 0.15132924335378323,
"grad_norm": 0.14052408933639526,
"learning_rate": 7.989473684210526e-06,
"loss": 0.0522,
"step": 185
},
{
"epoch": 0.1521472392638037,
"grad_norm": 0.06722941249608994,
"learning_rate": 7.456842105263157e-06,
"loss": 0.0043,
"step": 186
},
{
"epoch": 0.15296523517382413,
"grad_norm": 0.015035667456686497,
"learning_rate": 6.924210526315789e-06,
"loss": 0.0008,
"step": 187
},
{
"epoch": 0.1537832310838446,
"grad_norm": 0.15017950534820557,
"learning_rate": 6.391578947368421e-06,
"loss": 0.003,
"step": 188
},
{
"epoch": 0.15460122699386503,
"grad_norm": 0.018131496384739876,
"learning_rate": 5.858947368421052e-06,
"loss": 0.001,
"step": 189
},
{
"epoch": 0.1554192229038855,
"grad_norm": 0.015157670713961124,
"learning_rate": 5.326315789473683e-06,
"loss": 0.0008,
"step": 190
},
{
"epoch": 0.15623721881390593,
"grad_norm": 0.11725586652755737,
"learning_rate": 4.793684210526315e-06,
"loss": 0.0067,
"step": 191
},
{
"epoch": 0.1570552147239264,
"grad_norm": 0.043377045542001724,
"learning_rate": 4.261052631578947e-06,
"loss": 0.0024,
"step": 192
},
{
"epoch": 0.15787321063394683,
"grad_norm": 0.10283850133419037,
"learning_rate": 3.7284210526315786e-06,
"loss": 0.004,
"step": 193
},
{
"epoch": 0.1586912065439673,
"grad_norm": 0.1786062866449356,
"learning_rate": 3.1957894736842106e-06,
"loss": 0.0071,
"step": 194
},
{
"epoch": 0.15950920245398773,
"grad_norm": 0.13341274857521057,
"learning_rate": 2.6631578947368417e-06,
"loss": 0.0007,
"step": 195
},
{
"epoch": 0.1603271983640082,
"grad_norm": 0.1136331856250763,
"learning_rate": 2.1305263157894737e-06,
"loss": 0.0033,
"step": 196
},
{
"epoch": 0.16114519427402862,
"grad_norm": 0.052339375019073486,
"learning_rate": 1.5978947368421053e-06,
"loss": 0.002,
"step": 197
},
{
"epoch": 0.1619631901840491,
"grad_norm": 0.01893027499318123,
"learning_rate": 1.0652631578947369e-06,
"loss": 0.0008,
"step": 198
},
{
"epoch": 0.16278118609406952,
"grad_norm": 0.01076345145702362,
"learning_rate": 5.326315789473684e-07,
"loss": 0.0006,
"step": 199
},
{
"epoch": 0.16359918200409,
"grad_norm": 0.029393598437309265,
"learning_rate": 0.0,
"loss": 0.0015,
"step": 200
},
{
"epoch": 0.16359918200409,
"eval_loss": 0.21266202628612518,
"eval_runtime": 165.9021,
"eval_samples_per_second": 3.104,
"eval_steps_per_second": 0.778,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.484049111154688e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}