{ "best_metric": 0.21266202628612518, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.16359918200409, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008179959100204499, "grad_norm": 0.44888365268707275, "learning_rate": 1.012e-05, "loss": 0.8848, "step": 1 }, { "epoch": 0.0008179959100204499, "eval_loss": 0.5526403784751892, "eval_runtime": 164.606, "eval_samples_per_second": 3.129, "eval_steps_per_second": 0.784, "step": 1 }, { "epoch": 0.0016359918200408998, "grad_norm": 0.3820202052593231, "learning_rate": 2.024e-05, "loss": 0.6338, "step": 2 }, { "epoch": 0.00245398773006135, "grad_norm": 1.0369555950164795, "learning_rate": 3.0359999999999997e-05, "loss": 1.3722, "step": 3 }, { "epoch": 0.0032719836400817996, "grad_norm": 0.5409644842147827, "learning_rate": 4.048e-05, "loss": 1.027, "step": 4 }, { "epoch": 0.00408997955010225, "grad_norm": 0.5290982127189636, "learning_rate": 5.06e-05, "loss": 0.7956, "step": 5 }, { "epoch": 0.0049079754601227, "grad_norm": 0.4899097979068756, "learning_rate": 6.0719999999999995e-05, "loss": 0.8705, "step": 6 }, { "epoch": 0.0057259713701431495, "grad_norm": 0.7995485663414001, "learning_rate": 7.083999999999999e-05, "loss": 1.2093, "step": 7 }, { "epoch": 0.006543967280163599, "grad_norm": 0.5504206418991089, "learning_rate": 8.096e-05, "loss": 0.9514, "step": 8 }, { "epoch": 0.007361963190184049, "grad_norm": 0.5944662690162659, "learning_rate": 9.108e-05, "loss": 0.955, "step": 9 }, { "epoch": 0.0081799591002045, "grad_norm": 0.6484851837158203, "learning_rate": 0.0001012, "loss": 0.9926, "step": 10 }, { "epoch": 0.00899795501022495, "grad_norm": 0.6986644864082336, "learning_rate": 0.00010066736842105262, "loss": 0.8318, "step": 11 }, { "epoch": 0.0098159509202454, "grad_norm": 0.6467517018318176, "learning_rate": 0.00010013473684210525, "loss": 0.7294, "step": 12 }, { "epoch": 0.01063394683026585, "grad_norm": 0.7561603784561157, "learning_rate": 9.960210526315788e-05, "loss": 0.8644, "step": 13 }, { "epoch": 0.011451942740286299, "grad_norm": 0.7822588086128235, "learning_rate": 9.906947368421052e-05, "loss": 0.9813, "step": 14 }, { "epoch": 0.012269938650306749, "grad_norm": 0.6852607727050781, "learning_rate": 9.853684210526316e-05, "loss": 0.9142, "step": 15 }, { "epoch": 0.013087934560327199, "grad_norm": 0.6556634902954102, "learning_rate": 9.800421052631579e-05, "loss": 0.5068, "step": 16 }, { "epoch": 0.013905930470347648, "grad_norm": 0.954055905342102, "learning_rate": 9.747157894736841e-05, "loss": 1.126, "step": 17 }, { "epoch": 0.014723926380368098, "grad_norm": 0.8079794645309448, "learning_rate": 9.693894736842104e-05, "loss": 0.963, "step": 18 }, { "epoch": 0.015541922290388548, "grad_norm": 0.8518968820571899, "learning_rate": 9.640631578947367e-05, "loss": 0.9903, "step": 19 }, { "epoch": 0.016359918200409, "grad_norm": 0.5757415890693665, "learning_rate": 9.58736842105263e-05, "loss": 0.4356, "step": 20 }, { "epoch": 0.01717791411042945, "grad_norm": 1.0395272970199585, "learning_rate": 9.534105263157894e-05, "loss": 1.2191, "step": 21 }, { "epoch": 0.0179959100204499, "grad_norm": 1.0517971515655518, "learning_rate": 9.480842105263158e-05, "loss": 1.1783, "step": 22 }, { "epoch": 0.01881390593047035, "grad_norm": 0.9631999135017395, "learning_rate": 9.427578947368421e-05, "loss": 0.9933, "step": 23 }, { "epoch": 0.0196319018404908, "grad_norm": 1.0935978889465332, "learning_rate": 9.374315789473684e-05, "loss": 0.8218, "step": 24 }, { "epoch": 0.02044989775051125, "grad_norm": 0.8494012951850891, "learning_rate": 9.321052631578946e-05, "loss": 0.953, "step": 25 }, { "epoch": 0.0212678936605317, "grad_norm": 0.6680853962898254, "learning_rate": 9.267789473684209e-05, "loss": 0.4734, "step": 26 }, { "epoch": 0.022085889570552148, "grad_norm": 0.7398406267166138, "learning_rate": 9.214526315789473e-05, "loss": 0.6016, "step": 27 }, { "epoch": 0.022903885480572598, "grad_norm": 0.9350616335868835, "learning_rate": 9.161263157894736e-05, "loss": 0.7371, "step": 28 }, { "epoch": 0.023721881390593048, "grad_norm": 0.886838436126709, "learning_rate": 9.108e-05, "loss": 0.8018, "step": 29 }, { "epoch": 0.024539877300613498, "grad_norm": 0.6444780826568604, "learning_rate": 9.054736842105263e-05, "loss": 0.551, "step": 30 }, { "epoch": 0.025357873210633947, "grad_norm": 0.8645835518836975, "learning_rate": 9.001473684210526e-05, "loss": 0.8206, "step": 31 }, { "epoch": 0.026175869120654397, "grad_norm": 1.1317236423492432, "learning_rate": 8.948210526315789e-05, "loss": 0.4924, "step": 32 }, { "epoch": 0.026993865030674847, "grad_norm": 0.7311250567436218, "learning_rate": 8.894947368421051e-05, "loss": 0.4553, "step": 33 }, { "epoch": 0.027811860940695297, "grad_norm": 0.5183742046356201, "learning_rate": 8.841684210526315e-05, "loss": 0.3488, "step": 34 }, { "epoch": 0.028629856850715747, "grad_norm": 0.7674701809883118, "learning_rate": 8.788421052631578e-05, "loss": 0.617, "step": 35 }, { "epoch": 0.029447852760736196, "grad_norm": 0.36555930972099304, "learning_rate": 8.735157894736842e-05, "loss": 0.1531, "step": 36 }, { "epoch": 0.030265848670756646, "grad_norm": 0.23393642902374268, "learning_rate": 8.681894736842105e-05, "loss": 0.0187, "step": 37 }, { "epoch": 0.031083844580777096, "grad_norm": 0.09792309999465942, "learning_rate": 8.628631578947368e-05, "loss": 0.0083, "step": 38 }, { "epoch": 0.03190184049079755, "grad_norm": 0.07559319585561752, "learning_rate": 8.575368421052631e-05, "loss": 0.0057, "step": 39 }, { "epoch": 0.032719836400818, "grad_norm": 0.05586693063378334, "learning_rate": 8.522105263157893e-05, "loss": 0.0034, "step": 40 }, { "epoch": 0.03353783231083845, "grad_norm": 0.08591938763856888, "learning_rate": 8.468842105263158e-05, "loss": 0.0034, "step": 41 }, { "epoch": 0.0343558282208589, "grad_norm": 0.06896223872900009, "learning_rate": 8.41557894736842e-05, "loss": 0.0034, "step": 42 }, { "epoch": 0.03517382413087935, "grad_norm": 0.11430584639310837, "learning_rate": 8.362315789473683e-05, "loss": 0.0046, "step": 43 }, { "epoch": 0.0359918200408998, "grad_norm": 0.1037866622209549, "learning_rate": 8.309052631578947e-05, "loss": 0.0026, "step": 44 }, { "epoch": 0.03680981595092025, "grad_norm": 0.29823267459869385, "learning_rate": 8.25578947368421e-05, "loss": 0.0045, "step": 45 }, { "epoch": 0.0376278118609407, "grad_norm": 0.017684394493699074, "learning_rate": 8.202526315789473e-05, "loss": 0.0005, "step": 46 }, { "epoch": 0.03844580777096115, "grad_norm": 0.021542565897107124, "learning_rate": 8.149263157894736e-05, "loss": 0.0007, "step": 47 }, { "epoch": 0.0392638036809816, "grad_norm": 0.01819503679871559, "learning_rate": 8.096e-05, "loss": 0.0003, "step": 48 }, { "epoch": 0.04008179959100205, "grad_norm": 0.31022578477859497, "learning_rate": 8.042736842105263e-05, "loss": 0.0054, "step": 49 }, { "epoch": 0.0408997955010225, "grad_norm": 0.00996240321546793, "learning_rate": 7.989473684210525e-05, "loss": 0.0003, "step": 50 }, { "epoch": 0.0408997955010225, "eval_loss": 0.3140620291233063, "eval_runtime": 164.9522, "eval_samples_per_second": 3.122, "eval_steps_per_second": 0.782, "step": 50 }, { "epoch": 0.04171779141104295, "grad_norm": 0.9700417518615723, "learning_rate": 7.93621052631579e-05, "loss": 0.975, "step": 51 }, { "epoch": 0.0425357873210634, "grad_norm": 0.7017274498939514, "learning_rate": 7.882947368421052e-05, "loss": 0.801, "step": 52 }, { "epoch": 0.043353783231083846, "grad_norm": 0.5045299530029297, "learning_rate": 7.829684210526315e-05, "loss": 0.7498, "step": 53 }, { "epoch": 0.044171779141104296, "grad_norm": 0.5897574424743652, "learning_rate": 7.776421052631578e-05, "loss": 1.052, "step": 54 }, { "epoch": 0.044989775051124746, "grad_norm": 0.6167125105857849, "learning_rate": 7.723157894736842e-05, "loss": 1.0903, "step": 55 }, { "epoch": 0.045807770961145196, "grad_norm": 0.488067090511322, "learning_rate": 7.669894736842105e-05, "loss": 0.9046, "step": 56 }, { "epoch": 0.046625766871165646, "grad_norm": 0.4243190884590149, "learning_rate": 7.616631578947367e-05, "loss": 0.6169, "step": 57 }, { "epoch": 0.047443762781186095, "grad_norm": 0.4573240876197815, "learning_rate": 7.563368421052632e-05, "loss": 0.5286, "step": 58 }, { "epoch": 0.048261758691206545, "grad_norm": 0.35533568263053894, "learning_rate": 7.510105263157894e-05, "loss": 0.3841, "step": 59 }, { "epoch": 0.049079754601226995, "grad_norm": 3.222236394882202, "learning_rate": 7.456842105263157e-05, "loss": 0.4826, "step": 60 }, { "epoch": 0.049897750511247445, "grad_norm": 0.6317524909973145, "learning_rate": 7.403578947368421e-05, "loss": 0.6264, "step": 61 }, { "epoch": 0.050715746421267895, "grad_norm": 0.9670488238334656, "learning_rate": 7.350315789473684e-05, "loss": 0.3956, "step": 62 }, { "epoch": 0.051533742331288344, "grad_norm": 0.6406692266464233, "learning_rate": 7.297052631578947e-05, "loss": 0.471, "step": 63 }, { "epoch": 0.052351738241308794, "grad_norm": 0.5081653594970703, "learning_rate": 7.24378947368421e-05, "loss": 0.6125, "step": 64 }, { "epoch": 0.053169734151329244, "grad_norm": 0.5987796783447266, "learning_rate": 7.190526315789474e-05, "loss": 0.6309, "step": 65 }, { "epoch": 0.053987730061349694, "grad_norm": 0.6003880500793457, "learning_rate": 7.137263157894736e-05, "loss": 0.7208, "step": 66 }, { "epoch": 0.054805725971370144, "grad_norm": 0.8808057904243469, "learning_rate": 7.083999999999999e-05, "loss": 0.9311, "step": 67 }, { "epoch": 0.05562372188139059, "grad_norm": 0.8602137565612793, "learning_rate": 7.030736842105263e-05, "loss": 0.9426, "step": 68 }, { "epoch": 0.05644171779141104, "grad_norm": 0.5142946243286133, "learning_rate": 6.977473684210526e-05, "loss": 0.5403, "step": 69 }, { "epoch": 0.05725971370143149, "grad_norm": 0.5149444937705994, "learning_rate": 6.924210526315789e-05, "loss": 0.597, "step": 70 }, { "epoch": 0.05807770961145194, "grad_norm": 0.5740354061126709, "learning_rate": 6.870947368421052e-05, "loss": 0.5234, "step": 71 }, { "epoch": 0.05889570552147239, "grad_norm": 3.2240636348724365, "learning_rate": 6.817684210526316e-05, "loss": 0.9182, "step": 72 }, { "epoch": 0.05971370143149284, "grad_norm": 4.175367832183838, "learning_rate": 6.764421052631579e-05, "loss": 0.8576, "step": 73 }, { "epoch": 0.06053169734151329, "grad_norm": 0.5963009595870972, "learning_rate": 6.711157894736841e-05, "loss": 0.6773, "step": 74 }, { "epoch": 0.06134969325153374, "grad_norm": 0.9091707468032837, "learning_rate": 6.657894736842106e-05, "loss": 0.793, "step": 75 }, { "epoch": 0.06216768916155419, "grad_norm": 0.706394374370575, "learning_rate": 6.604631578947368e-05, "loss": 0.6347, "step": 76 }, { "epoch": 0.06298568507157465, "grad_norm": 0.711338222026825, "learning_rate": 6.551368421052631e-05, "loss": 0.7409, "step": 77 }, { "epoch": 0.0638036809815951, "grad_norm": 0.7073589563369751, "learning_rate": 6.498105263157894e-05, "loss": 0.5461, "step": 78 }, { "epoch": 0.06462167689161555, "grad_norm": 0.6385952234268188, "learning_rate": 6.444842105263157e-05, "loss": 0.5371, "step": 79 }, { "epoch": 0.065439672801636, "grad_norm": 0.671297013759613, "learning_rate": 6.391578947368421e-05, "loss": 0.4195, "step": 80 }, { "epoch": 0.06625766871165645, "grad_norm": 0.7944509387016296, "learning_rate": 6.338315789473684e-05, "loss": 0.2961, "step": 81 }, { "epoch": 0.0670756646216769, "grad_norm": 0.4690554440021515, "learning_rate": 6.285052631578948e-05, "loss": 0.3195, "step": 82 }, { "epoch": 0.06789366053169735, "grad_norm": 0.40818992257118225, "learning_rate": 6.23178947368421e-05, "loss": 0.324, "step": 83 }, { "epoch": 0.0687116564417178, "grad_norm": 0.19155316054821014, "learning_rate": 6.178526315789473e-05, "loss": 0.076, "step": 84 }, { "epoch": 0.06952965235173825, "grad_norm": 0.26963499188423157, "learning_rate": 6.125263157894736e-05, "loss": 0.1005, "step": 85 }, { "epoch": 0.0703476482617587, "grad_norm": 0.46540749073028564, "learning_rate": 6.0719999999999995e-05, "loss": 0.1703, "step": 86 }, { "epoch": 0.07116564417177915, "grad_norm": 0.026622101664543152, "learning_rate": 6.018736842105262e-05, "loss": 0.001, "step": 87 }, { "epoch": 0.0719836400817996, "grad_norm": 0.2342638075351715, "learning_rate": 5.965473684210526e-05, "loss": 0.0101, "step": 88 }, { "epoch": 0.07280163599182005, "grad_norm": 0.37686291337013245, "learning_rate": 5.912210526315789e-05, "loss": 0.0078, "step": 89 }, { "epoch": 0.0736196319018405, "grad_norm": 0.10179778933525085, "learning_rate": 5.8589473684210526e-05, "loss": 0.0014, "step": 90 }, { "epoch": 0.07443762781186095, "grad_norm": 0.1271056979894638, "learning_rate": 5.8056842105263154e-05, "loss": 0.0045, "step": 91 }, { "epoch": 0.0752556237218814, "grad_norm": 0.03991863876581192, "learning_rate": 5.752421052631578e-05, "loss": 0.002, "step": 92 }, { "epoch": 0.07607361963190185, "grad_norm": 0.3088296055793762, "learning_rate": 5.6991578947368416e-05, "loss": 0.0097, "step": 93 }, { "epoch": 0.0768916155419223, "grad_norm": 0.01231884490698576, "learning_rate": 5.6458947368421044e-05, "loss": 0.0005, "step": 94 }, { "epoch": 0.07770961145194274, "grad_norm": 0.0380236841738224, "learning_rate": 5.5926315789473685e-05, "loss": 0.0017, "step": 95 }, { "epoch": 0.0785276073619632, "grad_norm": 0.018580930307507515, "learning_rate": 5.539368421052631e-05, "loss": 0.0007, "step": 96 }, { "epoch": 0.07934560327198364, "grad_norm": 0.3009152114391327, "learning_rate": 5.486105263157895e-05, "loss": 0.0008, "step": 97 }, { "epoch": 0.0801635991820041, "grad_norm": 0.14345374703407288, "learning_rate": 5.4328421052631575e-05, "loss": 0.0096, "step": 98 }, { "epoch": 0.08098159509202454, "grad_norm": 0.0597989596426487, "learning_rate": 5.37957894736842e-05, "loss": 0.0013, "step": 99 }, { "epoch": 0.081799591002045, "grad_norm": 0.025775019079446793, "learning_rate": 5.326315789473684e-05, "loss": 0.0011, "step": 100 }, { "epoch": 0.081799591002045, "eval_loss": 0.2703871726989746, "eval_runtime": 165.4305, "eval_samples_per_second": 3.113, "eval_steps_per_second": 0.78, "step": 100 }, { "epoch": 0.08261758691206544, "grad_norm": 3.3351776599884033, "learning_rate": 5.2730526315789465e-05, "loss": 0.8922, "step": 101 }, { "epoch": 0.0834355828220859, "grad_norm": 0.621583878993988, "learning_rate": 5.2197894736842107e-05, "loss": 0.8219, "step": 102 }, { "epoch": 0.08425357873210634, "grad_norm": 0.4286845624446869, "learning_rate": 5.1665263157894734e-05, "loss": 0.6638, "step": 103 }, { "epoch": 0.0850715746421268, "grad_norm": 0.5275766253471375, "learning_rate": 5.113263157894737e-05, "loss": 0.9051, "step": 104 }, { "epoch": 0.08588957055214724, "grad_norm": 0.5137267112731934, "learning_rate": 5.06e-05, "loss": 0.7615, "step": 105 }, { "epoch": 0.08670756646216769, "grad_norm": 0.4253179430961609, "learning_rate": 5.0067368421052624e-05, "loss": 0.6455, "step": 106 }, { "epoch": 0.08752556237218814, "grad_norm": 0.4956965148448944, "learning_rate": 4.953473684210526e-05, "loss": 0.8425, "step": 107 }, { "epoch": 0.08834355828220859, "grad_norm": 0.4571160674095154, "learning_rate": 4.9002105263157893e-05, "loss": 0.6951, "step": 108 }, { "epoch": 0.08916155419222904, "grad_norm": 0.48802193999290466, "learning_rate": 4.846947368421052e-05, "loss": 0.7291, "step": 109 }, { "epoch": 0.08997955010224949, "grad_norm": 0.5465656518936157, "learning_rate": 4.793684210526315e-05, "loss": 0.8595, "step": 110 }, { "epoch": 0.09079754601226994, "grad_norm": 0.4221843183040619, "learning_rate": 4.740421052631579e-05, "loss": 0.5227, "step": 111 }, { "epoch": 0.09161554192229039, "grad_norm": 0.40702882409095764, "learning_rate": 4.687157894736842e-05, "loss": 0.4758, "step": 112 }, { "epoch": 0.09243353783231084, "grad_norm": 0.4591318964958191, "learning_rate": 4.6338947368421046e-05, "loss": 0.5884, "step": 113 }, { "epoch": 0.09325153374233129, "grad_norm": 0.3259945809841156, "learning_rate": 4.580631578947368e-05, "loss": 0.3064, "step": 114 }, { "epoch": 0.09406952965235174, "grad_norm": 0.41009268164634705, "learning_rate": 4.5273684210526315e-05, "loss": 0.4763, "step": 115 }, { "epoch": 0.09488752556237219, "grad_norm": 0.49340561032295227, "learning_rate": 4.474105263157894e-05, "loss": 0.5706, "step": 116 }, { "epoch": 0.09570552147239264, "grad_norm": 0.41743770241737366, "learning_rate": 4.420842105263158e-05, "loss": 0.3968, "step": 117 }, { "epoch": 0.09652351738241309, "grad_norm": 0.5831127166748047, "learning_rate": 4.367578947368421e-05, "loss": 0.789, "step": 118 }, { "epoch": 0.09734151329243354, "grad_norm": 0.540946900844574, "learning_rate": 4.314315789473684e-05, "loss": 0.5618, "step": 119 }, { "epoch": 0.09815950920245399, "grad_norm": 0.5608387589454651, "learning_rate": 4.261052631578947e-05, "loss": 0.7326, "step": 120 }, { "epoch": 0.09897750511247444, "grad_norm": 0.5865150690078735, "learning_rate": 4.20778947368421e-05, "loss": 0.6434, "step": 121 }, { "epoch": 0.09979550102249489, "grad_norm": 0.4052663743495941, "learning_rate": 4.1545263157894736e-05, "loss": 0.4434, "step": 122 }, { "epoch": 0.10061349693251534, "grad_norm": 0.5830983519554138, "learning_rate": 4.1012631578947364e-05, "loss": 0.533, "step": 123 }, { "epoch": 0.10143149284253579, "grad_norm": 0.5231256484985352, "learning_rate": 4.048e-05, "loss": 0.54, "step": 124 }, { "epoch": 0.10224948875255624, "grad_norm": 0.655725359916687, "learning_rate": 3.9947368421052626e-05, "loss": 0.7891, "step": 125 }, { "epoch": 0.10306748466257669, "grad_norm": 0.6883142590522766, "learning_rate": 3.941473684210526e-05, "loss": 0.848, "step": 126 }, { "epoch": 0.10388548057259714, "grad_norm": 0.5699670314788818, "learning_rate": 3.888210526315789e-05, "loss": 0.5417, "step": 127 }, { "epoch": 0.10470347648261759, "grad_norm": 0.6029432415962219, "learning_rate": 3.834947368421052e-05, "loss": 0.5477, "step": 128 }, { "epoch": 0.10552147239263804, "grad_norm": 0.5479352474212646, "learning_rate": 3.781684210526316e-05, "loss": 0.5703, "step": 129 }, { "epoch": 0.10633946830265849, "grad_norm": 0.6330269575119019, "learning_rate": 3.7284210526315786e-05, "loss": 0.7119, "step": 130 }, { "epoch": 0.10715746421267894, "grad_norm": 0.3221192955970764, "learning_rate": 3.675157894736842e-05, "loss": 0.2668, "step": 131 }, { "epoch": 0.10797546012269939, "grad_norm": 0.4486640989780426, "learning_rate": 3.621894736842105e-05, "loss": 0.2938, "step": 132 }, { "epoch": 0.10879345603271984, "grad_norm": 0.6219035983085632, "learning_rate": 3.568631578947368e-05, "loss": 0.7169, "step": 133 }, { "epoch": 0.10961145194274029, "grad_norm": 0.5056197047233582, "learning_rate": 3.515368421052632e-05, "loss": 0.5306, "step": 134 }, { "epoch": 0.11042944785276074, "grad_norm": 0.3415873646736145, "learning_rate": 3.4621052631578945e-05, "loss": 0.2147, "step": 135 }, { "epoch": 0.11124744376278119, "grad_norm": 0.7372704744338989, "learning_rate": 3.408842105263158e-05, "loss": 0.5961, "step": 136 }, { "epoch": 0.11206543967280164, "grad_norm": 0.356452614068985, "learning_rate": 3.355578947368421e-05, "loss": 0.2583, "step": 137 }, { "epoch": 0.11288343558282209, "grad_norm": 0.3617746829986572, "learning_rate": 3.302315789473684e-05, "loss": 0.1477, "step": 138 }, { "epoch": 0.11370143149284254, "grad_norm": 0.18670551478862762, "learning_rate": 3.249052631578947e-05, "loss": 0.0075, "step": 139 }, { "epoch": 0.11451942740286299, "grad_norm": 0.05176525190472603, "learning_rate": 3.1957894736842104e-05, "loss": 0.0016, "step": 140 }, { "epoch": 0.11533742331288344, "grad_norm": 0.04952479153871536, "learning_rate": 3.142526315789474e-05, "loss": 0.0018, "step": 141 }, { "epoch": 0.11615541922290389, "grad_norm": 0.07239986956119537, "learning_rate": 3.0892631578947366e-05, "loss": 0.001, "step": 142 }, { "epoch": 0.11697341513292434, "grad_norm": 0.021206321194767952, "learning_rate": 3.0359999999999997e-05, "loss": 0.0005, "step": 143 }, { "epoch": 0.11779141104294479, "grad_norm": 0.00947723537683487, "learning_rate": 2.982736842105263e-05, "loss": 0.0005, "step": 144 }, { "epoch": 0.11860940695296524, "grad_norm": 0.17608602344989777, "learning_rate": 2.9294736842105263e-05, "loss": 0.0033, "step": 145 }, { "epoch": 0.11942740286298568, "grad_norm": 0.02094121463596821, "learning_rate": 2.876210526315789e-05, "loss": 0.0006, "step": 146 }, { "epoch": 0.12024539877300613, "grad_norm": 0.07817188650369644, "learning_rate": 2.8229473684210522e-05, "loss": 0.0015, "step": 147 }, { "epoch": 0.12106339468302658, "grad_norm": 0.42592841386795044, "learning_rate": 2.7696842105263156e-05, "loss": 0.0054, "step": 148 }, { "epoch": 0.12188139059304703, "grad_norm": 0.007265524938702583, "learning_rate": 2.7164210526315788e-05, "loss": 0.0002, "step": 149 }, { "epoch": 0.12269938650306748, "grad_norm": 0.008094916120171547, "learning_rate": 2.663157894736842e-05, "loss": 0.0004, "step": 150 }, { "epoch": 0.12269938650306748, "eval_loss": 0.2351786196231842, "eval_runtime": 165.3057, "eval_samples_per_second": 3.115, "eval_steps_per_second": 0.78, "step": 150 }, { "epoch": 0.12351738241308793, "grad_norm": 0.430698424577713, "learning_rate": 2.6098947368421053e-05, "loss": 0.838, "step": 151 }, { "epoch": 0.12433537832310838, "grad_norm": 0.4114360213279724, "learning_rate": 2.5566315789473684e-05, "loss": 0.6347, "step": 152 }, { "epoch": 0.12515337423312883, "grad_norm": 5.368963241577148, "learning_rate": 2.5033684210526312e-05, "loss": 1.7455, "step": 153 }, { "epoch": 0.1259713701431493, "grad_norm": 0.36379197239875793, "learning_rate": 2.4501052631578947e-05, "loss": 0.5644, "step": 154 }, { "epoch": 0.12678936605316973, "grad_norm": 0.3544858694076538, "learning_rate": 2.3968421052631575e-05, "loss": 0.5105, "step": 155 }, { "epoch": 0.1276073619631902, "grad_norm": 0.3365378975868225, "learning_rate": 2.343578947368421e-05, "loss": 0.4261, "step": 156 }, { "epoch": 0.12842535787321063, "grad_norm": 0.4293052852153778, "learning_rate": 2.290315789473684e-05, "loss": 0.695, "step": 157 }, { "epoch": 0.1292433537832311, "grad_norm": 0.5024716854095459, "learning_rate": 2.237052631578947e-05, "loss": 0.7874, "step": 158 }, { "epoch": 0.13006134969325153, "grad_norm": 0.4503779113292694, "learning_rate": 2.1837894736842106e-05, "loss": 0.6787, "step": 159 }, { "epoch": 0.130879345603272, "grad_norm": 0.5354055166244507, "learning_rate": 2.1305263157894734e-05, "loss": 0.8901, "step": 160 }, { "epoch": 0.13169734151329243, "grad_norm": 0.6013686656951904, "learning_rate": 2.0772631578947368e-05, "loss": 0.6101, "step": 161 }, { "epoch": 0.1325153374233129, "grad_norm": 0.5253039002418518, "learning_rate": 2.024e-05, "loss": 0.6207, "step": 162 }, { "epoch": 0.13333333333333333, "grad_norm": 0.5484157800674438, "learning_rate": 1.970736842105263e-05, "loss": 0.63, "step": 163 }, { "epoch": 0.1341513292433538, "grad_norm": 0.376302570104599, "learning_rate": 1.917473684210526e-05, "loss": 0.3893, "step": 164 }, { "epoch": 0.13496932515337423, "grad_norm": 0.41201335191726685, "learning_rate": 1.8642105263157893e-05, "loss": 0.4192, "step": 165 }, { "epoch": 0.1357873210633947, "grad_norm": 0.69189453125, "learning_rate": 1.8109473684210524e-05, "loss": 0.8196, "step": 166 }, { "epoch": 0.13660531697341513, "grad_norm": 0.3967001140117645, "learning_rate": 1.757684210526316e-05, "loss": 0.4478, "step": 167 }, { "epoch": 0.1374233128834356, "grad_norm": 0.40037596225738525, "learning_rate": 1.704421052631579e-05, "loss": 0.437, "step": 168 }, { "epoch": 0.13824130879345603, "grad_norm": 0.4589173197746277, "learning_rate": 1.651157894736842e-05, "loss": 0.5041, "step": 169 }, { "epoch": 0.1390593047034765, "grad_norm": 0.5317126512527466, "learning_rate": 1.5978947368421052e-05, "loss": 0.5844, "step": 170 }, { "epoch": 0.13987730061349693, "grad_norm": 0.6097099184989929, "learning_rate": 1.5446315789473683e-05, "loss": 0.8981, "step": 171 }, { "epoch": 0.1406952965235174, "grad_norm": 0.5526396632194519, "learning_rate": 1.4913684210526314e-05, "loss": 0.5214, "step": 172 }, { "epoch": 0.14151329243353783, "grad_norm": 0.49050372838974, "learning_rate": 1.4381052631578945e-05, "loss": 0.3992, "step": 173 }, { "epoch": 0.1423312883435583, "grad_norm": 0.6631816029548645, "learning_rate": 1.3848421052631578e-05, "loss": 0.7089, "step": 174 }, { "epoch": 0.14314928425357873, "grad_norm": 0.6019887328147888, "learning_rate": 1.331578947368421e-05, "loss": 0.6868, "step": 175 }, { "epoch": 0.1439672801635992, "grad_norm": 0.46158432960510254, "learning_rate": 1.2783157894736842e-05, "loss": 0.4053, "step": 176 }, { "epoch": 0.14478527607361963, "grad_norm": 3.7597968578338623, "learning_rate": 1.2250526315789473e-05, "loss": 0.6116, "step": 177 }, { "epoch": 0.1456032719836401, "grad_norm": 0.3607610762119293, "learning_rate": 1.1717894736842105e-05, "loss": 0.2041, "step": 178 }, { "epoch": 0.14642126789366053, "grad_norm": 0.6213468313217163, "learning_rate": 1.1185263157894736e-05, "loss": 0.6546, "step": 179 }, { "epoch": 0.147239263803681, "grad_norm": 0.4181584119796753, "learning_rate": 1.0652631578947367e-05, "loss": 0.3367, "step": 180 }, { "epoch": 0.14805725971370143, "grad_norm": 0.7935755848884583, "learning_rate": 1.012e-05, "loss": 0.7382, "step": 181 }, { "epoch": 0.1488752556237219, "grad_norm": 0.7149289846420288, "learning_rate": 9.58736842105263e-06, "loss": 0.6995, "step": 182 }, { "epoch": 0.14969325153374233, "grad_norm": 0.6175360083580017, "learning_rate": 9.054736842105262e-06, "loss": 0.515, "step": 183 }, { "epoch": 0.1505112474437628, "grad_norm": 0.432099848985672, "learning_rate": 8.522105263157895e-06, "loss": 0.222, "step": 184 }, { "epoch": 0.15132924335378323, "grad_norm": 0.14052408933639526, "learning_rate": 7.989473684210526e-06, "loss": 0.0522, "step": 185 }, { "epoch": 0.1521472392638037, "grad_norm": 0.06722941249608994, "learning_rate": 7.456842105263157e-06, "loss": 0.0043, "step": 186 }, { "epoch": 0.15296523517382413, "grad_norm": 0.015035667456686497, "learning_rate": 6.924210526315789e-06, "loss": 0.0008, "step": 187 }, { "epoch": 0.1537832310838446, "grad_norm": 0.15017950534820557, "learning_rate": 6.391578947368421e-06, "loss": 0.003, "step": 188 }, { "epoch": 0.15460122699386503, "grad_norm": 0.018131496384739876, "learning_rate": 5.858947368421052e-06, "loss": 0.001, "step": 189 }, { "epoch": 0.1554192229038855, "grad_norm": 0.015157670713961124, "learning_rate": 5.326315789473683e-06, "loss": 0.0008, "step": 190 }, { "epoch": 0.15623721881390593, "grad_norm": 0.11725586652755737, "learning_rate": 4.793684210526315e-06, "loss": 0.0067, "step": 191 }, { "epoch": 0.1570552147239264, "grad_norm": 0.043377045542001724, "learning_rate": 4.261052631578947e-06, "loss": 0.0024, "step": 192 }, { "epoch": 0.15787321063394683, "grad_norm": 0.10283850133419037, "learning_rate": 3.7284210526315786e-06, "loss": 0.004, "step": 193 }, { "epoch": 0.1586912065439673, "grad_norm": 0.1786062866449356, "learning_rate": 3.1957894736842106e-06, "loss": 0.0071, "step": 194 }, { "epoch": 0.15950920245398773, "grad_norm": 0.13341274857521057, "learning_rate": 2.6631578947368417e-06, "loss": 0.0007, "step": 195 }, { "epoch": 0.1603271983640082, "grad_norm": 0.1136331856250763, "learning_rate": 2.1305263157894737e-06, "loss": 0.0033, "step": 196 }, { "epoch": 0.16114519427402862, "grad_norm": 0.052339375019073486, "learning_rate": 1.5978947368421053e-06, "loss": 0.002, "step": 197 }, { "epoch": 0.1619631901840491, "grad_norm": 0.01893027499318123, "learning_rate": 1.0652631578947369e-06, "loss": 0.0008, "step": 198 }, { "epoch": 0.16278118609406952, "grad_norm": 0.01076345145702362, "learning_rate": 5.326315789473684e-07, "loss": 0.0006, "step": 199 }, { "epoch": 0.16359918200409, "grad_norm": 0.029393598437309265, "learning_rate": 0.0, "loss": 0.0015, "step": 200 }, { "epoch": 0.16359918200409, "eval_loss": 0.21266202628612518, "eval_runtime": 165.9021, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.778, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.484049111154688e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }