{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995169859925938, "eval_steps": 100, "global_step": 1552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006440186765416197, "grad_norm": 3.671563370875989, "learning_rate": 1.9999979512632042e-05, "loss": 0.9753, "step": 1 }, { "epoch": 0.0012880373530832394, "grad_norm": 20.647647780962263, "learning_rate": 1.999991805061211e-05, "loss": 0.9156, "step": 2 }, { "epoch": 0.0019320560296248591, "grad_norm": 4.391155304153217, "learning_rate": 1.999981561419204e-05, "loss": 0.8462, "step": 3 }, { "epoch": 0.002576074706166479, "grad_norm": 2.9289844591625087, "learning_rate": 1.9999672203791564e-05, "loss": 0.7719, "step": 4 }, { "epoch": 0.0032200933827080985, "grad_norm": 6.499434041986469, "learning_rate": 1.9999487819998307e-05, "loss": 0.809, "step": 5 }, { "epoch": 0.0038641120592497183, "grad_norm": 3.1578817580531076, "learning_rate": 1.9999262463567772e-05, "loss": 0.7768, "step": 6 }, { "epoch": 0.0045081307357913375, "grad_norm": 1.9251496441273475, "learning_rate": 1.9998996135423355e-05, "loss": 0.6968, "step": 7 }, { "epoch": 0.005152149412332958, "grad_norm": 2.809973793834585, "learning_rate": 1.9998688836656322e-05, "loss": 0.7339, "step": 8 }, { "epoch": 0.005796168088874577, "grad_norm": 2.06855897307237, "learning_rate": 1.9998340568525828e-05, "loss": 0.7167, "step": 9 }, { "epoch": 0.006440186765416197, "grad_norm": 1.8722992612958684, "learning_rate": 1.999795133245889e-05, "loss": 0.6559, "step": 10 }, { "epoch": 0.007084205441957816, "grad_norm": 1.895806361132927, "learning_rate": 1.9997521130050393e-05, "loss": 0.6877, "step": 11 }, { "epoch": 0.0077282241184994365, "grad_norm": 1.0822049952365729, "learning_rate": 1.999704996306308e-05, "loss": 0.6593, "step": 12 }, { "epoch": 0.008372242795041056, "grad_norm": 1.2651875000028487, "learning_rate": 1.9996537833427547e-05, "loss": 0.6588, "step": 13 }, { "epoch": 0.009016261471582675, "grad_norm": 0.981455409776805, "learning_rate": 1.999598474324223e-05, "loss": 0.6519, "step": 14 }, { "epoch": 0.009660280148124296, "grad_norm": 1.0238044991137212, "learning_rate": 1.9995390694773396e-05, "loss": 0.6218, "step": 15 }, { "epoch": 0.010304298824665915, "grad_norm": 0.7788192543700412, "learning_rate": 1.9994755690455154e-05, "loss": 0.5937, "step": 16 }, { "epoch": 0.010948317501207535, "grad_norm": 1.026623484163182, "learning_rate": 1.9994079732889404e-05, "loss": 0.6336, "step": 17 }, { "epoch": 0.011592336177749154, "grad_norm": 0.8075263958893655, "learning_rate": 1.9993362824845878e-05, "loss": 0.6701, "step": 18 }, { "epoch": 0.012236354854290775, "grad_norm": 0.8645740424983249, "learning_rate": 1.9992604969262078e-05, "loss": 0.6488, "step": 19 }, { "epoch": 0.012880373530832394, "grad_norm": 0.8510133651233472, "learning_rate": 1.9991806169243302e-05, "loss": 0.614, "step": 20 }, { "epoch": 0.013524392207374013, "grad_norm": 17.778364636169115, "learning_rate": 1.999096642806261e-05, "loss": 0.6935, "step": 21 }, { "epoch": 0.014168410883915633, "grad_norm": 1.4772188941749318, "learning_rate": 1.999008574916082e-05, "loss": 0.6008, "step": 22 }, { "epoch": 0.014812429560457254, "grad_norm": 0.9278920763041417, "learning_rate": 1.9989164136146492e-05, "loss": 0.6342, "step": 23 }, { "epoch": 0.015456448236998873, "grad_norm": 1.0696012132310473, "learning_rate": 1.998820159279591e-05, "loss": 0.6445, "step": 24 }, { "epoch": 0.016100466913540494, "grad_norm": 1.0833844338954894, "learning_rate": 1.9987198123053066e-05, "loss": 0.6114, "step": 25 }, { "epoch": 0.01674448559008211, "grad_norm": 0.7384542144445283, "learning_rate": 1.9986153731029657e-05, "loss": 0.5965, "step": 26 }, { "epoch": 0.017388504266623733, "grad_norm": 0.957535094810883, "learning_rate": 1.998506842100505e-05, "loss": 0.62, "step": 27 }, { "epoch": 0.01803252294316535, "grad_norm": 1.193437353859679, "learning_rate": 1.9983942197426272e-05, "loss": 0.6309, "step": 28 }, { "epoch": 0.01867654161970697, "grad_norm": 0.6592153205375578, "learning_rate": 1.9982775064907995e-05, "loss": 0.6135, "step": 29 }, { "epoch": 0.019320560296248592, "grad_norm": 0.8670054606720032, "learning_rate": 1.9981567028232514e-05, "loss": 0.6185, "step": 30 }, { "epoch": 0.01996457897279021, "grad_norm": 0.7787671609774228, "learning_rate": 1.9980318092349727e-05, "loss": 0.599, "step": 31 }, { "epoch": 0.02060859764933183, "grad_norm": 0.7192909287860376, "learning_rate": 1.997902826237712e-05, "loss": 0.613, "step": 32 }, { "epoch": 0.02125261632587345, "grad_norm": 0.9238835006976016, "learning_rate": 1.9977697543599727e-05, "loss": 0.6066, "step": 33 }, { "epoch": 0.02189663500241507, "grad_norm": 0.7118084666903962, "learning_rate": 1.9976325941470147e-05, "loss": 0.6212, "step": 34 }, { "epoch": 0.02254065367895669, "grad_norm": 0.8248067351714442, "learning_rate": 1.9974913461608473e-05, "loss": 0.5894, "step": 35 }, { "epoch": 0.023184672355498308, "grad_norm": 0.9546840817827269, "learning_rate": 1.9973460109802306e-05, "loss": 0.6077, "step": 36 }, { "epoch": 0.02382869103203993, "grad_norm": 0.588465741568633, "learning_rate": 1.997196589200672e-05, "loss": 0.596, "step": 37 }, { "epoch": 0.02447270970858155, "grad_norm": 0.9847890931975947, "learning_rate": 1.997043081434423e-05, "loss": 0.6019, "step": 38 }, { "epoch": 0.025116728385123167, "grad_norm": 0.7469026168371792, "learning_rate": 1.9968854883104776e-05, "loss": 0.59, "step": 39 }, { "epoch": 0.02576074706166479, "grad_norm": 0.7117519471733281, "learning_rate": 1.9967238104745695e-05, "loss": 0.6108, "step": 40 }, { "epoch": 0.02640476573820641, "grad_norm": 0.7529751201917951, "learning_rate": 1.9965580485891695e-05, "loss": 0.5968, "step": 41 }, { "epoch": 0.027048784414748027, "grad_norm": 0.6347795206489663, "learning_rate": 1.9963882033334827e-05, "loss": 0.6026, "step": 42 }, { "epoch": 0.027692803091289648, "grad_norm": 0.6117524684761728, "learning_rate": 1.996214275403445e-05, "loss": 0.5978, "step": 43 }, { "epoch": 0.028336821767831265, "grad_norm": 0.6881938522552326, "learning_rate": 1.996036265511722e-05, "loss": 0.6197, "step": 44 }, { "epoch": 0.028980840444372886, "grad_norm": 0.5795370784281334, "learning_rate": 1.995854174387704e-05, "loss": 0.6034, "step": 45 }, { "epoch": 0.029624859120914507, "grad_norm": 0.5439355495770238, "learning_rate": 1.9956680027775054e-05, "loss": 0.6017, "step": 46 }, { "epoch": 0.030268877797456125, "grad_norm": 0.6778328871568495, "learning_rate": 1.995477751443959e-05, "loss": 0.581, "step": 47 }, { "epoch": 0.030912896473997746, "grad_norm": 0.7047698223452711, "learning_rate": 1.995283421166614e-05, "loss": 0.584, "step": 48 }, { "epoch": 0.031556915150539364, "grad_norm": 0.552606802042432, "learning_rate": 1.9950850127417346e-05, "loss": 0.5828, "step": 49 }, { "epoch": 0.03220093382708099, "grad_norm": 0.8233714589287299, "learning_rate": 1.9948825269822934e-05, "loss": 0.5831, "step": 50 }, { "epoch": 0.032844952503622606, "grad_norm": 0.6276311166950466, "learning_rate": 1.9946759647179712e-05, "loss": 0.6143, "step": 51 }, { "epoch": 0.03348897118016422, "grad_norm": 0.9507992004398211, "learning_rate": 1.9944653267951507e-05, "loss": 0.6001, "step": 52 }, { "epoch": 0.03413298985670585, "grad_norm": 0.5903494244164796, "learning_rate": 1.9942506140769155e-05, "loss": 0.5763, "step": 53 }, { "epoch": 0.034777008533247465, "grad_norm": 0.778191822741083, "learning_rate": 1.994031827443045e-05, "loss": 0.583, "step": 54 }, { "epoch": 0.03542102720978908, "grad_norm": 0.6984829525707364, "learning_rate": 1.993808967790012e-05, "loss": 0.5865, "step": 55 }, { "epoch": 0.0360650458863307, "grad_norm": 0.6508511428732785, "learning_rate": 1.993582036030978e-05, "loss": 0.5992, "step": 56 }, { "epoch": 0.036709064562872325, "grad_norm": 0.5980558104272059, "learning_rate": 1.9933510330957896e-05, "loss": 0.5752, "step": 57 }, { "epoch": 0.03735308323941394, "grad_norm": 0.586877195289109, "learning_rate": 1.9931159599309757e-05, "loss": 0.612, "step": 58 }, { "epoch": 0.03799710191595556, "grad_norm": 0.5251942102198216, "learning_rate": 1.992876817499742e-05, "loss": 0.5778, "step": 59 }, { "epoch": 0.038641120592497184, "grad_norm": 0.5912919535197637, "learning_rate": 1.9926336067819686e-05, "loss": 0.5604, "step": 60 }, { "epoch": 0.0392851392690388, "grad_norm": 0.5279009889168285, "learning_rate": 1.9923863287742045e-05, "loss": 0.551, "step": 61 }, { "epoch": 0.03992915794558042, "grad_norm": 0.5655078766919017, "learning_rate": 1.9921349844896655e-05, "loss": 0.5793, "step": 62 }, { "epoch": 0.040573176622122044, "grad_norm": 0.777387459072752, "learning_rate": 1.9918795749582277e-05, "loss": 0.571, "step": 63 }, { "epoch": 0.04121719529866366, "grad_norm": 0.5578533074900993, "learning_rate": 1.9916201012264255e-05, "loss": 0.5824, "step": 64 }, { "epoch": 0.04186121397520528, "grad_norm": 0.613498671949401, "learning_rate": 1.9913565643574447e-05, "loss": 0.5618, "step": 65 }, { "epoch": 0.0425052326517469, "grad_norm": 0.5283413662435674, "learning_rate": 1.991088965431121e-05, "loss": 0.557, "step": 66 }, { "epoch": 0.04314925132828852, "grad_norm": 0.5596341273863153, "learning_rate": 1.9908173055439343e-05, "loss": 0.6016, "step": 67 }, { "epoch": 0.04379327000483014, "grad_norm": 0.6897254601313572, "learning_rate": 1.9905415858090036e-05, "loss": 0.5873, "step": 68 }, { "epoch": 0.04443728868137176, "grad_norm": 0.4818791863966756, "learning_rate": 1.990261807356083e-05, "loss": 0.581, "step": 69 }, { "epoch": 0.04508130735791338, "grad_norm": 0.6507113845158146, "learning_rate": 1.9899779713315577e-05, "loss": 0.5581, "step": 70 }, { "epoch": 0.045725326034455, "grad_norm": 0.6236955677973838, "learning_rate": 1.9896900788984383e-05, "loss": 0.5991, "step": 71 }, { "epoch": 0.046369344710996616, "grad_norm": 0.6574482718391559, "learning_rate": 1.9893981312363563e-05, "loss": 0.5996, "step": 72 }, { "epoch": 0.04701336338753824, "grad_norm": 0.6905671775461695, "learning_rate": 1.9891021295415592e-05, "loss": 0.5758, "step": 73 }, { "epoch": 0.04765738206407986, "grad_norm": 0.566018146330249, "learning_rate": 1.9888020750269067e-05, "loss": 0.5805, "step": 74 }, { "epoch": 0.048301400740621475, "grad_norm": 0.5847944193208381, "learning_rate": 1.988497968921864e-05, "loss": 0.582, "step": 75 }, { "epoch": 0.0489454194171631, "grad_norm": 0.5927570434946263, "learning_rate": 1.988189812472498e-05, "loss": 0.5865, "step": 76 }, { "epoch": 0.04958943809370472, "grad_norm": 0.7069539630742576, "learning_rate": 1.9878776069414714e-05, "loss": 0.5954, "step": 77 }, { "epoch": 0.050233456770246335, "grad_norm": 0.5032783057175374, "learning_rate": 1.987561353608038e-05, "loss": 0.5766, "step": 78 }, { "epoch": 0.05087747544678796, "grad_norm": 0.6368709390571496, "learning_rate": 1.9872410537680378e-05, "loss": 0.5786, "step": 79 }, { "epoch": 0.05152149412332958, "grad_norm": 0.5607050960715791, "learning_rate": 1.9869167087338908e-05, "loss": 0.5776, "step": 80 }, { "epoch": 0.052165512799871194, "grad_norm": 0.6270355682423323, "learning_rate": 1.986588319834592e-05, "loss": 0.5574, "step": 81 }, { "epoch": 0.05280953147641282, "grad_norm": 0.6114417634603536, "learning_rate": 1.9862558884157067e-05, "loss": 0.5851, "step": 82 }, { "epoch": 0.053453550152954436, "grad_norm": 0.5784915073317745, "learning_rate": 1.985919415839364e-05, "loss": 0.6043, "step": 83 }, { "epoch": 0.054097568829496054, "grad_norm": 0.5786221081268356, "learning_rate": 1.9855789034842504e-05, "loss": 0.5787, "step": 84 }, { "epoch": 0.05474158750603768, "grad_norm": 0.5429329884670379, "learning_rate": 1.9852343527456074e-05, "loss": 0.5725, "step": 85 }, { "epoch": 0.055385606182579296, "grad_norm": 0.5989649329817042, "learning_rate": 1.9848857650352213e-05, "loss": 0.5746, "step": 86 }, { "epoch": 0.05602962485912091, "grad_norm": 0.5527838189309078, "learning_rate": 1.9845331417814223e-05, "loss": 0.6049, "step": 87 }, { "epoch": 0.05667364353566253, "grad_norm": 0.6089112006219993, "learning_rate": 1.9841764844290744e-05, "loss": 0.5785, "step": 88 }, { "epoch": 0.057317662212204155, "grad_norm": 0.6909602150718357, "learning_rate": 1.9838157944395713e-05, "loss": 0.5759, "step": 89 }, { "epoch": 0.05796168088874577, "grad_norm": 0.5461315778239271, "learning_rate": 1.9834510732908314e-05, "loss": 0.5833, "step": 90 }, { "epoch": 0.05860569956528739, "grad_norm": 0.7546732380559578, "learning_rate": 1.9830823224772894e-05, "loss": 0.5765, "step": 91 }, { "epoch": 0.059249718241829015, "grad_norm": 0.5720394032917774, "learning_rate": 1.9827095435098926e-05, "loss": 0.5756, "step": 92 }, { "epoch": 0.05989373691837063, "grad_norm": 0.6187074844687228, "learning_rate": 1.9823327379160923e-05, "loss": 0.5957, "step": 93 }, { "epoch": 0.06053775559491225, "grad_norm": 0.7774517970952206, "learning_rate": 1.9819519072398397e-05, "loss": 0.5748, "step": 94 }, { "epoch": 0.061181774271453875, "grad_norm": 0.6837144818603169, "learning_rate": 1.9815670530415788e-05, "loss": 0.5418, "step": 95 }, { "epoch": 0.06182579294799549, "grad_norm": 0.7669262212036821, "learning_rate": 1.9811781768982392e-05, "loss": 0.5687, "step": 96 }, { "epoch": 0.06246981162453711, "grad_norm": 0.7588022794082739, "learning_rate": 1.9807852804032306e-05, "loss": 0.5931, "step": 97 }, { "epoch": 0.06311383030107873, "grad_norm": 0.7102567149711397, "learning_rate": 1.980388365166436e-05, "loss": 0.5837, "step": 98 }, { "epoch": 0.06375784897762035, "grad_norm": 0.7508160170395797, "learning_rate": 1.9799874328142053e-05, "loss": 0.5825, "step": 99 }, { "epoch": 0.06440186765416198, "grad_norm": 0.5907489569425378, "learning_rate": 1.9795824849893483e-05, "loss": 0.567, "step": 100 }, { "epoch": 0.06504588633070359, "grad_norm": 2.8305586554904485, "learning_rate": 1.9791735233511274e-05, "loss": 0.5744, "step": 101 }, { "epoch": 0.06568990500724521, "grad_norm": 0.7891211361134949, "learning_rate": 1.9787605495752528e-05, "loss": 0.5907, "step": 102 }, { "epoch": 0.06633392368378684, "grad_norm": 0.9049075119469284, "learning_rate": 1.9783435653538733e-05, "loss": 0.5904, "step": 103 }, { "epoch": 0.06697794236032845, "grad_norm": 0.7392610917880624, "learning_rate": 1.977922572395571e-05, "loss": 0.5553, "step": 104 }, { "epoch": 0.06762196103687007, "grad_norm": 0.9285627976511761, "learning_rate": 1.977497572425353e-05, "loss": 0.5845, "step": 105 }, { "epoch": 0.0682659797134117, "grad_norm": 0.7414992808016694, "learning_rate": 1.977068567184646e-05, "loss": 0.5681, "step": 106 }, { "epoch": 0.0689099983899533, "grad_norm": 0.752784202157923, "learning_rate": 1.9766355584312866e-05, "loss": 0.5638, "step": 107 }, { "epoch": 0.06955401706649493, "grad_norm": 0.6833260451735423, "learning_rate": 1.976198547939518e-05, "loss": 0.5536, "step": 108 }, { "epoch": 0.07019803574303655, "grad_norm": 0.7263266258506763, "learning_rate": 1.9757575374999785e-05, "loss": 0.5688, "step": 109 }, { "epoch": 0.07084205441957817, "grad_norm": 0.8741464982339473, "learning_rate": 1.975312528919697e-05, "loss": 0.5691, "step": 110 }, { "epoch": 0.07148607309611979, "grad_norm": 0.5739241042271949, "learning_rate": 1.9748635240220846e-05, "loss": 0.572, "step": 111 }, { "epoch": 0.0721300917726614, "grad_norm": 0.76160166182381, "learning_rate": 1.9744105246469264e-05, "loss": 0.5593, "step": 112 }, { "epoch": 0.07277411044920302, "grad_norm": 0.5685728950161864, "learning_rate": 1.9739535326503753e-05, "loss": 0.5712, "step": 113 }, { "epoch": 0.07341812912574465, "grad_norm": 0.8119489174701773, "learning_rate": 1.9734925499049446e-05, "loss": 0.5631, "step": 114 }, { "epoch": 0.07406214780228626, "grad_norm": 0.6006008240803711, "learning_rate": 1.9730275782994984e-05, "loss": 0.5637, "step": 115 }, { "epoch": 0.07470616647882788, "grad_norm": 0.6071416288910575, "learning_rate": 1.972558619739246e-05, "loss": 0.5552, "step": 116 }, { "epoch": 0.07535018515536951, "grad_norm": 0.5900533139156463, "learning_rate": 1.9720856761457326e-05, "loss": 0.5684, "step": 117 }, { "epoch": 0.07599420383191112, "grad_norm": 0.7084388577870796, "learning_rate": 1.9716087494568318e-05, "loss": 0.5635, "step": 118 }, { "epoch": 0.07663822250845274, "grad_norm": 0.6002444172502165, "learning_rate": 1.9711278416267385e-05, "loss": 0.5564, "step": 119 }, { "epoch": 0.07728224118499437, "grad_norm": 0.5349947061635673, "learning_rate": 1.9706429546259592e-05, "loss": 0.5658, "step": 120 }, { "epoch": 0.07792625986153598, "grad_norm": 0.7148148922371023, "learning_rate": 1.9701540904413067e-05, "loss": 0.564, "step": 121 }, { "epoch": 0.0785702785380776, "grad_norm": 0.5057511556323521, "learning_rate": 1.9696612510758878e-05, "loss": 0.5509, "step": 122 }, { "epoch": 0.07921429721461923, "grad_norm": 0.6470696610775742, "learning_rate": 1.9691644385490996e-05, "loss": 0.5564, "step": 123 }, { "epoch": 0.07985831589116084, "grad_norm": 0.5285733506117851, "learning_rate": 1.9686636548966177e-05, "loss": 0.5678, "step": 124 }, { "epoch": 0.08050233456770246, "grad_norm": 0.7558456025478053, "learning_rate": 1.968158902170391e-05, "loss": 0.595, "step": 125 }, { "epoch": 0.08114635324424409, "grad_norm": 0.6102554420287217, "learning_rate": 1.9676501824386295e-05, "loss": 0.5833, "step": 126 }, { "epoch": 0.0817903719207857, "grad_norm": 0.7620095772555008, "learning_rate": 1.9671374977857987e-05, "loss": 0.5863, "step": 127 }, { "epoch": 0.08243439059732732, "grad_norm": 0.6161497476986848, "learning_rate": 1.9666208503126115e-05, "loss": 0.5553, "step": 128 }, { "epoch": 0.08307840927386895, "grad_norm": 0.737499082486554, "learning_rate": 1.9661002421360164e-05, "loss": 0.5886, "step": 129 }, { "epoch": 0.08372242795041056, "grad_norm": 0.7083570445698253, "learning_rate": 1.9655756753891916e-05, "loss": 0.563, "step": 130 }, { "epoch": 0.08436644662695218, "grad_norm": 0.7243796865762516, "learning_rate": 1.965047152221536e-05, "loss": 0.5656, "step": 131 }, { "epoch": 0.0850104653034938, "grad_norm": 0.7152858056483491, "learning_rate": 1.964514674798659e-05, "loss": 0.5638, "step": 132 }, { "epoch": 0.08565448398003542, "grad_norm": 0.5218971341037003, "learning_rate": 1.963978245302373e-05, "loss": 0.5901, "step": 133 }, { "epoch": 0.08629850265657704, "grad_norm": 0.6187821203264378, "learning_rate": 1.9634378659306834e-05, "loss": 0.5486, "step": 134 }, { "epoch": 0.08694252133311867, "grad_norm": 0.6397509754118936, "learning_rate": 1.9628935388977804e-05, "loss": 0.5336, "step": 135 }, { "epoch": 0.08758654000966028, "grad_norm": 0.6938710519485553, "learning_rate": 1.9623452664340305e-05, "loss": 0.5584, "step": 136 }, { "epoch": 0.0882305586862019, "grad_norm": 0.6916395175841887, "learning_rate": 1.9617930507859643e-05, "loss": 0.576, "step": 137 }, { "epoch": 0.08887457736274353, "grad_norm": 0.7034191344807731, "learning_rate": 1.9612368942162717e-05, "loss": 0.5661, "step": 138 }, { "epoch": 0.08951859603928514, "grad_norm": 0.6477515649146067, "learning_rate": 1.9606767990037896e-05, "loss": 0.5697, "step": 139 }, { "epoch": 0.09016261471582676, "grad_norm": 0.5583068592213097, "learning_rate": 1.960112767443493e-05, "loss": 0.5612, "step": 140 }, { "epoch": 0.09080663339236839, "grad_norm": 0.6194438917970964, "learning_rate": 1.959544801846486e-05, "loss": 0.5547, "step": 141 }, { "epoch": 0.09145065206891, "grad_norm": 0.6172764283023002, "learning_rate": 1.9589729045399935e-05, "loss": 0.5718, "step": 142 }, { "epoch": 0.09209467074545162, "grad_norm": 0.5373185855008699, "learning_rate": 1.9583970778673487e-05, "loss": 0.5806, "step": 143 }, { "epoch": 0.09273868942199323, "grad_norm": 0.6244971622553643, "learning_rate": 1.957817324187987e-05, "loss": 0.5376, "step": 144 }, { "epoch": 0.09338270809853486, "grad_norm": 0.522662205822613, "learning_rate": 1.9572336458774336e-05, "loss": 0.5705, "step": 145 }, { "epoch": 0.09402672677507648, "grad_norm": 0.650718040264212, "learning_rate": 1.9566460453272945e-05, "loss": 0.5735, "step": 146 }, { "epoch": 0.09467074545161809, "grad_norm": 0.504328757176255, "learning_rate": 1.9560545249452477e-05, "loss": 0.5751, "step": 147 }, { "epoch": 0.09531476412815972, "grad_norm": 0.5417081955771302, "learning_rate": 1.955459087155033e-05, "loss": 0.5976, "step": 148 }, { "epoch": 0.09595878280470134, "grad_norm": 0.4837609276303342, "learning_rate": 1.9548597343964396e-05, "loss": 0.5635, "step": 149 }, { "epoch": 0.09660280148124295, "grad_norm": 0.557953872466771, "learning_rate": 1.954256469125301e-05, "loss": 0.5721, "step": 150 }, { "epoch": 0.09724682015778457, "grad_norm": 0.5285805885395818, "learning_rate": 1.9536492938134796e-05, "loss": 0.5575, "step": 151 }, { "epoch": 0.0978908388343262, "grad_norm": 0.6046225007824951, "learning_rate": 1.953038210948861e-05, "loss": 0.5569, "step": 152 }, { "epoch": 0.09853485751086781, "grad_norm": 0.5754312743451351, "learning_rate": 1.9524232230353408e-05, "loss": 0.5425, "step": 153 }, { "epoch": 0.09917887618740943, "grad_norm": 0.5385413722731617, "learning_rate": 1.9518043325928157e-05, "loss": 0.5689, "step": 154 }, { "epoch": 0.09982289486395106, "grad_norm": 1.3141042662478268, "learning_rate": 1.9511815421571733e-05, "loss": 0.5513, "step": 155 }, { "epoch": 0.10046691354049267, "grad_norm": 0.5388330745566448, "learning_rate": 1.9505548542802805e-05, "loss": 0.5208, "step": 156 }, { "epoch": 0.1011109322170343, "grad_norm": 0.5030120890668549, "learning_rate": 1.9499242715299743e-05, "loss": 0.5516, "step": 157 }, { "epoch": 0.10175495089357592, "grad_norm": 0.6328002213984539, "learning_rate": 1.9492897964900512e-05, "loss": 0.5616, "step": 158 }, { "epoch": 0.10239896957011753, "grad_norm": 0.5297596246265618, "learning_rate": 1.9486514317602555e-05, "loss": 0.5404, "step": 159 }, { "epoch": 0.10304298824665915, "grad_norm": 0.5360517020297144, "learning_rate": 1.9480091799562706e-05, "loss": 0.548, "step": 160 }, { "epoch": 0.10368700692320078, "grad_norm": 0.518560345271594, "learning_rate": 1.9473630437097056e-05, "loss": 0.5608, "step": 161 }, { "epoch": 0.10433102559974239, "grad_norm": 0.5473158164513207, "learning_rate": 1.9467130256680867e-05, "loss": 0.5671, "step": 162 }, { "epoch": 0.10497504427628401, "grad_norm": 0.5367259226283068, "learning_rate": 1.9460591284948463e-05, "loss": 0.5572, "step": 163 }, { "epoch": 0.10561906295282564, "grad_norm": 0.6040336970919068, "learning_rate": 1.9454013548693103e-05, "loss": 0.5565, "step": 164 }, { "epoch": 0.10626308162936725, "grad_norm": 0.9998183725591161, "learning_rate": 1.944739707486689e-05, "loss": 0.56, "step": 165 }, { "epoch": 0.10690710030590887, "grad_norm": 0.5285711635066909, "learning_rate": 1.9440741890580643e-05, "loss": 0.5679, "step": 166 }, { "epoch": 0.1075511189824505, "grad_norm": 0.579143186197226, "learning_rate": 1.943404802310381e-05, "loss": 0.5609, "step": 167 }, { "epoch": 0.10819513765899211, "grad_norm": 0.5481068617327515, "learning_rate": 1.9427315499864345e-05, "loss": 0.5708, "step": 168 }, { "epoch": 0.10883915633553373, "grad_norm": 0.6819244737525157, "learning_rate": 1.942054434844857e-05, "loss": 0.5687, "step": 169 }, { "epoch": 0.10948317501207536, "grad_norm": 0.5172578610589544, "learning_rate": 1.9413734596601104e-05, "loss": 0.5595, "step": 170 }, { "epoch": 0.11012719368861697, "grad_norm": 0.6235526728821736, "learning_rate": 1.9406886272224724e-05, "loss": 0.5639, "step": 171 }, { "epoch": 0.11077121236515859, "grad_norm": 0.5171737604614556, "learning_rate": 1.9399999403380266e-05, "loss": 0.5635, "step": 172 }, { "epoch": 0.11141523104170022, "grad_norm": 0.5917763564672169, "learning_rate": 1.939307401828648e-05, "loss": 0.5615, "step": 173 }, { "epoch": 0.11205924971824183, "grad_norm": 0.5366633704787982, "learning_rate": 1.9386110145319962e-05, "loss": 0.5682, "step": 174 }, { "epoch": 0.11270326839478345, "grad_norm": 0.5751690590687346, "learning_rate": 1.937910781301499e-05, "loss": 0.5515, "step": 175 }, { "epoch": 0.11334728707132506, "grad_norm": 0.6245137327067746, "learning_rate": 1.937206705006344e-05, "loss": 0.5496, "step": 176 }, { "epoch": 0.11399130574786669, "grad_norm": 0.5396954585266718, "learning_rate": 1.9364987885314645e-05, "loss": 0.5604, "step": 177 }, { "epoch": 0.11463532442440831, "grad_norm": 0.5341129761149012, "learning_rate": 1.93578703477753e-05, "loss": 0.549, "step": 178 }, { "epoch": 0.11527934310094992, "grad_norm": 0.6047678202875356, "learning_rate": 1.935071446660933e-05, "loss": 0.5817, "step": 179 }, { "epoch": 0.11592336177749155, "grad_norm": 0.5732712448621166, "learning_rate": 1.9343520271137764e-05, "loss": 0.5444, "step": 180 }, { "epoch": 0.11656738045403317, "grad_norm": 0.6314374471294412, "learning_rate": 1.933628779083863e-05, "loss": 0.5605, "step": 181 }, { "epoch": 0.11721139913057478, "grad_norm": 0.5942664149914738, "learning_rate": 1.932901705534683e-05, "loss": 0.5536, "step": 182 }, { "epoch": 0.1178554178071164, "grad_norm": 0.6669076648644066, "learning_rate": 1.9321708094454003e-05, "loss": 0.5385, "step": 183 }, { "epoch": 0.11849943648365803, "grad_norm": 0.593543293145718, "learning_rate": 1.9314360938108427e-05, "loss": 0.5604, "step": 184 }, { "epoch": 0.11914345516019964, "grad_norm": 0.6813668165694591, "learning_rate": 1.9306975616414876e-05, "loss": 0.5554, "step": 185 }, { "epoch": 0.11978747383674127, "grad_norm": 0.5345635870918699, "learning_rate": 1.929955215963452e-05, "loss": 0.5257, "step": 186 }, { "epoch": 0.12043149251328289, "grad_norm": 0.8362235231873847, "learning_rate": 1.9292090598184768e-05, "loss": 0.5549, "step": 187 }, { "epoch": 0.1210755111898245, "grad_norm": 0.6011565195448995, "learning_rate": 1.928459096263918e-05, "loss": 0.5544, "step": 188 }, { "epoch": 0.12171952986636612, "grad_norm": 0.7408493460960589, "learning_rate": 1.9277053283727306e-05, "loss": 0.5671, "step": 189 }, { "epoch": 0.12236354854290775, "grad_norm": 0.6561870144717391, "learning_rate": 1.926947759233459e-05, "loss": 0.5734, "step": 190 }, { "epoch": 0.12300756721944936, "grad_norm": 0.6682288982785446, "learning_rate": 1.9261863919502228e-05, "loss": 0.5504, "step": 191 }, { "epoch": 0.12365158589599098, "grad_norm": 0.6721670352164001, "learning_rate": 1.9254212296427043e-05, "loss": 0.5776, "step": 192 }, { "epoch": 0.12429560457253261, "grad_norm": 0.5797265594192421, "learning_rate": 1.924652275446136e-05, "loss": 0.5279, "step": 193 }, { "epoch": 0.12493962324907422, "grad_norm": 0.6437235570461523, "learning_rate": 1.9238795325112867e-05, "loss": 0.5613, "step": 194 }, { "epoch": 0.12558364192561583, "grad_norm": 0.48419631975695027, "learning_rate": 1.9231030040044514e-05, "loss": 0.5296, "step": 195 }, { "epoch": 0.12622766060215745, "grad_norm": 0.6294886316095005, "learning_rate": 1.922322693107434e-05, "loss": 0.5466, "step": 196 }, { "epoch": 0.12687167927869908, "grad_norm": 0.5433542034860996, "learning_rate": 1.9215386030175383e-05, "loss": 0.5428, "step": 197 }, { "epoch": 0.1275156979552407, "grad_norm": 0.5960822553891673, "learning_rate": 1.920750736947553e-05, "loss": 0.5788, "step": 198 }, { "epoch": 0.12815971663178233, "grad_norm": 0.5191972193625295, "learning_rate": 1.919959098125738e-05, "loss": 0.5576, "step": 199 }, { "epoch": 0.12880373530832395, "grad_norm": 0.6038757075662061, "learning_rate": 1.9191636897958123e-05, "loss": 0.5607, "step": 200 }, { "epoch": 0.12944775398486555, "grad_norm": 0.5438484997977986, "learning_rate": 1.918364515216941e-05, "loss": 0.538, "step": 201 }, { "epoch": 0.13009177266140717, "grad_norm": 0.48158797265977493, "learning_rate": 1.9175615776637212e-05, "loss": 0.5567, "step": 202 }, { "epoch": 0.1307357913379488, "grad_norm": 0.5982841588336144, "learning_rate": 1.9167548804261677e-05, "loss": 0.5403, "step": 203 }, { "epoch": 0.13137981001449042, "grad_norm": 1.7207949499523336, "learning_rate": 1.9159444268097012e-05, "loss": 0.5501, "step": 204 }, { "epoch": 0.13202382869103205, "grad_norm": 0.621656157970725, "learning_rate": 1.915130220135134e-05, "loss": 0.5518, "step": 205 }, { "epoch": 0.13266784736757367, "grad_norm": 0.6001837977100303, "learning_rate": 1.9143122637386567e-05, "loss": 0.5592, "step": 206 }, { "epoch": 0.13331186604411527, "grad_norm": 0.5028536673168801, "learning_rate": 1.9134905609718235e-05, "loss": 0.5501, "step": 207 }, { "epoch": 0.1339558847206569, "grad_norm": 0.5653101603369096, "learning_rate": 1.9126651152015404e-05, "loss": 0.5733, "step": 208 }, { "epoch": 0.13459990339719852, "grad_norm": 0.5540120400277606, "learning_rate": 1.9118359298100494e-05, "loss": 0.5455, "step": 209 }, { "epoch": 0.13524392207374014, "grad_norm": 0.5190257854619702, "learning_rate": 1.9110030081949157e-05, "loss": 0.5498, "step": 210 }, { "epoch": 0.13588794075028177, "grad_norm": 0.5113247053237887, "learning_rate": 1.9101663537690134e-05, "loss": 0.5642, "step": 211 }, { "epoch": 0.1365319594268234, "grad_norm": 0.5632236943994993, "learning_rate": 1.9093259699605125e-05, "loss": 0.5583, "step": 212 }, { "epoch": 0.137175978103365, "grad_norm": 0.5071810875554302, "learning_rate": 1.9084818602128627e-05, "loss": 0.5491, "step": 213 }, { "epoch": 0.1378199967799066, "grad_norm": 0.5241428656701698, "learning_rate": 1.907634027984782e-05, "loss": 0.5545, "step": 214 }, { "epoch": 0.13846401545644824, "grad_norm": 0.5321943135615345, "learning_rate": 1.9067824767502404e-05, "loss": 0.5498, "step": 215 }, { "epoch": 0.13910803413298986, "grad_norm": 0.5112773036879799, "learning_rate": 1.905927209998447e-05, "loss": 0.5532, "step": 216 }, { "epoch": 0.13975205280953149, "grad_norm": 0.5590726490855072, "learning_rate": 1.905068231233834e-05, "loss": 0.5389, "step": 217 }, { "epoch": 0.1403960714860731, "grad_norm": 0.5501097211968181, "learning_rate": 1.9042055439760447e-05, "loss": 0.5548, "step": 218 }, { "epoch": 0.1410400901626147, "grad_norm": 0.6195645653345545, "learning_rate": 1.9033391517599166e-05, "loss": 0.5588, "step": 219 }, { "epoch": 0.14168410883915633, "grad_norm": 0.9947028770136911, "learning_rate": 1.90246905813547e-05, "loss": 0.5604, "step": 220 }, { "epoch": 0.14232812751569796, "grad_norm": 0.6202136156988654, "learning_rate": 1.9015952666678902e-05, "loss": 0.5226, "step": 221 }, { "epoch": 0.14297214619223958, "grad_norm": 0.578124130568257, "learning_rate": 1.900717780937514e-05, "loss": 0.5557, "step": 222 }, { "epoch": 0.1436161648687812, "grad_norm": 0.5075118049612316, "learning_rate": 1.8998366045398172e-05, "loss": 0.5379, "step": 223 }, { "epoch": 0.1442601835453228, "grad_norm": 0.6331197117570935, "learning_rate": 1.8989517410853956e-05, "loss": 0.5385, "step": 224 }, { "epoch": 0.14490420222186443, "grad_norm": 0.5206772392550988, "learning_rate": 1.8980631941999544e-05, "loss": 0.5395, "step": 225 }, { "epoch": 0.14554822089840605, "grad_norm": 0.5852926640651946, "learning_rate": 1.897170967524291e-05, "loss": 0.5377, "step": 226 }, { "epoch": 0.14619223957494767, "grad_norm": 0.5147005421702976, "learning_rate": 1.8962750647142808e-05, "loss": 0.5718, "step": 227 }, { "epoch": 0.1468362582514893, "grad_norm": 0.5838360909104077, "learning_rate": 1.8953754894408617e-05, "loss": 0.5579, "step": 228 }, { "epoch": 0.14748027692803092, "grad_norm": 0.5823733652135687, "learning_rate": 1.8944722453900195e-05, "loss": 0.5467, "step": 229 }, { "epoch": 0.14812429560457252, "grad_norm": 0.712247260770518, "learning_rate": 1.893565336262773e-05, "loss": 0.5644, "step": 230 }, { "epoch": 0.14876831428111414, "grad_norm": 0.4976892291094713, "learning_rate": 1.8926547657751585e-05, "loss": 0.5714, "step": 231 }, { "epoch": 0.14941233295765577, "grad_norm": 0.6571648163602213, "learning_rate": 1.8917405376582144e-05, "loss": 0.5367, "step": 232 }, { "epoch": 0.1500563516341974, "grad_norm": 0.5306232765588836, "learning_rate": 1.8908226556579665e-05, "loss": 0.549, "step": 233 }, { "epoch": 0.15070037031073902, "grad_norm": 0.541483246564227, "learning_rate": 1.8899011235354118e-05, "loss": 0.5776, "step": 234 }, { "epoch": 0.15134438898728064, "grad_norm": 0.5668079641200525, "learning_rate": 1.8889759450665036e-05, "loss": 0.539, "step": 235 }, { "epoch": 0.15198840766382224, "grad_norm": 0.48238285398921504, "learning_rate": 1.8880471240421365e-05, "loss": 0.5401, "step": 236 }, { "epoch": 0.15263242634036386, "grad_norm": 0.49647805349835483, "learning_rate": 1.8871146642681304e-05, "loss": 0.5244, "step": 237 }, { "epoch": 0.1532764450169055, "grad_norm": 0.5460829681110462, "learning_rate": 1.8861785695652142e-05, "loss": 0.5555, "step": 238 }, { "epoch": 0.1539204636934471, "grad_norm": 0.5352346718064093, "learning_rate": 1.8852388437690113e-05, "loss": 0.5576, "step": 239 }, { "epoch": 0.15456448236998874, "grad_norm": 0.6180561066212092, "learning_rate": 1.8842954907300236e-05, "loss": 0.5473, "step": 240 }, { "epoch": 0.15520850104653036, "grad_norm": 0.4847196074208255, "learning_rate": 1.883348514313615e-05, "loss": 0.5267, "step": 241 }, { "epoch": 0.15585251972307196, "grad_norm": 0.5444432027028921, "learning_rate": 1.8823979183999965e-05, "loss": 0.5702, "step": 242 }, { "epoch": 0.15649653839961358, "grad_norm": 0.6358972897506083, "learning_rate": 1.8814437068842096e-05, "loss": 0.5601, "step": 243 }, { "epoch": 0.1571405570761552, "grad_norm": 0.5004427960500281, "learning_rate": 1.880485883676111e-05, "loss": 0.5359, "step": 244 }, { "epoch": 0.15778457575269683, "grad_norm": 0.548999656691254, "learning_rate": 1.8795244527003557e-05, "loss": 0.5465, "step": 245 }, { "epoch": 0.15842859442923846, "grad_norm": 0.5474493327872936, "learning_rate": 1.878559417896382e-05, "loss": 0.5723, "step": 246 }, { "epoch": 0.15907261310578008, "grad_norm": 0.5292540699957294, "learning_rate": 1.8775907832183945e-05, "loss": 0.5339, "step": 247 }, { "epoch": 0.15971663178232168, "grad_norm": 0.550128527266154, "learning_rate": 1.876618552635348e-05, "loss": 0.5441, "step": 248 }, { "epoch": 0.1603606504588633, "grad_norm": 0.49922753180546464, "learning_rate": 1.8756427301309317e-05, "loss": 0.5541, "step": 249 }, { "epoch": 0.16100466913540493, "grad_norm": 0.5374035986912075, "learning_rate": 1.8746633197035525e-05, "loss": 0.5574, "step": 250 }, { "epoch": 0.16164868781194655, "grad_norm": 0.5231366173618106, "learning_rate": 1.873680325366319e-05, "loss": 0.5471, "step": 251 }, { "epoch": 0.16229270648848818, "grad_norm": 0.5239526267436165, "learning_rate": 1.8726937511470247e-05, "loss": 0.5594, "step": 252 }, { "epoch": 0.16293672516502977, "grad_norm": 0.6385732376066636, "learning_rate": 1.871703601088131e-05, "loss": 0.5701, "step": 253 }, { "epoch": 0.1635807438415714, "grad_norm": 0.48034920978799195, "learning_rate": 1.870709879246752e-05, "loss": 0.5519, "step": 254 }, { "epoch": 0.16422476251811302, "grad_norm": 0.6194153188567921, "learning_rate": 1.869712589694636e-05, "loss": 0.5542, "step": 255 }, { "epoch": 0.16486878119465465, "grad_norm": 0.4868628741241556, "learning_rate": 1.8687117365181514e-05, "loss": 0.5403, "step": 256 }, { "epoch": 0.16551279987119627, "grad_norm": 0.5551591067712492, "learning_rate": 1.8677073238182667e-05, "loss": 0.5274, "step": 257 }, { "epoch": 0.1661568185477379, "grad_norm": 5.589232078725256, "learning_rate": 1.8666993557105377e-05, "loss": 0.5435, "step": 258 }, { "epoch": 0.1668008372242795, "grad_norm": 0.6122796356689075, "learning_rate": 1.865687836325086e-05, "loss": 0.5289, "step": 259 }, { "epoch": 0.16744485590082112, "grad_norm": 0.5284876276533604, "learning_rate": 1.8646727698065865e-05, "loss": 0.5407, "step": 260 }, { "epoch": 0.16808887457736274, "grad_norm": 0.5727970702045417, "learning_rate": 1.8636541603142467e-05, "loss": 0.5675, "step": 261 }, { "epoch": 0.16873289325390436, "grad_norm": 0.5438968503045687, "learning_rate": 1.8626320120217922e-05, "loss": 0.5596, "step": 262 }, { "epoch": 0.169376911930446, "grad_norm": 0.46767907934537206, "learning_rate": 1.8616063291174492e-05, "loss": 0.5486, "step": 263 }, { "epoch": 0.1700209306069876, "grad_norm": 0.7170320635007718, "learning_rate": 1.8605771158039253e-05, "loss": 0.5353, "step": 264 }, { "epoch": 0.1706649492835292, "grad_norm": 0.4790420324558731, "learning_rate": 1.8595443762983958e-05, "loss": 0.5343, "step": 265 }, { "epoch": 0.17130896796007083, "grad_norm": 0.588410708348578, "learning_rate": 1.858508114832483e-05, "loss": 0.545, "step": 266 }, { "epoch": 0.17195298663661246, "grad_norm": 0.49111716261650495, "learning_rate": 1.8574683356522416e-05, "loss": 0.5566, "step": 267 }, { "epoch": 0.17259700531315408, "grad_norm": 0.6122980185846265, "learning_rate": 1.8564250430181387e-05, "loss": 0.5441, "step": 268 }, { "epoch": 0.1732410239896957, "grad_norm": 0.5111700201134556, "learning_rate": 1.8553782412050384e-05, "loss": 0.5482, "step": 269 }, { "epoch": 0.17388504266623733, "grad_norm": 0.7653659405925481, "learning_rate": 1.8543279345021834e-05, "loss": 0.5767, "step": 270 }, { "epoch": 0.17452906134277893, "grad_norm": 0.512437113758092, "learning_rate": 1.853274127213178e-05, "loss": 0.5449, "step": 271 }, { "epoch": 0.17517308001932055, "grad_norm": 0.5620380797978025, "learning_rate": 1.8522168236559693e-05, "loss": 0.5519, "step": 272 }, { "epoch": 0.17581709869586218, "grad_norm": 0.559549002571924, "learning_rate": 1.8511560281628312e-05, "loss": 0.5599, "step": 273 }, { "epoch": 0.1764611173724038, "grad_norm": 0.5821389278832888, "learning_rate": 1.850091745080345e-05, "loss": 0.566, "step": 274 }, { "epoch": 0.17710513604894543, "grad_norm": 0.6022428773286064, "learning_rate": 1.8490239787693825e-05, "loss": 0.5351, "step": 275 }, { "epoch": 0.17774915472548705, "grad_norm": 0.49811084211320755, "learning_rate": 1.847952733605088e-05, "loss": 0.5368, "step": 276 }, { "epoch": 0.17839317340202865, "grad_norm": 0.5406457831033232, "learning_rate": 1.8468780139768602e-05, "loss": 0.5165, "step": 277 }, { "epoch": 0.17903719207857027, "grad_norm": 0.513822381058114, "learning_rate": 1.8457998242883346e-05, "loss": 0.5264, "step": 278 }, { "epoch": 0.1796812107551119, "grad_norm": 0.6008125845935965, "learning_rate": 1.8447181689573646e-05, "loss": 0.5332, "step": 279 }, { "epoch": 0.18032522943165352, "grad_norm": 0.4953141363761313, "learning_rate": 1.8436330524160048e-05, "loss": 0.5592, "step": 280 }, { "epoch": 0.18096924810819515, "grad_norm": 0.5782697614611221, "learning_rate": 1.8425444791104912e-05, "loss": 0.5493, "step": 281 }, { "epoch": 0.18161326678473677, "grad_norm": 0.48867251583148913, "learning_rate": 1.8414524535012244e-05, "loss": 0.5413, "step": 282 }, { "epoch": 0.18225728546127837, "grad_norm": 0.725882679209439, "learning_rate": 1.84035698006275e-05, "loss": 0.5216, "step": 283 }, { "epoch": 0.18290130413782, "grad_norm": 0.540304926316168, "learning_rate": 1.8392580632837423e-05, "loss": 0.5578, "step": 284 }, { "epoch": 0.18354532281436162, "grad_norm": 0.6877906436715047, "learning_rate": 1.8381557076669837e-05, "loss": 0.554, "step": 285 }, { "epoch": 0.18418934149090324, "grad_norm": 0.6315811286000208, "learning_rate": 1.8370499177293463e-05, "loss": 0.537, "step": 286 }, { "epoch": 0.18483336016744487, "grad_norm": 8.623111455193973, "learning_rate": 1.8359406980017763e-05, "loss": 0.6096, "step": 287 }, { "epoch": 0.18547737884398646, "grad_norm": 1.2097343388340007, "learning_rate": 1.8348280530292712e-05, "loss": 0.5606, "step": 288 }, { "epoch": 0.1861213975205281, "grad_norm": 0.5852127433062314, "learning_rate": 1.8337119873708654e-05, "loss": 0.5338, "step": 289 }, { "epoch": 0.1867654161970697, "grad_norm": 0.9248620838745759, "learning_rate": 1.8325925055996076e-05, "loss": 0.5565, "step": 290 }, { "epoch": 0.18740943487361134, "grad_norm": 0.7393640615766283, "learning_rate": 1.8314696123025456e-05, "loss": 0.5214, "step": 291 }, { "epoch": 0.18805345355015296, "grad_norm": 0.7982278279237686, "learning_rate": 1.8303433120807043e-05, "loss": 0.5448, "step": 292 }, { "epoch": 0.18869747222669458, "grad_norm": 0.6497337429061663, "learning_rate": 1.8292136095490692e-05, "loss": 0.5446, "step": 293 }, { "epoch": 0.18934149090323618, "grad_norm": 0.6349642246097151, "learning_rate": 1.8280805093365674e-05, "loss": 0.5425, "step": 294 }, { "epoch": 0.1899855095797778, "grad_norm": 0.7357365332469622, "learning_rate": 1.826944016086046e-05, "loss": 0.5439, "step": 295 }, { "epoch": 0.19062952825631943, "grad_norm": 0.5099807497736208, "learning_rate": 1.8258041344542567e-05, "loss": 0.5462, "step": 296 }, { "epoch": 0.19127354693286105, "grad_norm": 0.6941622270836214, "learning_rate": 1.8246608691118343e-05, "loss": 0.5474, "step": 297 }, { "epoch": 0.19191756560940268, "grad_norm": 0.5186879348369444, "learning_rate": 1.8235142247432784e-05, "loss": 0.5472, "step": 298 }, { "epoch": 0.1925615842859443, "grad_norm": 0.8184234173251932, "learning_rate": 1.8223642060469338e-05, "loss": 0.571, "step": 299 }, { "epoch": 0.1932056029624859, "grad_norm": 0.5769914140512192, "learning_rate": 1.8212108177349722e-05, "loss": 0.5402, "step": 300 }, { "epoch": 0.19384962163902753, "grad_norm": 0.751118694804033, "learning_rate": 1.820054064533371e-05, "loss": 0.5638, "step": 301 }, { "epoch": 0.19449364031556915, "grad_norm": 0.646964890495266, "learning_rate": 1.8188939511818965e-05, "loss": 0.518, "step": 302 }, { "epoch": 0.19513765899211077, "grad_norm": 0.5850081660573739, "learning_rate": 1.8177304824340823e-05, "loss": 0.5412, "step": 303 }, { "epoch": 0.1957816776686524, "grad_norm": 0.6984552138841986, "learning_rate": 1.816563663057211e-05, "loss": 0.5383, "step": 304 }, { "epoch": 0.19642569634519402, "grad_norm": 0.6678845070901094, "learning_rate": 1.815393497832294e-05, "loss": 0.5414, "step": 305 }, { "epoch": 0.19706971502173562, "grad_norm": 0.6974005850421262, "learning_rate": 1.814219991554053e-05, "loss": 0.5263, "step": 306 }, { "epoch": 0.19771373369827724, "grad_norm": 0.7590011091891484, "learning_rate": 1.813043149030898e-05, "loss": 0.5669, "step": 307 }, { "epoch": 0.19835775237481887, "grad_norm": 0.661253595310937, "learning_rate": 1.8118629750849106e-05, "loss": 0.5405, "step": 308 }, { "epoch": 0.1990017710513605, "grad_norm": 0.6027217394611589, "learning_rate": 1.8106794745518225e-05, "loss": 0.5438, "step": 309 }, { "epoch": 0.19964578972790212, "grad_norm": 0.6249258890377516, "learning_rate": 1.8094926522809958e-05, "loss": 0.5541, "step": 310 }, { "epoch": 0.20028980840444374, "grad_norm": 0.5006468958317862, "learning_rate": 1.808302513135404e-05, "loss": 0.5368, "step": 311 }, { "epoch": 0.20093382708098534, "grad_norm": 0.6347897431799325, "learning_rate": 1.8071090619916095e-05, "loss": 0.5441, "step": 312 }, { "epoch": 0.20157784575752696, "grad_norm": 0.5875697480526517, "learning_rate": 1.8059123037397478e-05, "loss": 0.5306, "step": 313 }, { "epoch": 0.2022218644340686, "grad_norm": 0.5931664304542446, "learning_rate": 1.804712243283504e-05, "loss": 0.5341, "step": 314 }, { "epoch": 0.2028658831106102, "grad_norm": 0.5812558669574834, "learning_rate": 1.803508885540094e-05, "loss": 0.5529, "step": 315 }, { "epoch": 0.20350990178715184, "grad_norm": 0.6328982705472053, "learning_rate": 1.802302235440245e-05, "loss": 0.5412, "step": 316 }, { "epoch": 0.20415392046369343, "grad_norm": 4.063312470803794, "learning_rate": 1.801092297928173e-05, "loss": 0.5657, "step": 317 }, { "epoch": 0.20479793914023506, "grad_norm": 0.9132113253191437, "learning_rate": 1.799879077961566e-05, "loss": 0.5372, "step": 318 }, { "epoch": 0.20544195781677668, "grad_norm": 0.7132906422080909, "learning_rate": 1.7986625805115597e-05, "loss": 0.5338, "step": 319 }, { "epoch": 0.2060859764933183, "grad_norm": 2.4438971500139424, "learning_rate": 1.797442810562721e-05, "loss": 0.5491, "step": 320 }, { "epoch": 0.20672999516985993, "grad_norm": 0.77505152167916, "learning_rate": 1.796219773113025e-05, "loss": 0.5587, "step": 321 }, { "epoch": 0.20737401384640156, "grad_norm": 0.6949127822968382, "learning_rate": 1.7949934731738348e-05, "loss": 0.5376, "step": 322 }, { "epoch": 0.20801803252294315, "grad_norm": 0.6299574356933265, "learning_rate": 1.7937639157698825e-05, "loss": 0.5189, "step": 323 }, { "epoch": 0.20866205119948478, "grad_norm": 0.9245277189368559, "learning_rate": 1.7925311059392472e-05, "loss": 0.5469, "step": 324 }, { "epoch": 0.2093060698760264, "grad_norm": 0.6413696882554818, "learning_rate": 1.7912950487333345e-05, "loss": 0.5131, "step": 325 }, { "epoch": 0.20995008855256803, "grad_norm": 0.830459545314389, "learning_rate": 1.790055749216856e-05, "loss": 0.5589, "step": 326 }, { "epoch": 0.21059410722910965, "grad_norm": 0.7272821309507749, "learning_rate": 1.788813212467809e-05, "loss": 0.5445, "step": 327 }, { "epoch": 0.21123812590565128, "grad_norm": 0.8026614606503828, "learning_rate": 1.7875674435774546e-05, "loss": 0.5321, "step": 328 }, { "epoch": 0.21188214458219287, "grad_norm": 0.7499723993286973, "learning_rate": 1.7863184476502984e-05, "loss": 0.5279, "step": 329 }, { "epoch": 0.2125261632587345, "grad_norm": 0.7916691089886829, "learning_rate": 1.7850662298040676e-05, "loss": 0.5212, "step": 330 }, { "epoch": 0.21317018193527612, "grad_norm": 0.7158058862224479, "learning_rate": 1.7838107951696926e-05, "loss": 0.5331, "step": 331 }, { "epoch": 0.21381420061181775, "grad_norm": 3.8539704383824875, "learning_rate": 1.7825521488912833e-05, "loss": 0.5687, "step": 332 }, { "epoch": 0.21445821928835937, "grad_norm": 0.852900514184333, "learning_rate": 1.7812902961261092e-05, "loss": 0.5443, "step": 333 }, { "epoch": 0.215102237964901, "grad_norm": 0.7316600619548709, "learning_rate": 1.7800252420445788e-05, "loss": 0.5472, "step": 334 }, { "epoch": 0.2157462566414426, "grad_norm": 0.8103187315092233, "learning_rate": 1.7787569918302185e-05, "loss": 0.5291, "step": 335 }, { "epoch": 0.21639027531798422, "grad_norm": 0.8204657015723783, "learning_rate": 1.7774855506796497e-05, "loss": 0.5579, "step": 336 }, { "epoch": 0.21703429399452584, "grad_norm": 0.6852741284052017, "learning_rate": 1.7762109238025682e-05, "loss": 0.523, "step": 337 }, { "epoch": 0.21767831267106746, "grad_norm": 0.9144705080083887, "learning_rate": 1.774933116421725e-05, "loss": 0.5517, "step": 338 }, { "epoch": 0.2183223313476091, "grad_norm": 0.63599119251152, "learning_rate": 1.7736521337729013e-05, "loss": 0.5161, "step": 339 }, { "epoch": 0.2189663500241507, "grad_norm": 0.8480082320292742, "learning_rate": 1.7723679811048904e-05, "loss": 0.5493, "step": 340 }, { "epoch": 0.2196103687006923, "grad_norm": 0.6353424855066929, "learning_rate": 1.7710806636794734e-05, "loss": 0.5597, "step": 341 }, { "epoch": 0.22025438737723393, "grad_norm": 0.67395515967938, "learning_rate": 1.7697901867713997e-05, "loss": 0.5427, "step": 342 }, { "epoch": 0.22089840605377556, "grad_norm": 0.706726120026496, "learning_rate": 1.768496555668364e-05, "loss": 0.5583, "step": 343 }, { "epoch": 0.22154242473031718, "grad_norm": 0.5544728670718565, "learning_rate": 1.767199775670986e-05, "loss": 0.5408, "step": 344 }, { "epoch": 0.2221864434068588, "grad_norm": 0.7634385610741026, "learning_rate": 1.7658998520927878e-05, "loss": 0.5455, "step": 345 }, { "epoch": 0.22283046208340043, "grad_norm": 0.5289522672887458, "learning_rate": 1.764596790260171e-05, "loss": 0.5295, "step": 346 }, { "epoch": 0.22347448075994203, "grad_norm": 0.7059890223674293, "learning_rate": 1.763290595512398e-05, "loss": 0.5268, "step": 347 }, { "epoch": 0.22411849943648365, "grad_norm": 0.6748498539155762, "learning_rate": 1.7619812732015664e-05, "loss": 0.5447, "step": 348 }, { "epoch": 0.22476251811302528, "grad_norm": 0.7889342715252236, "learning_rate": 1.7606688286925905e-05, "loss": 0.5274, "step": 349 }, { "epoch": 0.2254065367895669, "grad_norm": 0.6516926578647317, "learning_rate": 1.7593532673631765e-05, "loss": 0.5488, "step": 350 }, { "epoch": 0.22605055546610853, "grad_norm": 0.6408716342075127, "learning_rate": 1.7580345946038025e-05, "loss": 0.5342, "step": 351 }, { "epoch": 0.22669457414265012, "grad_norm": 0.7442360922765621, "learning_rate": 1.7567128158176955e-05, "loss": 0.5264, "step": 352 }, { "epoch": 0.22733859281919175, "grad_norm": 0.5904178768727316, "learning_rate": 1.7553879364208088e-05, "loss": 0.5368, "step": 353 }, { "epoch": 0.22798261149573337, "grad_norm": 0.5276104196799523, "learning_rate": 1.754059961841801e-05, "loss": 0.5507, "step": 354 }, { "epoch": 0.228626630172275, "grad_norm": 0.5985478431826813, "learning_rate": 1.7527288975220123e-05, "loss": 0.5411, "step": 355 }, { "epoch": 0.22927064884881662, "grad_norm": 0.4867418393879931, "learning_rate": 1.7513947489154443e-05, "loss": 0.5172, "step": 356 }, { "epoch": 0.22991466752535825, "grad_norm": 0.5029540119817894, "learning_rate": 1.7500575214887354e-05, "loss": 0.5205, "step": 357 }, { "epoch": 0.23055868620189984, "grad_norm": 0.49781452278077104, "learning_rate": 1.7487172207211395e-05, "loss": 0.5466, "step": 358 }, { "epoch": 0.23120270487844147, "grad_norm": 0.4853185723890798, "learning_rate": 1.7473738521045042e-05, "loss": 0.5149, "step": 359 }, { "epoch": 0.2318467235549831, "grad_norm": 0.4687138554402621, "learning_rate": 1.7460274211432463e-05, "loss": 0.5322, "step": 360 }, { "epoch": 0.23249074223152472, "grad_norm": 0.5847121181473399, "learning_rate": 1.7446779333543312e-05, "loss": 0.5126, "step": 361 }, { "epoch": 0.23313476090806634, "grad_norm": 0.531141855417285, "learning_rate": 1.7433253942672497e-05, "loss": 0.5454, "step": 362 }, { "epoch": 0.23377877958460797, "grad_norm": 0.5302713992498358, "learning_rate": 1.7419698094239947e-05, "loss": 0.542, "step": 363 }, { "epoch": 0.23442279826114956, "grad_norm": 0.543746288189669, "learning_rate": 1.74061118437904e-05, "loss": 0.5461, "step": 364 }, { "epoch": 0.2350668169376912, "grad_norm": 0.5507857325455144, "learning_rate": 1.739249524699315e-05, "loss": 0.5287, "step": 365 }, { "epoch": 0.2357108356142328, "grad_norm": 0.5045429234802327, "learning_rate": 1.7378848359641846e-05, "loss": 0.5422, "step": 366 }, { "epoch": 0.23635485429077444, "grad_norm": 0.5617486363775281, "learning_rate": 1.736517123765425e-05, "loss": 0.5267, "step": 367 }, { "epoch": 0.23699887296731606, "grad_norm": 0.5599499982986116, "learning_rate": 1.7351463937072008e-05, "loss": 0.5648, "step": 368 }, { "epoch": 0.23764289164385768, "grad_norm": 0.6109756094611624, "learning_rate": 1.733772651406042e-05, "loss": 0.5341, "step": 369 }, { "epoch": 0.23828691032039928, "grad_norm": 0.5568393453569119, "learning_rate": 1.732395902490821e-05, "loss": 0.5389, "step": 370 }, { "epoch": 0.2389309289969409, "grad_norm": 0.5554929607920024, "learning_rate": 1.731016152602731e-05, "loss": 0.5435, "step": 371 }, { "epoch": 0.23957494767348253, "grad_norm": 0.49287223057657387, "learning_rate": 1.7296334073952606e-05, "loss": 0.5399, "step": 372 }, { "epoch": 0.24021896635002415, "grad_norm": 0.5310087361933457, "learning_rate": 1.7282476725341713e-05, "loss": 0.5516, "step": 373 }, { "epoch": 0.24086298502656578, "grad_norm": 0.47655736837646095, "learning_rate": 1.726858953697475e-05, "loss": 0.5428, "step": 374 }, { "epoch": 0.2415070037031074, "grad_norm": 0.4794259196276747, "learning_rate": 1.7254672565754115e-05, "loss": 0.5413, "step": 375 }, { "epoch": 0.242151022379649, "grad_norm": 0.6162302360110299, "learning_rate": 1.7240725868704218e-05, "loss": 0.5488, "step": 376 }, { "epoch": 0.24279504105619062, "grad_norm": 0.4963450348324772, "learning_rate": 1.7226749502971288e-05, "loss": 0.5477, "step": 377 }, { "epoch": 0.24343905973273225, "grad_norm": 0.5756297556895481, "learning_rate": 1.721274352582311e-05, "loss": 0.5348, "step": 378 }, { "epoch": 0.24408307840927387, "grad_norm": 0.5354162257312881, "learning_rate": 1.7198707994648812e-05, "loss": 0.5391, "step": 379 }, { "epoch": 0.2447270970858155, "grad_norm": 0.4962054894142848, "learning_rate": 1.718464296695861e-05, "loss": 0.5372, "step": 380 }, { "epoch": 0.2453711157623571, "grad_norm": 0.55529119567323, "learning_rate": 1.717054850038358e-05, "loss": 0.5282, "step": 381 }, { "epoch": 0.24601513443889872, "grad_norm": 0.49492289819453084, "learning_rate": 1.7156424652675433e-05, "loss": 0.5135, "step": 382 }, { "epoch": 0.24665915311544034, "grad_norm": 0.5432924059858214, "learning_rate": 1.7142271481706256e-05, "loss": 0.5187, "step": 383 }, { "epoch": 0.24730317179198197, "grad_norm": 0.5311903224401996, "learning_rate": 1.7128089045468294e-05, "loss": 0.5306, "step": 384 }, { "epoch": 0.2479471904685236, "grad_norm": 0.5325938017711984, "learning_rate": 1.711387740207371e-05, "loss": 0.5537, "step": 385 }, { "epoch": 0.24859120914506522, "grad_norm": 0.48844973517138895, "learning_rate": 1.709963660975433e-05, "loss": 0.538, "step": 386 }, { "epoch": 0.24923522782160681, "grad_norm": 0.5869943737151249, "learning_rate": 1.708536672686143e-05, "loss": 0.539, "step": 387 }, { "epoch": 0.24987924649814844, "grad_norm": 0.4927112335146023, "learning_rate": 1.7071067811865477e-05, "loss": 0.5532, "step": 388 }, { "epoch": 0.2505232651746901, "grad_norm": 0.6042246512410522, "learning_rate": 1.7056739923355897e-05, "loss": 0.5332, "step": 389 }, { "epoch": 0.25116728385123166, "grad_norm": 0.5179413855765288, "learning_rate": 1.7042383120040837e-05, "loss": 0.5527, "step": 390 }, { "epoch": 0.2518113025277733, "grad_norm": 0.6309709711686236, "learning_rate": 1.7027997460746914e-05, "loss": 0.5324, "step": 391 }, { "epoch": 0.2524553212043149, "grad_norm": 0.47259671347289706, "learning_rate": 1.7013583004418994e-05, "loss": 0.5239, "step": 392 }, { "epoch": 0.25309933988085653, "grad_norm": 0.6524585183311525, "learning_rate": 1.6999139810119927e-05, "loss": 0.5265, "step": 393 }, { "epoch": 0.25374335855739816, "grad_norm": 0.5551531486097826, "learning_rate": 1.698466793703032e-05, "loss": 0.5316, "step": 394 }, { "epoch": 0.2543873772339398, "grad_norm": 0.6661297393466529, "learning_rate": 1.697016744444829e-05, "loss": 0.5392, "step": 395 }, { "epoch": 0.2550313959104814, "grad_norm": 0.5720415818805304, "learning_rate": 1.695563839178923e-05, "loss": 0.5177, "step": 396 }, { "epoch": 0.25567541458702303, "grad_norm": 0.6988185302599157, "learning_rate": 1.6941080838585537e-05, "loss": 0.5202, "step": 397 }, { "epoch": 0.25631943326356466, "grad_norm": 0.5216110639494403, "learning_rate": 1.6926494844486412e-05, "loss": 0.5473, "step": 398 }, { "epoch": 0.2569634519401063, "grad_norm": 0.5793032742494075, "learning_rate": 1.6911880469257576e-05, "loss": 0.5529, "step": 399 }, { "epoch": 0.2576074706166479, "grad_norm": 0.5243461464847616, "learning_rate": 1.6897237772781046e-05, "loss": 0.5245, "step": 400 }, { "epoch": 0.25825148929318953, "grad_norm": 0.5401695257389927, "learning_rate": 1.6882566815054882e-05, "loss": 0.5551, "step": 401 }, { "epoch": 0.2588955079697311, "grad_norm": 0.5658781597354959, "learning_rate": 1.6867867656192946e-05, "loss": 0.5237, "step": 402 }, { "epoch": 0.2595395266462727, "grad_norm": 3.3371183984909725, "learning_rate": 1.685314035642466e-05, "loss": 0.5579, "step": 403 }, { "epoch": 0.26018354532281435, "grad_norm": 1.0539010956690584, "learning_rate": 1.6838384976094738e-05, "loss": 0.544, "step": 404 }, { "epoch": 0.26082756399935597, "grad_norm": 0.6541987436618641, "learning_rate": 1.6823601575662963e-05, "loss": 0.5413, "step": 405 }, { "epoch": 0.2614715826758976, "grad_norm": 1.0095164000077856, "learning_rate": 1.6808790215703933e-05, "loss": 0.5398, "step": 406 }, { "epoch": 0.2621156013524392, "grad_norm": 0.664402082862075, "learning_rate": 1.67939509569068e-05, "loss": 0.5184, "step": 407 }, { "epoch": 0.26275962002898084, "grad_norm": 0.8104343843921324, "learning_rate": 1.6779083860075032e-05, "loss": 0.5366, "step": 408 }, { "epoch": 0.26340363870552247, "grad_norm": 0.6030139826413455, "learning_rate": 1.6764188986126174e-05, "loss": 0.5437, "step": 409 }, { "epoch": 0.2640476573820641, "grad_norm": 0.6871877462541771, "learning_rate": 1.674926639609157e-05, "loss": 0.5203, "step": 410 }, { "epoch": 0.2646916760586057, "grad_norm": 0.6439203018912865, "learning_rate": 1.6734316151116143e-05, "loss": 0.5333, "step": 411 }, { "epoch": 0.26533569473514734, "grad_norm": 0.6280931975694123, "learning_rate": 1.6719338312458123e-05, "loss": 0.535, "step": 412 }, { "epoch": 0.2659797134116889, "grad_norm": 0.6466128385566419, "learning_rate": 1.670433294148882e-05, "loss": 0.5484, "step": 413 }, { "epoch": 0.26662373208823054, "grad_norm": 0.4932296577351758, "learning_rate": 1.6689300099692332e-05, "loss": 0.5303, "step": 414 }, { "epoch": 0.26726775076477216, "grad_norm": 6.2085796458426765, "learning_rate": 1.6674239848665336e-05, "loss": 0.5244, "step": 415 }, { "epoch": 0.2679117694413138, "grad_norm": 0.8823125273166225, "learning_rate": 1.665915225011681e-05, "loss": 0.5455, "step": 416 }, { "epoch": 0.2685557881178554, "grad_norm": 0.48037437918840037, "learning_rate": 1.66440373658678e-05, "loss": 0.5352, "step": 417 }, { "epoch": 0.26919980679439703, "grad_norm": 0.7936169291281236, "learning_rate": 1.6628895257851136e-05, "loss": 0.5258, "step": 418 }, { "epoch": 0.26984382547093866, "grad_norm": 0.474589091581904, "learning_rate": 1.661372598811121e-05, "loss": 0.5277, "step": 419 }, { "epoch": 0.2704878441474803, "grad_norm": 0.6917214716556759, "learning_rate": 1.65985296188037e-05, "loss": 0.5335, "step": 420 }, { "epoch": 0.2711318628240219, "grad_norm": 0.5122207207875377, "learning_rate": 1.658330621219533e-05, "loss": 0.5272, "step": 421 }, { "epoch": 0.27177588150056353, "grad_norm": 0.6912102958741447, "learning_rate": 1.656805583066361e-05, "loss": 0.5464, "step": 422 }, { "epoch": 0.27241990017710516, "grad_norm": 0.5965951730055572, "learning_rate": 1.655277853669657e-05, "loss": 0.548, "step": 423 }, { "epoch": 0.2730639188536468, "grad_norm": 0.6226709025362007, "learning_rate": 1.6537474392892527e-05, "loss": 0.5314, "step": 424 }, { "epoch": 0.27370793753018835, "grad_norm": 0.4753795689028092, "learning_rate": 1.6522143461959796e-05, "loss": 0.5214, "step": 425 }, { "epoch": 0.27435195620673, "grad_norm": 0.605806579418586, "learning_rate": 1.6506785806716464e-05, "loss": 0.5317, "step": 426 }, { "epoch": 0.2749959748832716, "grad_norm": 0.4943173711341386, "learning_rate": 1.6491401490090125e-05, "loss": 0.5358, "step": 427 }, { "epoch": 0.2756399935598132, "grad_norm": 0.6217344759242737, "learning_rate": 1.6475990575117603e-05, "loss": 0.5349, "step": 428 }, { "epoch": 0.27628401223635485, "grad_norm": 0.4913235389806793, "learning_rate": 1.646055312494472e-05, "loss": 0.5403, "step": 429 }, { "epoch": 0.2769280309128965, "grad_norm": 0.48794492359171915, "learning_rate": 1.644508920282601e-05, "loss": 0.5359, "step": 430 }, { "epoch": 0.2775720495894381, "grad_norm": 0.567869675408498, "learning_rate": 1.64295988721245e-05, "loss": 0.5275, "step": 431 }, { "epoch": 0.2782160682659797, "grad_norm": 0.5244620190233568, "learning_rate": 1.6414082196311402e-05, "loss": 0.5427, "step": 432 }, { "epoch": 0.27886008694252135, "grad_norm": 0.56898805788432, "learning_rate": 1.6398539238965886e-05, "loss": 0.5554, "step": 433 }, { "epoch": 0.27950410561906297, "grad_norm": 0.47429495615948214, "learning_rate": 1.638297006377481e-05, "loss": 0.5158, "step": 434 }, { "epoch": 0.2801481242956046, "grad_norm": 0.5984445538662492, "learning_rate": 1.636737473453246e-05, "loss": 0.5142, "step": 435 }, { "epoch": 0.2807921429721462, "grad_norm": 0.4487546071244268, "learning_rate": 1.6351753315140285e-05, "loss": 0.5364, "step": 436 }, { "epoch": 0.2814361616486878, "grad_norm": 0.6243102280382249, "learning_rate": 1.633610586960664e-05, "loss": 0.5461, "step": 437 }, { "epoch": 0.2820801803252294, "grad_norm": 0.6390814664180561, "learning_rate": 1.6320432462046516e-05, "loss": 0.5039, "step": 438 }, { "epoch": 0.28272419900177104, "grad_norm": 0.7147183797212385, "learning_rate": 1.630473315668129e-05, "loss": 0.5247, "step": 439 }, { "epoch": 0.28336821767831266, "grad_norm": 0.6215368465064426, "learning_rate": 1.6289008017838447e-05, "loss": 0.5389, "step": 440 }, { "epoch": 0.2840122363548543, "grad_norm": 0.5904097026080728, "learning_rate": 1.627325710995133e-05, "loss": 0.5252, "step": 441 }, { "epoch": 0.2846562550313959, "grad_norm": 0.5413694199859254, "learning_rate": 1.6257480497558873e-05, "loss": 0.5298, "step": 442 }, { "epoch": 0.28530027370793754, "grad_norm": 0.581025596768252, "learning_rate": 1.6241678245305323e-05, "loss": 0.5208, "step": 443 }, { "epoch": 0.28594429238447916, "grad_norm": 0.48633985759217585, "learning_rate": 1.622585041793999e-05, "loss": 0.5173, "step": 444 }, { "epoch": 0.2865883110610208, "grad_norm": 0.6407007122897388, "learning_rate": 1.6209997080316983e-05, "loss": 0.5366, "step": 445 }, { "epoch": 0.2872323297375624, "grad_norm": 0.4283091606872038, "learning_rate": 1.6194118297394935e-05, "loss": 0.5206, "step": 446 }, { "epoch": 0.28787634841410403, "grad_norm": 0.590600731775924, "learning_rate": 1.6178214134236733e-05, "loss": 0.5277, "step": 447 }, { "epoch": 0.2885203670906456, "grad_norm": 0.44686903797765287, "learning_rate": 1.6162284656009276e-05, "loss": 0.5271, "step": 448 }, { "epoch": 0.2891643857671872, "grad_norm": 0.6100459361188076, "learning_rate": 1.614632992798317e-05, "loss": 0.5365, "step": 449 }, { "epoch": 0.28980840444372885, "grad_norm": 0.5455659854250726, "learning_rate": 1.6130350015532498e-05, "loss": 0.5249, "step": 450 }, { "epoch": 0.2904524231202705, "grad_norm": 0.6064985211025599, "learning_rate": 1.6114344984134523e-05, "loss": 0.543, "step": 451 }, { "epoch": 0.2910964417968121, "grad_norm": 0.5978919219270399, "learning_rate": 1.6098314899369446e-05, "loss": 0.5534, "step": 452 }, { "epoch": 0.2917404604733537, "grad_norm": 0.5138212775941895, "learning_rate": 1.6082259826920114e-05, "loss": 0.5285, "step": 453 }, { "epoch": 0.29238447914989535, "grad_norm": 0.5579821932163525, "learning_rate": 1.6066179832571762e-05, "loss": 0.5289, "step": 454 }, { "epoch": 0.293028497826437, "grad_norm": 0.569971038691254, "learning_rate": 1.6050074982211738e-05, "loss": 0.5405, "step": 455 }, { "epoch": 0.2936725165029786, "grad_norm": 0.47286317045482684, "learning_rate": 1.603394534182925e-05, "loss": 0.5268, "step": 456 }, { "epoch": 0.2943165351795202, "grad_norm": 0.6388203573094701, "learning_rate": 1.6017790977515063e-05, "loss": 0.5222, "step": 457 }, { "epoch": 0.29496055385606185, "grad_norm": 0.5124343599095684, "learning_rate": 1.6001611955461265e-05, "loss": 0.5256, "step": 458 }, { "epoch": 0.29560457253260347, "grad_norm": 0.5788820664641883, "learning_rate": 1.598540834196097e-05, "loss": 0.5157, "step": 459 }, { "epoch": 0.29624859120914504, "grad_norm": 0.5164241639891456, "learning_rate": 1.5969180203408052e-05, "loss": 0.5208, "step": 460 }, { "epoch": 0.29689260988568666, "grad_norm": 0.48110610330750636, "learning_rate": 1.5952927606296888e-05, "loss": 0.5221, "step": 461 }, { "epoch": 0.2975366285622283, "grad_norm": 0.49698632152924693, "learning_rate": 1.5936650617222063e-05, "loss": 0.5119, "step": 462 }, { "epoch": 0.2981806472387699, "grad_norm": 0.634252037315933, "learning_rate": 1.592034930287811e-05, "loss": 0.536, "step": 463 }, { "epoch": 0.29882466591531154, "grad_norm": 0.47233175437886227, "learning_rate": 1.5904023730059227e-05, "loss": 0.5571, "step": 464 }, { "epoch": 0.29946868459185316, "grad_norm": 6.425827781209856, "learning_rate": 1.5887673965659027e-05, "loss": 0.5377, "step": 465 }, { "epoch": 0.3001127032683948, "grad_norm": 0.6053244039249738, "learning_rate": 1.5871300076670236e-05, "loss": 0.5255, "step": 466 }, { "epoch": 0.3007567219449364, "grad_norm": 0.48806704832549996, "learning_rate": 1.5854902130184426e-05, "loss": 0.5189, "step": 467 }, { "epoch": 0.30140074062147804, "grad_norm": 0.5711593465884596, "learning_rate": 1.5838480193391753e-05, "loss": 0.5445, "step": 468 }, { "epoch": 0.30204475929801966, "grad_norm": 0.6578442193666589, "learning_rate": 1.5822034333580675e-05, "loss": 0.5295, "step": 469 }, { "epoch": 0.3026887779745613, "grad_norm": 2.101104122639006, "learning_rate": 1.580556461813766e-05, "loss": 0.5269, "step": 470 }, { "epoch": 0.3033327966511029, "grad_norm": 0.6201276881634421, "learning_rate": 1.5789071114546937e-05, "loss": 0.5613, "step": 471 }, { "epoch": 0.3039768153276445, "grad_norm": 0.5459007543570429, "learning_rate": 1.5772553890390196e-05, "loss": 0.5431, "step": 472 }, { "epoch": 0.3046208340041861, "grad_norm": 0.6612236304363945, "learning_rate": 1.575601301334633e-05, "loss": 0.5618, "step": 473 }, { "epoch": 0.3052648526807277, "grad_norm": 0.549463428378769, "learning_rate": 1.573944855119115e-05, "loss": 0.5489, "step": 474 }, { "epoch": 0.30590887135726935, "grad_norm": 1.4648361672303511, "learning_rate": 1.5722860571797098e-05, "loss": 0.5353, "step": 475 }, { "epoch": 0.306552890033811, "grad_norm": 0.5267452704205237, "learning_rate": 1.5706249143132982e-05, "loss": 0.5196, "step": 476 }, { "epoch": 0.3071969087103526, "grad_norm": 0.6537828033741523, "learning_rate": 1.568961433326369e-05, "loss": 0.5327, "step": 477 }, { "epoch": 0.3078409273868942, "grad_norm": 0.6459383726604815, "learning_rate": 1.5672956210349923e-05, "loss": 0.538, "step": 478 }, { "epoch": 0.30848494606343585, "grad_norm": 0.5815615004214209, "learning_rate": 1.5656274842647895e-05, "loss": 0.5103, "step": 479 }, { "epoch": 0.3091289647399775, "grad_norm": 0.5309554603427217, "learning_rate": 1.5639570298509067e-05, "loss": 0.5447, "step": 480 }, { "epoch": 0.3097729834165191, "grad_norm": 0.5449851380912234, "learning_rate": 1.5622842646379872e-05, "loss": 0.5388, "step": 481 }, { "epoch": 0.3104170020930607, "grad_norm": 0.49185906877204505, "learning_rate": 1.560609195480142e-05, "loss": 0.504, "step": 482 }, { "epoch": 0.3110610207696023, "grad_norm": 0.5320261453323021, "learning_rate": 1.5589318292409227e-05, "loss": 0.5468, "step": 483 }, { "epoch": 0.3117050394461439, "grad_norm": 0.5379800136582633, "learning_rate": 1.5572521727932937e-05, "loss": 0.5308, "step": 484 }, { "epoch": 0.31234905812268554, "grad_norm": 0.509804085716603, "learning_rate": 1.5555702330196024e-05, "loss": 0.5128, "step": 485 }, { "epoch": 0.31299307679922717, "grad_norm": 0.4915137713624788, "learning_rate": 1.5538860168115527e-05, "loss": 0.5421, "step": 486 }, { "epoch": 0.3136370954757688, "grad_norm": 0.5590115694696917, "learning_rate": 1.5521995310701762e-05, "loss": 0.5363, "step": 487 }, { "epoch": 0.3142811141523104, "grad_norm": 0.46188350474506246, "learning_rate": 1.5505107827058038e-05, "loss": 0.5352, "step": 488 }, { "epoch": 0.31492513282885204, "grad_norm": 0.5099704525086937, "learning_rate": 1.5488197786380367e-05, "loss": 0.5076, "step": 489 }, { "epoch": 0.31556915150539366, "grad_norm": 0.49031531714703347, "learning_rate": 1.5471265257957202e-05, "loss": 0.5494, "step": 490 }, { "epoch": 0.3162131701819353, "grad_norm": 0.48200851214348306, "learning_rate": 1.5454310311169126e-05, "loss": 0.5072, "step": 491 }, { "epoch": 0.3168571888584769, "grad_norm": 0.5610390958261301, "learning_rate": 1.5437333015488586e-05, "loss": 0.5326, "step": 492 }, { "epoch": 0.31750120753501854, "grad_norm": 0.4722472940746285, "learning_rate": 1.542033344047961e-05, "loss": 0.5385, "step": 493 }, { "epoch": 0.31814522621156016, "grad_norm": 0.4523704022163936, "learning_rate": 1.5403311655797494e-05, "loss": 0.5301, "step": 494 }, { "epoch": 0.31878924488810173, "grad_norm": 0.4650308750304756, "learning_rate": 1.538626773118856e-05, "loss": 0.5234, "step": 495 }, { "epoch": 0.31943326356464335, "grad_norm": 0.4531119349467922, "learning_rate": 1.536920173648984e-05, "loss": 0.5298, "step": 496 }, { "epoch": 0.320077282241185, "grad_norm": 0.45895136851196094, "learning_rate": 1.5352113741628795e-05, "loss": 0.5409, "step": 497 }, { "epoch": 0.3207213009177266, "grad_norm": 0.48907878000919347, "learning_rate": 1.5335003816623027e-05, "loss": 0.5257, "step": 498 }, { "epoch": 0.32136531959426823, "grad_norm": 0.4540241823517835, "learning_rate": 1.5317872031580012e-05, "loss": 0.54, "step": 499 }, { "epoch": 0.32200933827080985, "grad_norm": 0.546350773113701, "learning_rate": 1.530071845669678e-05, "loss": 0.5441, "step": 500 }, { "epoch": 0.3226533569473515, "grad_norm": 0.4398122030982612, "learning_rate": 1.5283543162259652e-05, "loss": 0.5214, "step": 501 }, { "epoch": 0.3232973756238931, "grad_norm": 0.5518060416269056, "learning_rate": 1.526634621864395e-05, "loss": 0.5502, "step": 502 }, { "epoch": 0.3239413943004347, "grad_norm": 0.5327350156094324, "learning_rate": 1.5249127696313682e-05, "loss": 0.5411, "step": 503 }, { "epoch": 0.32458541297697635, "grad_norm": 0.4684665482683854, "learning_rate": 1.52318876658213e-05, "loss": 0.5206, "step": 504 }, { "epoch": 0.325229431653518, "grad_norm": 0.47263095436691543, "learning_rate": 1.5214626197807373e-05, "loss": 0.5443, "step": 505 }, { "epoch": 0.32587345033005954, "grad_norm": 8.664847911929632, "learning_rate": 1.5197343363000308e-05, "loss": 0.5857, "step": 506 }, { "epoch": 0.32651746900660117, "grad_norm": 0.5062567681161518, "learning_rate": 1.5180039232216062e-05, "loss": 0.5065, "step": 507 }, { "epoch": 0.3271614876831428, "grad_norm": 0.5246999197492658, "learning_rate": 1.516271387635786e-05, "loss": 0.5369, "step": 508 }, { "epoch": 0.3278055063596844, "grad_norm": 0.49531894227452317, "learning_rate": 1.5145367366415882e-05, "loss": 0.5249, "step": 509 }, { "epoch": 0.32844952503622604, "grad_norm": 0.6079837127458365, "learning_rate": 1.5127999773467001e-05, "loss": 0.5259, "step": 510 }, { "epoch": 0.32909354371276767, "grad_norm": 0.5105922056861126, "learning_rate": 1.5110611168674467e-05, "loss": 0.5317, "step": 511 }, { "epoch": 0.3297375623893093, "grad_norm": 0.5289726160910597, "learning_rate": 1.5093201623287631e-05, "loss": 0.4978, "step": 512 }, { "epoch": 0.3303815810658509, "grad_norm": 0.5499853268551226, "learning_rate": 1.5075771208641645e-05, "loss": 0.5464, "step": 513 }, { "epoch": 0.33102559974239254, "grad_norm": 0.5101375606566954, "learning_rate": 1.5058319996157172e-05, "loss": 0.5196, "step": 514 }, { "epoch": 0.33166961841893416, "grad_norm": 0.5752819384683452, "learning_rate": 1.5040848057340097e-05, "loss": 0.52, "step": 515 }, { "epoch": 0.3323136370954758, "grad_norm": 0.47841725807985064, "learning_rate": 1.5023355463781221e-05, "loss": 0.5309, "step": 516 }, { "epoch": 0.3329576557720174, "grad_norm": 0.6041814758530042, "learning_rate": 1.500584228715599e-05, "loss": 0.53, "step": 517 }, { "epoch": 0.333601674448559, "grad_norm": 0.4801266438599919, "learning_rate": 1.4988308599224182e-05, "loss": 0.5217, "step": 518 }, { "epoch": 0.3342456931251006, "grad_norm": 0.4836658509720551, "learning_rate": 1.497075447182962e-05, "loss": 0.5288, "step": 519 }, { "epoch": 0.33488971180164223, "grad_norm": 0.47621448134995104, "learning_rate": 1.4953179976899878e-05, "loss": 0.5366, "step": 520 }, { "epoch": 0.33553373047818386, "grad_norm": 0.5656502939337996, "learning_rate": 1.4935585186445982e-05, "loss": 0.513, "step": 521 }, { "epoch": 0.3361777491547255, "grad_norm": 0.46894666877877583, "learning_rate": 1.4917970172562122e-05, "loss": 0.521, "step": 522 }, { "epoch": 0.3368217678312671, "grad_norm": 0.584345157458721, "learning_rate": 1.4900335007425358e-05, "loss": 0.54, "step": 523 }, { "epoch": 0.33746578650780873, "grad_norm": 0.4864503975540166, "learning_rate": 1.4882679763295307e-05, "loss": 0.5134, "step": 524 }, { "epoch": 0.33810980518435035, "grad_norm": 0.5253363249769422, "learning_rate": 1.4865004512513865e-05, "loss": 0.539, "step": 525 }, { "epoch": 0.338753823860892, "grad_norm": 0.4866714774032188, "learning_rate": 1.484730932750491e-05, "loss": 0.5444, "step": 526 }, { "epoch": 0.3393978425374336, "grad_norm": 0.5337938235807133, "learning_rate": 1.4829594280773993e-05, "loss": 0.5172, "step": 527 }, { "epoch": 0.3400418612139752, "grad_norm": 0.46820992122914074, "learning_rate": 1.4811859444908053e-05, "loss": 0.5224, "step": 528 }, { "epoch": 0.34068587989051685, "grad_norm": 0.47355127512646383, "learning_rate": 1.4794104892575106e-05, "loss": 0.5478, "step": 529 }, { "epoch": 0.3413298985670584, "grad_norm": 0.5173671086120407, "learning_rate": 1.4776330696523964e-05, "loss": 0.5111, "step": 530 }, { "epoch": 0.34197391724360005, "grad_norm": 0.4533813620092789, "learning_rate": 1.4758536929583926e-05, "loss": 0.5284, "step": 531 }, { "epoch": 0.34261793592014167, "grad_norm": 0.45744216522053666, "learning_rate": 1.4740723664664483e-05, "loss": 0.5404, "step": 532 }, { "epoch": 0.3432619545966833, "grad_norm": 0.5824466172604327, "learning_rate": 1.4722890974755014e-05, "loss": 0.5242, "step": 533 }, { "epoch": 0.3439059732732249, "grad_norm": 0.4090575916582126, "learning_rate": 1.4705038932924502e-05, "loss": 0.5251, "step": 534 }, { "epoch": 0.34454999194976654, "grad_norm": 0.6080688014681355, "learning_rate": 1.4687167612321212e-05, "loss": 0.5376, "step": 535 }, { "epoch": 0.34519401062630817, "grad_norm": 0.45544942689915635, "learning_rate": 1.4669277086172406e-05, "loss": 0.5281, "step": 536 }, { "epoch": 0.3458380293028498, "grad_norm": 0.46763324715050675, "learning_rate": 1.4651367427784049e-05, "loss": 0.5244, "step": 537 }, { "epoch": 0.3464820479793914, "grad_norm": 0.5548968920249024, "learning_rate": 1.4633438710540488e-05, "loss": 0.5326, "step": 538 }, { "epoch": 0.34712606665593304, "grad_norm": 0.49754446662758334, "learning_rate": 1.4615491007904172e-05, "loss": 0.5183, "step": 539 }, { "epoch": 0.34777008533247467, "grad_norm": 0.5006669454296423, "learning_rate": 1.4597524393415336e-05, "loss": 0.5309, "step": 540 }, { "epoch": 0.34841410400901623, "grad_norm": 0.6267728115013602, "learning_rate": 1.4579538940691707e-05, "loss": 0.5273, "step": 541 }, { "epoch": 0.34905812268555786, "grad_norm": 0.514520257626635, "learning_rate": 1.4561534723428205e-05, "loss": 0.5385, "step": 542 }, { "epoch": 0.3497021413620995, "grad_norm": 0.5546315566986046, "learning_rate": 1.4543511815396638e-05, "loss": 0.5345, "step": 543 }, { "epoch": 0.3503461600386411, "grad_norm": 0.5402951757137505, "learning_rate": 1.4525470290445392e-05, "loss": 0.5099, "step": 544 }, { "epoch": 0.35099017871518273, "grad_norm": 0.5758543404444292, "learning_rate": 1.4507410222499136e-05, "loss": 0.5364, "step": 545 }, { "epoch": 0.35163419739172436, "grad_norm": 0.45978932085398583, "learning_rate": 1.4489331685558525e-05, "loss": 0.5333, "step": 546 }, { "epoch": 0.352278216068266, "grad_norm": 0.547695540651972, "learning_rate": 1.4471234753699887e-05, "loss": 0.533, "step": 547 }, { "epoch": 0.3529222347448076, "grad_norm": 0.48066001126474867, "learning_rate": 1.4453119501074924e-05, "loss": 0.5331, "step": 548 }, { "epoch": 0.35356625342134923, "grad_norm": 0.524713615347928, "learning_rate": 1.4434986001910403e-05, "loss": 0.543, "step": 549 }, { "epoch": 0.35421027209789085, "grad_norm": 0.5481099201114571, "learning_rate": 1.4416834330507857e-05, "loss": 0.5004, "step": 550 }, { "epoch": 0.3548542907744325, "grad_norm": 0.4722096389935113, "learning_rate": 1.4398664561243278e-05, "loss": 0.5358, "step": 551 }, { "epoch": 0.3554983094509741, "grad_norm": 0.5952405956207183, "learning_rate": 1.4380476768566825e-05, "loss": 0.5269, "step": 552 }, { "epoch": 0.3561423281275157, "grad_norm": 0.4643477688582924, "learning_rate": 1.4362271027002491e-05, "loss": 0.5289, "step": 553 }, { "epoch": 0.3567863468040573, "grad_norm": 0.528103917623065, "learning_rate": 1.434404741114782e-05, "loss": 0.5374, "step": 554 }, { "epoch": 0.3574303654805989, "grad_norm": 0.44088179393345045, "learning_rate": 1.43258059956736e-05, "loss": 0.4993, "step": 555 }, { "epoch": 0.35807438415714055, "grad_norm": 0.4296878118374209, "learning_rate": 1.4307546855323549e-05, "loss": 0.5469, "step": 556 }, { "epoch": 0.35871840283368217, "grad_norm": 0.47751249750000596, "learning_rate": 1.4289270064914012e-05, "loss": 0.5465, "step": 557 }, { "epoch": 0.3593624215102238, "grad_norm": 0.4705106678618365, "learning_rate": 1.4270975699333653e-05, "loss": 0.5138, "step": 558 }, { "epoch": 0.3600064401867654, "grad_norm": 0.47028399167409013, "learning_rate": 1.4252663833543157e-05, "loss": 0.525, "step": 559 }, { "epoch": 0.36065045886330704, "grad_norm": 0.48485269955745164, "learning_rate": 1.4234334542574906e-05, "loss": 0.5383, "step": 560 }, { "epoch": 0.36129447753984867, "grad_norm": 0.4537908684166729, "learning_rate": 1.4215987901532684e-05, "loss": 0.5049, "step": 561 }, { "epoch": 0.3619384962163903, "grad_norm": 0.5413395041419958, "learning_rate": 1.4197623985591373e-05, "loss": 0.5401, "step": 562 }, { "epoch": 0.3625825148929319, "grad_norm": 0.48274113469574687, "learning_rate": 1.4179242869996632e-05, "loss": 0.5211, "step": 563 }, { "epoch": 0.36322653356947354, "grad_norm": 0.5172842571211704, "learning_rate": 1.4160844630064596e-05, "loss": 0.5308, "step": 564 }, { "epoch": 0.3638705522460151, "grad_norm": 0.477937176448983, "learning_rate": 1.4142429341181569e-05, "loss": 0.5215, "step": 565 }, { "epoch": 0.36451457092255674, "grad_norm": 0.5317276547561788, "learning_rate": 1.4123997078803708e-05, "loss": 0.5202, "step": 566 }, { "epoch": 0.36515858959909836, "grad_norm": 0.49518385225700895, "learning_rate": 1.4105547918456726e-05, "loss": 0.4961, "step": 567 }, { "epoch": 0.36580260827564, "grad_norm": 0.5474676744378734, "learning_rate": 1.4087081935735565e-05, "loss": 0.543, "step": 568 }, { "epoch": 0.3664466269521816, "grad_norm": 0.47996318561540274, "learning_rate": 1.4068599206304104e-05, "loss": 0.5308, "step": 569 }, { "epoch": 0.36709064562872323, "grad_norm": 0.5285763088945041, "learning_rate": 1.4050099805894837e-05, "loss": 0.508, "step": 570 }, { "epoch": 0.36773466430526486, "grad_norm": 0.401075017587951, "learning_rate": 1.4031583810308568e-05, "loss": 0.5391, "step": 571 }, { "epoch": 0.3683786829818065, "grad_norm": 0.47692655521825933, "learning_rate": 1.4013051295414108e-05, "loss": 0.5438, "step": 572 }, { "epoch": 0.3690227016583481, "grad_norm": 0.4579064194570195, "learning_rate": 1.399450233714794e-05, "loss": 0.5293, "step": 573 }, { "epoch": 0.36966672033488973, "grad_norm": 0.49247635789052996, "learning_rate": 1.3975937011513931e-05, "loss": 0.5417, "step": 574 }, { "epoch": 0.37031073901143136, "grad_norm": 0.4858866332932424, "learning_rate": 1.3957355394583014e-05, "loss": 0.5238, "step": 575 }, { "epoch": 0.3709547576879729, "grad_norm": 0.5335680720810368, "learning_rate": 1.3938757562492873e-05, "loss": 0.5226, "step": 576 }, { "epoch": 0.37159877636451455, "grad_norm": 0.5064920070598151, "learning_rate": 1.3920143591447635e-05, "loss": 0.5315, "step": 577 }, { "epoch": 0.3722427950410562, "grad_norm": 0.5106128855865595, "learning_rate": 1.3901513557717554e-05, "loss": 0.5043, "step": 578 }, { "epoch": 0.3728868137175978, "grad_norm": 0.539816619174619, "learning_rate": 1.38828675376387e-05, "loss": 0.5139, "step": 579 }, { "epoch": 0.3735308323941394, "grad_norm": 0.5631067015446367, "learning_rate": 1.3864205607612648e-05, "loss": 0.5357, "step": 580 }, { "epoch": 0.37417485107068105, "grad_norm": 0.5256452657256547, "learning_rate": 1.3845527844106168e-05, "loss": 0.5345, "step": 581 }, { "epoch": 0.37481886974722267, "grad_norm": 0.5126615149780287, "learning_rate": 1.3826834323650899e-05, "loss": 0.5052, "step": 582 }, { "epoch": 0.3754628884237643, "grad_norm": 0.554540953290872, "learning_rate": 1.3808125122843048e-05, "loss": 0.5221, "step": 583 }, { "epoch": 0.3761069071003059, "grad_norm": 0.5261608242314687, "learning_rate": 1.378940031834307e-05, "loss": 0.5356, "step": 584 }, { "epoch": 0.37675092577684755, "grad_norm": 0.495236911110382, "learning_rate": 1.3770659986875354e-05, "loss": 0.5245, "step": 585 }, { "epoch": 0.37739494445338917, "grad_norm": 0.4896048097766887, "learning_rate": 1.3751904205227922e-05, "loss": 0.5115, "step": 586 }, { "epoch": 0.3780389631299308, "grad_norm": 0.48055466037449596, "learning_rate": 1.3733133050252087e-05, "loss": 0.5341, "step": 587 }, { "epoch": 0.37868298180647236, "grad_norm": 0.5121829914104667, "learning_rate": 1.3714346598862168e-05, "loss": 0.5023, "step": 588 }, { "epoch": 0.379327000483014, "grad_norm": 0.5532092627133531, "learning_rate": 1.369554492803514e-05, "loss": 0.5376, "step": 589 }, { "epoch": 0.3799710191595556, "grad_norm": 0.47336645856703446, "learning_rate": 1.3676728114810367e-05, "loss": 0.5207, "step": 590 }, { "epoch": 0.38061503783609724, "grad_norm": 0.5868357752686735, "learning_rate": 1.3657896236289237e-05, "loss": 0.5285, "step": 591 }, { "epoch": 0.38125905651263886, "grad_norm": 0.4814876560182141, "learning_rate": 1.3639049369634878e-05, "loss": 0.5147, "step": 592 }, { "epoch": 0.3819030751891805, "grad_norm": 0.5264558930646039, "learning_rate": 1.3620187592071825e-05, "loss": 0.5144, "step": 593 }, { "epoch": 0.3825470938657221, "grad_norm": 0.5769823868214177, "learning_rate": 1.3601310980885714e-05, "loss": 0.5288, "step": 594 }, { "epoch": 0.38319111254226373, "grad_norm": 0.4842584402258486, "learning_rate": 1.3582419613422962e-05, "loss": 0.5055, "step": 595 }, { "epoch": 0.38383513121880536, "grad_norm": 0.5228089972878686, "learning_rate": 1.356351356709045e-05, "loss": 0.5321, "step": 596 }, { "epoch": 0.384479149895347, "grad_norm": 0.5237745822811603, "learning_rate": 1.3544592919355203e-05, "loss": 0.5176, "step": 597 }, { "epoch": 0.3851231685718886, "grad_norm": 0.4974592469425546, "learning_rate": 1.3525657747744073e-05, "loss": 0.5279, "step": 598 }, { "epoch": 0.38576718724843023, "grad_norm": 0.4640692281180609, "learning_rate": 1.3506708129843426e-05, "loss": 0.542, "step": 599 }, { "epoch": 0.3864112059249718, "grad_norm": 0.5264438989775194, "learning_rate": 1.3487744143298822e-05, "loss": 0.5404, "step": 600 }, { "epoch": 0.3870552246015134, "grad_norm": 0.5339047748857655, "learning_rate": 1.3468765865814696e-05, "loss": 0.52, "step": 601 }, { "epoch": 0.38769924327805505, "grad_norm": 0.4538588756489845, "learning_rate": 1.344977337515404e-05, "loss": 0.5281, "step": 602 }, { "epoch": 0.3883432619545967, "grad_norm": 0.5578066067733515, "learning_rate": 1.343076674913808e-05, "loss": 0.4963, "step": 603 }, { "epoch": 0.3889872806311383, "grad_norm": 0.4498499608650284, "learning_rate": 1.3411746065645961e-05, "loss": 0.5425, "step": 604 }, { "epoch": 0.3896312993076799, "grad_norm": 0.47881775972117707, "learning_rate": 1.339271140261444e-05, "loss": 0.5147, "step": 605 }, { "epoch": 0.39027531798422155, "grad_norm": 0.48900992222347184, "learning_rate": 1.3373662838037538e-05, "loss": 0.5404, "step": 606 }, { "epoch": 0.3909193366607632, "grad_norm": 0.4796442238495644, "learning_rate": 1.335460044996625e-05, "loss": 0.5151, "step": 607 }, { "epoch": 0.3915633553373048, "grad_norm": 0.5272827681522313, "learning_rate": 1.3335524316508208e-05, "loss": 0.5275, "step": 608 }, { "epoch": 0.3922073740138464, "grad_norm": 0.4922691396095374, "learning_rate": 1.331643451582736e-05, "loss": 0.5088, "step": 609 }, { "epoch": 0.39285139269038805, "grad_norm": 0.527480671162024, "learning_rate": 1.3297331126143667e-05, "loss": 0.5269, "step": 610 }, { "epoch": 0.3934954113669296, "grad_norm": 0.5214951168482355, "learning_rate": 1.327821422573276e-05, "loss": 0.5398, "step": 611 }, { "epoch": 0.39413943004347124, "grad_norm": 0.6141981672822071, "learning_rate": 1.3259083892925633e-05, "loss": 0.536, "step": 612 }, { "epoch": 0.39478344872001286, "grad_norm": 0.5126976638488424, "learning_rate": 1.3239940206108322e-05, "loss": 0.5113, "step": 613 }, { "epoch": 0.3954274673965545, "grad_norm": 0.5235751760890134, "learning_rate": 1.3220783243721571e-05, "loss": 0.5336, "step": 614 }, { "epoch": 0.3960714860730961, "grad_norm": 0.575748114409721, "learning_rate": 1.3201613084260538e-05, "loss": 0.5088, "step": 615 }, { "epoch": 0.39671550474963774, "grad_norm": 0.4868065663873412, "learning_rate": 1.3182429806274442e-05, "loss": 0.5321, "step": 616 }, { "epoch": 0.39735952342617936, "grad_norm": 0.4892077292591214, "learning_rate": 1.3163233488366254e-05, "loss": 0.5208, "step": 617 }, { "epoch": 0.398003542102721, "grad_norm": 0.5683456741749565, "learning_rate": 1.3144024209192378e-05, "loss": 0.5314, "step": 618 }, { "epoch": 0.3986475607792626, "grad_norm": 0.5041684810220519, "learning_rate": 1.3124802047462335e-05, "loss": 0.5266, "step": 619 }, { "epoch": 0.39929157945580424, "grad_norm": 0.5121706951782409, "learning_rate": 1.3105567081938423e-05, "loss": 0.5253, "step": 620 }, { "epoch": 0.39993559813234586, "grad_norm": 0.5685657394941309, "learning_rate": 1.3086319391435405e-05, "loss": 0.5141, "step": 621 }, { "epoch": 0.4005796168088875, "grad_norm": 2.4626958503029894, "learning_rate": 1.3067059054820184e-05, "loss": 0.5458, "step": 622 }, { "epoch": 0.40122363548542905, "grad_norm": 0.6202986370174992, "learning_rate": 1.304778615101148e-05, "loss": 0.5247, "step": 623 }, { "epoch": 0.4018676541619707, "grad_norm": 0.5133939231814455, "learning_rate": 1.3028500758979507e-05, "loss": 0.5271, "step": 624 }, { "epoch": 0.4025116728385123, "grad_norm": 0.6187646348240546, "learning_rate": 1.3009202957745652e-05, "loss": 0.515, "step": 625 }, { "epoch": 0.4031556915150539, "grad_norm": 0.4534855013360643, "learning_rate": 1.2989892826382144e-05, "loss": 0.5445, "step": 626 }, { "epoch": 0.40379971019159555, "grad_norm": 0.6052997475499055, "learning_rate": 1.2970570444011739e-05, "loss": 0.5046, "step": 627 }, { "epoch": 0.4044437288681372, "grad_norm": 0.4855404909250577, "learning_rate": 1.2951235889807386e-05, "loss": 0.5143, "step": 628 }, { "epoch": 0.4050877475446788, "grad_norm": 0.5001989449883264, "learning_rate": 1.2931889242991906e-05, "loss": 0.5439, "step": 629 }, { "epoch": 0.4057317662212204, "grad_norm": 0.5904684092240299, "learning_rate": 1.2912530582837683e-05, "loss": 0.5357, "step": 630 }, { "epoch": 0.40637578489776205, "grad_norm": 0.5166516261664065, "learning_rate": 1.2893159988666309e-05, "loss": 0.5276, "step": 631 }, { "epoch": 0.4070198035743037, "grad_norm": 0.4590623786427123, "learning_rate": 1.2873777539848284e-05, "loss": 0.5058, "step": 632 }, { "epoch": 0.4076638222508453, "grad_norm": 0.5297074972707667, "learning_rate": 1.2854383315802679e-05, "loss": 0.5245, "step": 633 }, { "epoch": 0.40830784092738687, "grad_norm": 0.4833748504262761, "learning_rate": 1.2834977395996817e-05, "loss": 0.5079, "step": 634 }, { "epoch": 0.4089518596039285, "grad_norm": 0.5320778542906146, "learning_rate": 1.281555985994594e-05, "loss": 0.5155, "step": 635 }, { "epoch": 0.4095958782804701, "grad_norm": 0.5287393405596659, "learning_rate": 1.279613078721289e-05, "loss": 0.5305, "step": 636 }, { "epoch": 0.41023989695701174, "grad_norm": 0.4662036524412335, "learning_rate": 1.2776690257407782e-05, "loss": 0.518, "step": 637 }, { "epoch": 0.41088391563355336, "grad_norm": 0.6314752633432603, "learning_rate": 1.2757238350187669e-05, "loss": 0.5247, "step": 638 }, { "epoch": 0.411527934310095, "grad_norm": 0.4964566329039085, "learning_rate": 1.2737775145256229e-05, "loss": 0.5129, "step": 639 }, { "epoch": 0.4121719529866366, "grad_norm": 0.5992978695083512, "learning_rate": 1.2718300722363431e-05, "loss": 0.5227, "step": 640 }, { "epoch": 0.41281597166317824, "grad_norm": 0.504297782741665, "learning_rate": 1.269881516130521e-05, "loss": 0.5306, "step": 641 }, { "epoch": 0.41345999033971986, "grad_norm": 0.5496151852427029, "learning_rate": 1.2679318541923131e-05, "loss": 0.4866, "step": 642 }, { "epoch": 0.4141040090162615, "grad_norm": 0.5915621578507324, "learning_rate": 1.2659810944104083e-05, "loss": 0.5112, "step": 643 }, { "epoch": 0.4147480276928031, "grad_norm": 0.4779783059603057, "learning_rate": 1.2640292447779932e-05, "loss": 0.508, "step": 644 }, { "epoch": 0.41539204636934474, "grad_norm": 0.6019507937127447, "learning_rate": 1.2620763132927201e-05, "loss": 0.5184, "step": 645 }, { "epoch": 0.4160360650458863, "grad_norm": 0.6301570421607157, "learning_rate": 1.2601223079566745e-05, "loss": 0.5183, "step": 646 }, { "epoch": 0.41668008372242793, "grad_norm": 0.5299040796302318, "learning_rate": 1.2581672367763408e-05, "loss": 0.496, "step": 647 }, { "epoch": 0.41732410239896955, "grad_norm": 0.610001471554248, "learning_rate": 1.2562111077625723e-05, "loss": 0.5375, "step": 648 }, { "epoch": 0.4179681210755112, "grad_norm": 0.7168959887097923, "learning_rate": 1.2542539289305559e-05, "loss": 0.515, "step": 649 }, { "epoch": 0.4186121397520528, "grad_norm": 0.47483828677629086, "learning_rate": 1.25229570829978e-05, "loss": 0.5229, "step": 650 }, { "epoch": 0.4192561584285944, "grad_norm": 0.555528891612195, "learning_rate": 1.250336453894002e-05, "loss": 0.5254, "step": 651 }, { "epoch": 0.41990017710513605, "grad_norm": 0.5568503637934826, "learning_rate": 1.248376173741215e-05, "loss": 0.5281, "step": 652 }, { "epoch": 0.4205441957816777, "grad_norm": 0.5240668965333174, "learning_rate": 1.246414875873615e-05, "loss": 0.5194, "step": 653 }, { "epoch": 0.4211882144582193, "grad_norm": 0.45235618153426793, "learning_rate": 1.2444525683275687e-05, "loss": 0.5146, "step": 654 }, { "epoch": 0.4218322331347609, "grad_norm": 0.5521466326248953, "learning_rate": 1.242489259143579e-05, "loss": 0.5221, "step": 655 }, { "epoch": 0.42247625181130255, "grad_norm": 0.5426175375285518, "learning_rate": 1.2405249563662539e-05, "loss": 0.5415, "step": 656 }, { "epoch": 0.4231202704878442, "grad_norm": 0.43392177722480535, "learning_rate": 1.2385596680442715e-05, "loss": 0.5299, "step": 657 }, { "epoch": 0.42376428916438574, "grad_norm": 0.5617074662455528, "learning_rate": 1.2365934022303491e-05, "loss": 0.5427, "step": 658 }, { "epoch": 0.42440830784092737, "grad_norm": 0.6180056165797571, "learning_rate": 1.2346261669812093e-05, "loss": 0.5093, "step": 659 }, { "epoch": 0.425052326517469, "grad_norm": 0.5581451158175698, "learning_rate": 1.2326579703575464e-05, "loss": 0.5292, "step": 660 }, { "epoch": 0.4256963451940106, "grad_norm": 0.5516432820791123, "learning_rate": 1.2306888204239938e-05, "loss": 0.5266, "step": 661 }, { "epoch": 0.42634036387055224, "grad_norm": 0.5122207849376796, "learning_rate": 1.2287187252490914e-05, "loss": 0.5117, "step": 662 }, { "epoch": 0.42698438254709387, "grad_norm": 0.46459811589510297, "learning_rate": 1.2267476929052521e-05, "loss": 0.522, "step": 663 }, { "epoch": 0.4276284012236355, "grad_norm": 0.5562684753813433, "learning_rate": 1.2247757314687296e-05, "loss": 0.5244, "step": 664 }, { "epoch": 0.4282724199001771, "grad_norm": 0.44236122538210704, "learning_rate": 1.2228028490195831e-05, "loss": 0.5291, "step": 665 }, { "epoch": 0.42891643857671874, "grad_norm": 0.5653767825565299, "learning_rate": 1.2208290536416466e-05, "loss": 0.5199, "step": 666 }, { "epoch": 0.42956045725326036, "grad_norm": 0.462220824672624, "learning_rate": 1.218854353422494e-05, "loss": 0.5292, "step": 667 }, { "epoch": 0.430204475929802, "grad_norm": 0.48761588775757225, "learning_rate": 1.2168787564534078e-05, "loss": 0.5355, "step": 668 }, { "epoch": 0.43084849460634356, "grad_norm": 0.5135769791884622, "learning_rate": 1.2149022708293447e-05, "loss": 0.523, "step": 669 }, { "epoch": 0.4314925132828852, "grad_norm": 0.4418738867287494, "learning_rate": 1.212924904648902e-05, "loss": 0.5208, "step": 670 }, { "epoch": 0.4321365319594268, "grad_norm": 0.5791089628890221, "learning_rate": 1.2109466660142853e-05, "loss": 0.5451, "step": 671 }, { "epoch": 0.43278055063596843, "grad_norm": 0.42943820725315535, "learning_rate": 1.2089675630312755e-05, "loss": 0.5054, "step": 672 }, { "epoch": 0.43342456931251006, "grad_norm": 2.4150592688641708, "learning_rate": 1.2069876038091941e-05, "loss": 0.5446, "step": 673 }, { "epoch": 0.4340685879890517, "grad_norm": 0.6113863975195802, "learning_rate": 1.2050067964608725e-05, "loss": 0.5145, "step": 674 }, { "epoch": 0.4347126066655933, "grad_norm": 0.40895837801557694, "learning_rate": 1.2030251491026162e-05, "loss": 0.5276, "step": 675 }, { "epoch": 0.43535662534213493, "grad_norm": 0.5559889821550731, "learning_rate": 1.2010426698541728e-05, "loss": 0.5406, "step": 676 }, { "epoch": 0.43600064401867655, "grad_norm": 0.48736014447516707, "learning_rate": 1.199059366838699e-05, "loss": 0.5069, "step": 677 }, { "epoch": 0.4366446626952182, "grad_norm": 0.435247808943822, "learning_rate": 1.1970752481827261e-05, "loss": 0.5383, "step": 678 }, { "epoch": 0.4372886813717598, "grad_norm": 9.225683409826644, "learning_rate": 1.1950903220161286e-05, "loss": 0.5351, "step": 679 }, { "epoch": 0.4379327000483014, "grad_norm": 0.6477916937296041, "learning_rate": 1.1931045964720882e-05, "loss": 0.5304, "step": 680 }, { "epoch": 0.438576718724843, "grad_norm": 0.45899096743824197, "learning_rate": 1.1911180796870632e-05, "loss": 0.5302, "step": 681 }, { "epoch": 0.4392207374013846, "grad_norm": 0.5685170213550881, "learning_rate": 1.1891307798007536e-05, "loss": 0.5199, "step": 682 }, { "epoch": 0.43986475607792624, "grad_norm": 0.4885213421194584, "learning_rate": 1.1871427049560687e-05, "loss": 0.515, "step": 683 }, { "epoch": 0.44050877475446787, "grad_norm": 0.5090348543398081, "learning_rate": 1.1851538632990922e-05, "loss": 0.5342, "step": 684 }, { "epoch": 0.4411527934310095, "grad_norm": 0.4489156539818113, "learning_rate": 1.1831642629790503e-05, "loss": 0.5325, "step": 685 }, { "epoch": 0.4417968121075511, "grad_norm": 0.5276136475249494, "learning_rate": 1.1811739121482777e-05, "loss": 0.505, "step": 686 }, { "epoch": 0.44244083078409274, "grad_norm": 0.4942227182726111, "learning_rate": 1.1791828189621848e-05, "loss": 0.5249, "step": 687 }, { "epoch": 0.44308484946063437, "grad_norm": 0.4431184530870821, "learning_rate": 1.177190991579223e-05, "loss": 0.524, "step": 688 }, { "epoch": 0.443728868137176, "grad_norm": 0.5009172837998283, "learning_rate": 1.1751984381608524e-05, "loss": 0.5367, "step": 689 }, { "epoch": 0.4443728868137176, "grad_norm": 0.44996202763437326, "learning_rate": 1.1732051668715082e-05, "loss": 0.5696, "step": 690 }, { "epoch": 0.44501690549025924, "grad_norm": 0.518584371560926, "learning_rate": 1.1712111858785663e-05, "loss": 0.5403, "step": 691 }, { "epoch": 0.44566092416680086, "grad_norm": 0.46335877286811983, "learning_rate": 1.1692165033523117e-05, "loss": 0.5417, "step": 692 }, { "epoch": 0.44630494284334243, "grad_norm": 0.5381700671563514, "learning_rate": 1.1672211274659034e-05, "loss": 0.485, "step": 693 }, { "epoch": 0.44694896151988406, "grad_norm": 0.5563871950730694, "learning_rate": 1.1652250663953415e-05, "loss": 0.5169, "step": 694 }, { "epoch": 0.4475929801964257, "grad_norm": 0.4135966705982468, "learning_rate": 1.1632283283194331e-05, "loss": 0.4888, "step": 695 }, { "epoch": 0.4482369988729673, "grad_norm": 0.7845080238263218, "learning_rate": 1.1612309214197599e-05, "loss": 0.5378, "step": 696 }, { "epoch": 0.44888101754950893, "grad_norm": 0.49773024314572367, "learning_rate": 1.1592328538806439e-05, "loss": 0.5241, "step": 697 }, { "epoch": 0.44952503622605056, "grad_norm": 0.46789510996754863, "learning_rate": 1.1572341338891145e-05, "loss": 0.5047, "step": 698 }, { "epoch": 0.4501690549025922, "grad_norm": 0.5403114622931047, "learning_rate": 1.1552347696348733e-05, "loss": 0.5158, "step": 699 }, { "epoch": 0.4508130735791338, "grad_norm": 0.45790092265378624, "learning_rate": 1.1532347693102632e-05, "loss": 0.5139, "step": 700 }, { "epoch": 0.45145709225567543, "grad_norm": 0.6283568132607721, "learning_rate": 1.1512341411102324e-05, "loss": 0.5223, "step": 701 }, { "epoch": 0.45210111093221705, "grad_norm": 0.5131178273456416, "learning_rate": 1.1492328932323022e-05, "loss": 0.5017, "step": 702 }, { "epoch": 0.4527451296087587, "grad_norm": 0.45658443467674115, "learning_rate": 1.147231033876533e-05, "loss": 0.5223, "step": 703 }, { "epoch": 0.45338914828530025, "grad_norm": 0.6534712449215364, "learning_rate": 1.1452285712454905e-05, "loss": 0.5214, "step": 704 }, { "epoch": 0.45403316696184187, "grad_norm": 0.43215614436861616, "learning_rate": 1.1432255135442126e-05, "loss": 0.527, "step": 705 }, { "epoch": 0.4546771856383835, "grad_norm": 0.4226996506928508, "learning_rate": 1.1412218689801748e-05, "loss": 0.5005, "step": 706 }, { "epoch": 0.4553212043149251, "grad_norm": 0.6079854843976552, "learning_rate": 1.1392176457632586e-05, "loss": 0.5185, "step": 707 }, { "epoch": 0.45596522299146675, "grad_norm": 0.4961516836906766, "learning_rate": 1.1372128521057155e-05, "loss": 0.5197, "step": 708 }, { "epoch": 0.45660924166800837, "grad_norm": 0.44267174935929304, "learning_rate": 1.1352074962221342e-05, "loss": 0.5229, "step": 709 }, { "epoch": 0.45725326034455, "grad_norm": 0.4735975071799169, "learning_rate": 1.1332015863294078e-05, "loss": 0.5254, "step": 710 }, { "epoch": 0.4578972790210916, "grad_norm": 0.48339138784449937, "learning_rate": 1.1311951306466987e-05, "loss": 0.5172, "step": 711 }, { "epoch": 0.45854129769763324, "grad_norm": 0.49481296714347883, "learning_rate": 1.1291881373954066e-05, "loss": 0.5176, "step": 712 }, { "epoch": 0.45918531637417487, "grad_norm": 0.42444087828947774, "learning_rate": 1.127180614799133e-05, "loss": 0.5029, "step": 713 }, { "epoch": 0.4598293350507165, "grad_norm": 0.5110066047350137, "learning_rate": 1.125172571083649e-05, "loss": 0.5348, "step": 714 }, { "epoch": 0.4604733537272581, "grad_norm": 0.44872295214081054, "learning_rate": 1.1231640144768604e-05, "loss": 0.5141, "step": 715 }, { "epoch": 0.4611173724037997, "grad_norm": 0.4648038046643429, "learning_rate": 1.1211549532087749e-05, "loss": 0.5437, "step": 716 }, { "epoch": 0.4617613910803413, "grad_norm": 0.9444591286854669, "learning_rate": 1.1191453955114681e-05, "loss": 0.5301, "step": 717 }, { "epoch": 0.46240540975688293, "grad_norm": 0.450030084262089, "learning_rate": 1.1171353496190499e-05, "loss": 0.5017, "step": 718 }, { "epoch": 0.46304942843342456, "grad_norm": 0.4195404836577633, "learning_rate": 1.1151248237676297e-05, "loss": 0.5185, "step": 719 }, { "epoch": 0.4636934471099662, "grad_norm": 0.5096106351105627, "learning_rate": 1.1131138261952845e-05, "loss": 0.5204, "step": 720 }, { "epoch": 0.4643374657865078, "grad_norm": 0.5117655318704323, "learning_rate": 1.1111023651420231e-05, "loss": 0.5386, "step": 721 }, { "epoch": 0.46498148446304943, "grad_norm": 0.4235812481203796, "learning_rate": 1.109090448849755e-05, "loss": 0.5154, "step": 722 }, { "epoch": 0.46562550313959106, "grad_norm": 0.44900114808206754, "learning_rate": 1.1070780855622537e-05, "loss": 0.5196, "step": 723 }, { "epoch": 0.4662695218161327, "grad_norm": 0.6035354306256341, "learning_rate": 1.105065283525124e-05, "loss": 0.539, "step": 724 }, { "epoch": 0.4669135404926743, "grad_norm": 0.46155743695611784, "learning_rate": 1.10305205098577e-05, "loss": 0.5185, "step": 725 }, { "epoch": 0.46755755916921593, "grad_norm": 0.46843510112287273, "learning_rate": 1.1010383961933582e-05, "loss": 0.5155, "step": 726 }, { "epoch": 0.4682015778457575, "grad_norm": 0.582056662172735, "learning_rate": 1.0990243273987863e-05, "loss": 0.5155, "step": 727 }, { "epoch": 0.4688455965222991, "grad_norm": 0.4702684554491737, "learning_rate": 1.0970098528546482e-05, "loss": 0.4974, "step": 728 }, { "epoch": 0.46948961519884075, "grad_norm": 0.4252736683622367, "learning_rate": 1.0949949808151998e-05, "loss": 0.5151, "step": 729 }, { "epoch": 0.4701336338753824, "grad_norm": 0.5719730792544944, "learning_rate": 1.0929797195363259e-05, "loss": 0.5266, "step": 730 }, { "epoch": 0.470777652551924, "grad_norm": 11.164152383121039, "learning_rate": 1.0909640772755065e-05, "loss": 0.5522, "step": 731 }, { "epoch": 0.4714216712284656, "grad_norm": 0.551446352698729, "learning_rate": 1.088948062291783e-05, "loss": 0.5255, "step": 732 }, { "epoch": 0.47206568990500725, "grad_norm": 0.46745399250318975, "learning_rate": 1.086931682845723e-05, "loss": 0.5144, "step": 733 }, { "epoch": 0.47270970858154887, "grad_norm": 0.46444922001914246, "learning_rate": 1.0849149471993883e-05, "loss": 0.5138, "step": 734 }, { "epoch": 0.4733537272580905, "grad_norm": 0.47292430517810286, "learning_rate": 1.0828978636162996e-05, "loss": 0.5107, "step": 735 }, { "epoch": 0.4739977459346321, "grad_norm": 0.445857873436325, "learning_rate": 1.0808804403614044e-05, "loss": 0.5358, "step": 736 }, { "epoch": 0.47464176461117374, "grad_norm": 0.478386015806705, "learning_rate": 1.0788626857010404e-05, "loss": 0.5211, "step": 737 }, { "epoch": 0.47528578328771537, "grad_norm": 0.4618194721219019, "learning_rate": 1.0768446079029044e-05, "loss": 0.5366, "step": 738 }, { "epoch": 0.47592980196425694, "grad_norm": 0.4198539597278469, "learning_rate": 1.074826215236017e-05, "loss": 0.5127, "step": 739 }, { "epoch": 0.47657382064079856, "grad_norm": 0.47770863679142117, "learning_rate": 1.0728075159706881e-05, "loss": 0.5301, "step": 740 }, { "epoch": 0.4772178393173402, "grad_norm": 0.41000540677804875, "learning_rate": 1.0707885183784857e-05, "loss": 0.5069, "step": 741 }, { "epoch": 0.4778618579938818, "grad_norm": 0.45821239998194363, "learning_rate": 1.0687692307321984e-05, "loss": 0.5156, "step": 742 }, { "epoch": 0.47850587667042344, "grad_norm": 0.5037915368743111, "learning_rate": 1.0667496613058044e-05, "loss": 0.5359, "step": 743 }, { "epoch": 0.47914989534696506, "grad_norm": 0.47142835312992654, "learning_rate": 1.0647298183744359e-05, "loss": 0.5083, "step": 744 }, { "epoch": 0.4797939140235067, "grad_norm": 0.5462666879588723, "learning_rate": 1.0627097102143458e-05, "loss": 0.5086, "step": 745 }, { "epoch": 0.4804379327000483, "grad_norm": 0.3887770581359485, "learning_rate": 1.0606893451028743e-05, "loss": 0.5136, "step": 746 }, { "epoch": 0.48108195137658993, "grad_norm": 0.4854634597462066, "learning_rate": 1.0586687313184141e-05, "loss": 0.5254, "step": 747 }, { "epoch": 0.48172597005313156, "grad_norm": 0.5011236970760449, "learning_rate": 1.0566478771403763e-05, "loss": 0.5266, "step": 748 }, { "epoch": 0.4823699887296732, "grad_norm": 0.3816011264637712, "learning_rate": 1.0546267908491582e-05, "loss": 0.4943, "step": 749 }, { "epoch": 0.4830140074062148, "grad_norm": 0.46642767723567535, "learning_rate": 1.0526054807261067e-05, "loss": 0.506, "step": 750 }, { "epoch": 0.4836580260827564, "grad_norm": 0.5090620003424909, "learning_rate": 1.0505839550534875e-05, "loss": 0.5053, "step": 751 }, { "epoch": 0.484302044759298, "grad_norm": 0.46163744923187133, "learning_rate": 1.0485622221144485e-05, "loss": 0.5253, "step": 752 }, { "epoch": 0.4849460634358396, "grad_norm": 0.4938914838020393, "learning_rate": 1.0465402901929864e-05, "loss": 0.5262, "step": 753 }, { "epoch": 0.48559008211238125, "grad_norm": 0.5345318367237433, "learning_rate": 1.0445181675739144e-05, "loss": 0.5307, "step": 754 }, { "epoch": 0.4862341007889229, "grad_norm": 0.4539637071831038, "learning_rate": 1.0424958625428264e-05, "loss": 0.5113, "step": 755 }, { "epoch": 0.4868781194654645, "grad_norm": 0.4922691161984964, "learning_rate": 1.0404733833860639e-05, "loss": 0.5114, "step": 756 }, { "epoch": 0.4875221381420061, "grad_norm": 1.8822040975263505, "learning_rate": 1.0384507383906819e-05, "loss": 0.5303, "step": 757 }, { "epoch": 0.48816615681854775, "grad_norm": 0.6958911391252204, "learning_rate": 1.0364279358444144e-05, "loss": 0.5287, "step": 758 }, { "epoch": 0.48881017549508937, "grad_norm": 0.43284470341948955, "learning_rate": 1.034404984035642e-05, "loss": 0.5041, "step": 759 }, { "epoch": 0.489454194171631, "grad_norm": 0.5113029332143848, "learning_rate": 1.0323818912533561e-05, "loss": 0.5381, "step": 760 }, { "epoch": 0.4900982128481726, "grad_norm": 0.5845494159500858, "learning_rate": 1.0303586657871258e-05, "loss": 0.498, "step": 761 }, { "epoch": 0.4907422315247142, "grad_norm": 0.5130878294581405, "learning_rate": 1.0283353159270644e-05, "loss": 0.5008, "step": 762 }, { "epoch": 0.4913862502012558, "grad_norm": 0.49481456988159866, "learning_rate": 1.0263118499637942e-05, "loss": 0.5231, "step": 763 }, { "epoch": 0.49203026887779744, "grad_norm": 0.5412269102358809, "learning_rate": 1.0242882761884132e-05, "loss": 0.501, "step": 764 }, { "epoch": 0.49267428755433906, "grad_norm": 0.5116484879652448, "learning_rate": 1.022264602892462e-05, "loss": 0.517, "step": 765 }, { "epoch": 0.4933183062308807, "grad_norm": 0.4980766228761759, "learning_rate": 1.0202408383678887e-05, "loss": 0.5198, "step": 766 }, { "epoch": 0.4939623249074223, "grad_norm": 0.4940474936361702, "learning_rate": 1.0182169909070148e-05, "loss": 0.5097, "step": 767 }, { "epoch": 0.49460634358396394, "grad_norm": 0.4645622308164305, "learning_rate": 1.0161930688025018e-05, "loss": 0.5307, "step": 768 }, { "epoch": 0.49525036226050556, "grad_norm": 0.5058607801341798, "learning_rate": 1.0141690803473167e-05, "loss": 0.508, "step": 769 }, { "epoch": 0.4958943809370472, "grad_norm": 0.45625727980919695, "learning_rate": 1.012145033834699e-05, "loss": 0.536, "step": 770 }, { "epoch": 0.4965383996135888, "grad_norm": 0.45597709293356414, "learning_rate": 1.010120937558126e-05, "loss": 0.5207, "step": 771 }, { "epoch": 0.49718241829013043, "grad_norm": 0.4556881258518035, "learning_rate": 1.0080967998112787e-05, "loss": 0.5256, "step": 772 }, { "epoch": 0.49782643696667206, "grad_norm": 0.4240946054722722, "learning_rate": 1.0060726288880081e-05, "loss": 0.5302, "step": 773 }, { "epoch": 0.49847045564321363, "grad_norm": 0.48087482372886775, "learning_rate": 1.0040484330823006e-05, "loss": 0.5131, "step": 774 }, { "epoch": 0.49911447431975525, "grad_norm": 0.45144947017603165, "learning_rate": 1.0020242206882459e-05, "loss": 0.4979, "step": 775 }, { "epoch": 0.4997584929962969, "grad_norm": 0.48099097911909894, "learning_rate": 1e-05, "loss": 0.5403, "step": 776 }, { "epoch": 0.5004025116728386, "grad_norm": 0.4058938550458502, "learning_rate": 9.979757793117545e-06, "loss": 0.5239, "step": 777 }, { "epoch": 0.5010465303493802, "grad_norm": 0.3993308903278203, "learning_rate": 9.959515669176997e-06, "loss": 0.5233, "step": 778 }, { "epoch": 0.5016905490259218, "grad_norm": 0.4471552990071803, "learning_rate": 9.939273711119922e-06, "loss": 0.5068, "step": 779 }, { "epoch": 0.5023345677024633, "grad_norm": 0.4439715428453816, "learning_rate": 9.919032001887215e-06, "loss": 0.5235, "step": 780 }, { "epoch": 0.5029785863790049, "grad_norm": 0.4462938589021754, "learning_rate": 9.898790624418743e-06, "loss": 0.5146, "step": 781 }, { "epoch": 0.5036226050555466, "grad_norm": 0.47610607557414614, "learning_rate": 9.878549661653013e-06, "loss": 0.5064, "step": 782 }, { "epoch": 0.5042666237320882, "grad_norm": 0.47142225328274867, "learning_rate": 9.858309196526836e-06, "loss": 0.5192, "step": 783 }, { "epoch": 0.5049106424086298, "grad_norm": 0.44673853922194323, "learning_rate": 9.838069311974986e-06, "loss": 0.5216, "step": 784 }, { "epoch": 0.5055546610851714, "grad_norm": 0.5092748987507197, "learning_rate": 9.817830090929856e-06, "loss": 0.5374, "step": 785 }, { "epoch": 0.5061986797617131, "grad_norm": 0.45008037080814534, "learning_rate": 9.797591616321115e-06, "loss": 0.5313, "step": 786 }, { "epoch": 0.5068426984382547, "grad_norm": 0.44650648000776744, "learning_rate": 9.777353971075381e-06, "loss": 0.5342, "step": 787 }, { "epoch": 0.5074867171147963, "grad_norm": 0.43999959209027495, "learning_rate": 9.757117238115871e-06, "loss": 0.532, "step": 788 }, { "epoch": 0.5081307357913379, "grad_norm": 0.5153621248245271, "learning_rate": 9.736881500362064e-06, "loss": 0.5184, "step": 789 }, { "epoch": 0.5087747544678796, "grad_norm": 0.4416171023115438, "learning_rate": 9.71664684072936e-06, "loss": 0.5324, "step": 790 }, { "epoch": 0.5094187731444212, "grad_norm": 0.438762000235425, "learning_rate": 9.696413342128747e-06, "loss": 0.5188, "step": 791 }, { "epoch": 0.5100627918209628, "grad_norm": 0.476020047124946, "learning_rate": 9.676181087466444e-06, "loss": 0.5142, "step": 792 }, { "epoch": 0.5107068104975044, "grad_norm": 0.4062965884057422, "learning_rate": 9.655950159643583e-06, "loss": 0.5284, "step": 793 }, { "epoch": 0.5113508291740461, "grad_norm": 0.49101561101072244, "learning_rate": 9.63572064155586e-06, "loss": 0.5125, "step": 794 }, { "epoch": 0.5119948478505877, "grad_norm": 0.469380579242041, "learning_rate": 9.615492616093188e-06, "loss": 0.5355, "step": 795 }, { "epoch": 0.5126388665271293, "grad_norm": 0.45489108808356027, "learning_rate": 9.595266166139366e-06, "loss": 0.5176, "step": 796 }, { "epoch": 0.5132828852036709, "grad_norm": 0.45885792417950677, "learning_rate": 9.57504137457174e-06, "loss": 0.5258, "step": 797 }, { "epoch": 0.5139269038802126, "grad_norm": 0.43424776259718617, "learning_rate": 9.55481832426086e-06, "loss": 0.5279, "step": 798 }, { "epoch": 0.5145709225567542, "grad_norm": 0.4618051731181909, "learning_rate": 9.53459709807014e-06, "loss": 0.5291, "step": 799 }, { "epoch": 0.5152149412332958, "grad_norm": 0.38576602115706526, "learning_rate": 9.514377778855521e-06, "loss": 0.5078, "step": 800 }, { "epoch": 0.5158589599098374, "grad_norm": 0.4284676087300364, "learning_rate": 9.494160449465123e-06, "loss": 0.5118, "step": 801 }, { "epoch": 0.5165029785863791, "grad_norm": 0.4192222108891612, "learning_rate": 9.473945192738933e-06, "loss": 0.5028, "step": 802 }, { "epoch": 0.5171469972629206, "grad_norm": 0.4285625995349231, "learning_rate": 9.45373209150842e-06, "loss": 0.4966, "step": 803 }, { "epoch": 0.5177910159394622, "grad_norm": 0.43936775557929597, "learning_rate": 9.433521228596237e-06, "loss": 0.5044, "step": 804 }, { "epoch": 0.5184350346160038, "grad_norm": 0.43931927848214275, "learning_rate": 9.41331268681586e-06, "loss": 0.5132, "step": 805 }, { "epoch": 0.5190790532925454, "grad_norm": 0.4102503922949785, "learning_rate": 9.393106548971257e-06, "loss": 0.5104, "step": 806 }, { "epoch": 0.5197230719690871, "grad_norm": 0.5058029915941765, "learning_rate": 9.372902897856542e-06, "loss": 0.4891, "step": 807 }, { "epoch": 0.5203670906456287, "grad_norm": 0.44634316585243883, "learning_rate": 9.352701816255643e-06, "loss": 0.5139, "step": 808 }, { "epoch": 0.5210111093221703, "grad_norm": 0.46046635604340425, "learning_rate": 9.332503386941958e-06, "loss": 0.5236, "step": 809 }, { "epoch": 0.5216551279987119, "grad_norm": 0.48227072122761605, "learning_rate": 9.312307692678016e-06, "loss": 0.5284, "step": 810 }, { "epoch": 0.5222991466752536, "grad_norm": 0.4512522964567395, "learning_rate": 9.292114816215145e-06, "loss": 0.5236, "step": 811 }, { "epoch": 0.5229431653517952, "grad_norm": 0.43925905220228945, "learning_rate": 9.27192484029312e-06, "loss": 0.5056, "step": 812 }, { "epoch": 0.5235871840283368, "grad_norm": 0.4610778085257201, "learning_rate": 9.251737847639834e-06, "loss": 0.5255, "step": 813 }, { "epoch": 0.5242312027048784, "grad_norm": 0.4470322826335541, "learning_rate": 9.231553920970958e-06, "loss": 0.5087, "step": 814 }, { "epoch": 0.5248752213814201, "grad_norm": 0.4815516280982625, "learning_rate": 9.2113731429896e-06, "loss": 0.508, "step": 815 }, { "epoch": 0.5255192400579617, "grad_norm": 0.4131494671879515, "learning_rate": 9.19119559638596e-06, "loss": 0.5274, "step": 816 }, { "epoch": 0.5261632587345033, "grad_norm": 0.49538063048171505, "learning_rate": 9.171021363837005e-06, "loss": 0.5181, "step": 817 }, { "epoch": 0.5268072774110449, "grad_norm": 0.4229909033316674, "learning_rate": 9.150850528006118e-06, "loss": 0.5229, "step": 818 }, { "epoch": 0.5274512960875866, "grad_norm": 0.39765337738220996, "learning_rate": 9.130683171542772e-06, "loss": 0.5, "step": 819 }, { "epoch": 0.5280953147641282, "grad_norm": 0.4383394688456195, "learning_rate": 9.110519377082174e-06, "loss": 0.5186, "step": 820 }, { "epoch": 0.5287393334406698, "grad_norm": 0.4099664513529314, "learning_rate": 9.090359227244936e-06, "loss": 0.4902, "step": 821 }, { "epoch": 0.5293833521172114, "grad_norm": 0.449849599649535, "learning_rate": 9.070202804636745e-06, "loss": 0.507, "step": 822 }, { "epoch": 0.5300273707937531, "grad_norm": 0.42135219523641654, "learning_rate": 9.050050191848006e-06, "loss": 0.5297, "step": 823 }, { "epoch": 0.5306713894702947, "grad_norm": 0.4543533891258066, "learning_rate": 9.02990147145352e-06, "loss": 0.5366, "step": 824 }, { "epoch": 0.5313154081468363, "grad_norm": 0.4867645384533659, "learning_rate": 9.009756726012138e-06, "loss": 0.5068, "step": 825 }, { "epoch": 0.5319594268233778, "grad_norm": 0.4195966184963643, "learning_rate": 8.98961603806642e-06, "loss": 0.5003, "step": 826 }, { "epoch": 0.5326034454999194, "grad_norm": 0.5247886282014729, "learning_rate": 8.969479490142302e-06, "loss": 0.5215, "step": 827 }, { "epoch": 0.5332474641764611, "grad_norm": 0.45543552208863797, "learning_rate": 8.949347164748761e-06, "loss": 0.5343, "step": 828 }, { "epoch": 0.5338914828530027, "grad_norm": 0.41796245406115307, "learning_rate": 8.929219144377468e-06, "loss": 0.5142, "step": 829 }, { "epoch": 0.5345355015295443, "grad_norm": 0.48209703911863444, "learning_rate": 8.909095511502452e-06, "loss": 0.5133, "step": 830 }, { "epoch": 0.535179520206086, "grad_norm": 0.5013872953480938, "learning_rate": 8.88897634857977e-06, "loss": 0.4988, "step": 831 }, { "epoch": 0.5358235388826276, "grad_norm": 0.43204604873846925, "learning_rate": 8.868861738047158e-06, "loss": 0.5195, "step": 832 }, { "epoch": 0.5364675575591692, "grad_norm": 0.5531683521163371, "learning_rate": 8.848751762323705e-06, "loss": 0.5129, "step": 833 }, { "epoch": 0.5371115762357108, "grad_norm": 0.4385324883821049, "learning_rate": 8.828646503809505e-06, "loss": 0.5197, "step": 834 }, { "epoch": 0.5377555949122524, "grad_norm": 0.44028674055573347, "learning_rate": 8.80854604488532e-06, "loss": 0.5099, "step": 835 }, { "epoch": 0.5383996135887941, "grad_norm": 0.5001567551685764, "learning_rate": 8.788450467912254e-06, "loss": 0.503, "step": 836 }, { "epoch": 0.5390436322653357, "grad_norm": 0.3868872468426976, "learning_rate": 8.7683598552314e-06, "loss": 0.5044, "step": 837 }, { "epoch": 0.5396876509418773, "grad_norm": 0.49836753786671817, "learning_rate": 8.748274289163514e-06, "loss": 0.5136, "step": 838 }, { "epoch": 0.5403316696184189, "grad_norm": 0.40324649568364546, "learning_rate": 8.728193852008674e-06, "loss": 0.5202, "step": 839 }, { "epoch": 0.5409756882949606, "grad_norm": 0.39944592911099197, "learning_rate": 8.708118626045939e-06, "loss": 0.5103, "step": 840 }, { "epoch": 0.5416197069715022, "grad_norm": 0.43991517104664113, "learning_rate": 8.688048693533017e-06, "loss": 0.502, "step": 841 }, { "epoch": 0.5422637256480438, "grad_norm": 0.45657412796298086, "learning_rate": 8.667984136705927e-06, "loss": 0.4914, "step": 842 }, { "epoch": 0.5429077443245854, "grad_norm": 0.39691385658710554, "learning_rate": 8.647925037778663e-06, "loss": 0.5142, "step": 843 }, { "epoch": 0.5435517630011271, "grad_norm": 0.4180382682744404, "learning_rate": 8.62787147894285e-06, "loss": 0.5254, "step": 844 }, { "epoch": 0.5441957816776687, "grad_norm": 0.3980418528367154, "learning_rate": 8.607823542367418e-06, "loss": 0.5267, "step": 845 }, { "epoch": 0.5448398003542103, "grad_norm": 0.4246103516389637, "learning_rate": 8.587781310198253e-06, "loss": 0.4933, "step": 846 }, { "epoch": 0.5454838190307519, "grad_norm": 0.4874791227762071, "learning_rate": 8.567744864557879e-06, "loss": 0.5281, "step": 847 }, { "epoch": 0.5461278377072936, "grad_norm": 0.40313420787652304, "learning_rate": 8.5477142875451e-06, "loss": 0.5219, "step": 848 }, { "epoch": 0.5467718563838352, "grad_norm": 0.41756495731283305, "learning_rate": 8.52768966123467e-06, "loss": 0.5106, "step": 849 }, { "epoch": 0.5474158750603767, "grad_norm": 0.4538523105638427, "learning_rate": 8.50767106767698e-06, "loss": 0.4821, "step": 850 }, { "epoch": 0.5480598937369183, "grad_norm": 0.422360890891248, "learning_rate": 8.487658588897676e-06, "loss": 0.4928, "step": 851 }, { "epoch": 0.54870391241346, "grad_norm": 0.4438050592657065, "learning_rate": 8.46765230689737e-06, "loss": 0.5157, "step": 852 }, { "epoch": 0.5493479310900016, "grad_norm": 0.4638417511522228, "learning_rate": 8.447652303651267e-06, "loss": 0.5216, "step": 853 }, { "epoch": 0.5499919497665432, "grad_norm": 0.38899107372640973, "learning_rate": 8.427658661108857e-06, "loss": 0.5007, "step": 854 }, { "epoch": 0.5506359684430848, "grad_norm": 0.3837361016120794, "learning_rate": 8.407671461193561e-06, "loss": 0.5169, "step": 855 }, { "epoch": 0.5512799871196264, "grad_norm": 0.3883473071605957, "learning_rate": 8.387690785802403e-06, "loss": 0.5048, "step": 856 }, { "epoch": 0.5519240057961681, "grad_norm": 0.4166540881068044, "learning_rate": 8.36771671680567e-06, "loss": 0.532, "step": 857 }, { "epoch": 0.5525680244727097, "grad_norm": 0.4509692910741587, "learning_rate": 8.347749336046587e-06, "loss": 0.5092, "step": 858 }, { "epoch": 0.5532120431492513, "grad_norm": 0.44089420197375556, "learning_rate": 8.327788725340966e-06, "loss": 0.5143, "step": 859 }, { "epoch": 0.553856061825793, "grad_norm": 0.3823864525186842, "learning_rate": 8.307834966476885e-06, "loss": 0.4883, "step": 860 }, { "epoch": 0.5545000805023346, "grad_norm": 0.4474458813331274, "learning_rate": 8.287888141214339e-06, "loss": 0.5107, "step": 861 }, { "epoch": 0.5551440991788762, "grad_norm": 0.4288472842006267, "learning_rate": 8.267948331284923e-06, "loss": 0.5181, "step": 862 }, { "epoch": 0.5557881178554178, "grad_norm": 0.40601384329623963, "learning_rate": 8.24801561839148e-06, "loss": 0.5333, "step": 863 }, { "epoch": 0.5564321365319594, "grad_norm": 0.39550652719208357, "learning_rate": 8.228090084207773e-06, "loss": 0.5356, "step": 864 }, { "epoch": 0.5570761552085011, "grad_norm": 0.41930822196868667, "learning_rate": 8.208171810378155e-06, "loss": 0.5024, "step": 865 }, { "epoch": 0.5577201738850427, "grad_norm": 0.44920482734210765, "learning_rate": 8.188260878517224e-06, "loss": 0.5162, "step": 866 }, { "epoch": 0.5583641925615843, "grad_norm": 0.41462280510360283, "learning_rate": 8.168357370209499e-06, "loss": 0.5084, "step": 867 }, { "epoch": 0.5590082112381259, "grad_norm": 0.40831509895413226, "learning_rate": 8.148461367009081e-06, "loss": 0.5222, "step": 868 }, { "epoch": 0.5596522299146676, "grad_norm": 0.43107999146770515, "learning_rate": 8.128572950439314e-06, "loss": 0.5364, "step": 869 }, { "epoch": 0.5602962485912092, "grad_norm": 0.4306922304584797, "learning_rate": 8.108692201992466e-06, "loss": 0.5155, "step": 870 }, { "epoch": 0.5609402672677508, "grad_norm": 0.38657327949254416, "learning_rate": 8.08881920312937e-06, "loss": 0.5144, "step": 871 }, { "epoch": 0.5615842859442924, "grad_norm": 0.4630539509839453, "learning_rate": 8.068954035279121e-06, "loss": 0.521, "step": 872 }, { "epoch": 0.562228304620834, "grad_norm": 0.486333938626765, "learning_rate": 8.04909677983872e-06, "loss": 0.5146, "step": 873 }, { "epoch": 0.5628723232973756, "grad_norm": 0.3954457763201425, "learning_rate": 8.02924751817274e-06, "loss": 0.5023, "step": 874 }, { "epoch": 0.5635163419739172, "grad_norm": 0.42082847199490564, "learning_rate": 8.009406331613014e-06, "loss": 0.5139, "step": 875 }, { "epoch": 0.5641603606504588, "grad_norm": 0.49509513938249344, "learning_rate": 7.989573301458274e-06, "loss": 0.5152, "step": 876 }, { "epoch": 0.5648043793270004, "grad_norm": 0.403875615131615, "learning_rate": 7.969748508973842e-06, "loss": 0.522, "step": 877 }, { "epoch": 0.5654483980035421, "grad_norm": 0.41040986743906105, "learning_rate": 7.949932035391279e-06, "loss": 0.5177, "step": 878 }, { "epoch": 0.5660924166800837, "grad_norm": 0.5147698704382437, "learning_rate": 7.930123961908062e-06, "loss": 0.4947, "step": 879 }, { "epoch": 0.5667364353566253, "grad_norm": 0.4249466303248379, "learning_rate": 7.91032436968725e-06, "loss": 0.497, "step": 880 }, { "epoch": 0.567380454033167, "grad_norm": 0.3973005346461778, "learning_rate": 7.89053333985715e-06, "loss": 0.5008, "step": 881 }, { "epoch": 0.5680244727097086, "grad_norm": 0.4130608043194774, "learning_rate": 7.870750953510983e-06, "loss": 0.5111, "step": 882 }, { "epoch": 0.5686684913862502, "grad_norm": 0.4468696454154296, "learning_rate": 7.850977291706557e-06, "loss": 0.525, "step": 883 }, { "epoch": 0.5693125100627918, "grad_norm": 0.4200347152051154, "learning_rate": 7.831212435465925e-06, "loss": 0.4911, "step": 884 }, { "epoch": 0.5699565287393334, "grad_norm": 0.47920401888187875, "learning_rate": 7.811456465775064e-06, "loss": 0.5157, "step": 885 }, { "epoch": 0.5706005474158751, "grad_norm": 0.49966788850257765, "learning_rate": 7.791709463583541e-06, "loss": 0.5205, "step": 886 }, { "epoch": 0.5712445660924167, "grad_norm": 0.537893423307389, "learning_rate": 7.771971509804174e-06, "loss": 0.5075, "step": 887 }, { "epoch": 0.5718885847689583, "grad_norm": 0.39093711938721637, "learning_rate": 7.752242685312709e-06, "loss": 0.5149, "step": 888 }, { "epoch": 0.5725326034454999, "grad_norm": 0.467949816142811, "learning_rate": 7.732523070947482e-06, "loss": 0.5024, "step": 889 }, { "epoch": 0.5731766221220416, "grad_norm": 0.5072525483202748, "learning_rate": 7.712812747509091e-06, "loss": 0.5055, "step": 890 }, { "epoch": 0.5738206407985832, "grad_norm": 0.36900383333034964, "learning_rate": 7.693111795760067e-06, "loss": 0.5045, "step": 891 }, { "epoch": 0.5744646594751248, "grad_norm": 0.6805045677779038, "learning_rate": 7.673420296424541e-06, "loss": 0.5185, "step": 892 }, { "epoch": 0.5751086781516664, "grad_norm": 0.4371167751386617, "learning_rate": 7.653738330187912e-06, "loss": 0.5051, "step": 893 }, { "epoch": 0.5757526968282081, "grad_norm": 0.4074967225089485, "learning_rate": 7.63406597769651e-06, "loss": 0.5015, "step": 894 }, { "epoch": 0.5763967155047497, "grad_norm": 0.42566754977691934, "learning_rate": 7.6144033195572886e-06, "loss": 0.506, "step": 895 }, { "epoch": 0.5770407341812912, "grad_norm": 0.4398958034165476, "learning_rate": 7.594750436337467e-06, "loss": 0.4998, "step": 896 }, { "epoch": 0.5776847528578328, "grad_norm": 0.433744482012835, "learning_rate": 7.575107408564214e-06, "loss": 0.5235, "step": 897 }, { "epoch": 0.5783287715343745, "grad_norm": 0.45400810742463893, "learning_rate": 7.5554743167243135e-06, "loss": 0.5299, "step": 898 }, { "epoch": 0.5789727902109161, "grad_norm": 0.3955036145112688, "learning_rate": 7.53585124126385e-06, "loss": 0.521, "step": 899 }, { "epoch": 0.5796168088874577, "grad_norm": 0.4251265953575393, "learning_rate": 7.516238262587851e-06, "loss": 0.4977, "step": 900 }, { "epoch": 0.5802608275639993, "grad_norm": 0.41268745578036303, "learning_rate": 7.49663546105998e-06, "loss": 0.4993, "step": 901 }, { "epoch": 0.580904846240541, "grad_norm": 0.39593337223927977, "learning_rate": 7.4770429170022e-06, "loss": 0.5177, "step": 902 }, { "epoch": 0.5815488649170826, "grad_norm": 0.4180699283533851, "learning_rate": 7.457460710694441e-06, "loss": 0.5024, "step": 903 }, { "epoch": 0.5821928835936242, "grad_norm": 0.4431401082719527, "learning_rate": 7.4378889223742766e-06, "loss": 0.5056, "step": 904 }, { "epoch": 0.5828369022701658, "grad_norm": 0.3982129403025228, "learning_rate": 7.4183276322365925e-06, "loss": 0.5105, "step": 905 }, { "epoch": 0.5834809209467074, "grad_norm": 0.42530721417628914, "learning_rate": 7.398776920433257e-06, "loss": 0.5272, "step": 906 }, { "epoch": 0.5841249396232491, "grad_norm": 0.3917595403966409, "learning_rate": 7.379236867072799e-06, "loss": 0.489, "step": 907 }, { "epoch": 0.5847689582997907, "grad_norm": 0.4457839704795384, "learning_rate": 7.35970755222007e-06, "loss": 0.5143, "step": 908 }, { "epoch": 0.5854129769763323, "grad_norm": 0.37820712228724535, "learning_rate": 7.34018905589592e-06, "loss": 0.5057, "step": 909 }, { "epoch": 0.586056995652874, "grad_norm": 0.43949589330680233, "learning_rate": 7.320681458076871e-06, "loss": 0.5174, "step": 910 }, { "epoch": 0.5867010143294156, "grad_norm": 0.42858142149245354, "learning_rate": 7.301184838694795e-06, "loss": 0.511, "step": 911 }, { "epoch": 0.5873450330059572, "grad_norm": 0.4582955216418432, "learning_rate": 7.2816992776365714e-06, "loss": 0.5088, "step": 912 }, { "epoch": 0.5879890516824988, "grad_norm": 0.4020264117015481, "learning_rate": 7.262224854743773e-06, "loss": 0.4963, "step": 913 }, { "epoch": 0.5886330703590404, "grad_norm": 0.3759233993171858, "learning_rate": 7.2427616498123356e-06, "loss": 0.4941, "step": 914 }, { "epoch": 0.5892770890355821, "grad_norm": 0.3655211630322718, "learning_rate": 7.223309742592221e-06, "loss": 0.4855, "step": 915 }, { "epoch": 0.5899211077121237, "grad_norm": 0.431985565670458, "learning_rate": 7.203869212787112e-06, "loss": 0.5144, "step": 916 }, { "epoch": 0.5905651263886653, "grad_norm": 0.39770699418220345, "learning_rate": 7.184440140054063e-06, "loss": 0.5243, "step": 917 }, { "epoch": 0.5912091450652069, "grad_norm": 0.40872023696943854, "learning_rate": 7.165022604003187e-06, "loss": 0.5273, "step": 918 }, { "epoch": 0.5918531637417485, "grad_norm": 0.4246329720213213, "learning_rate": 7.145616684197325e-06, "loss": 0.535, "step": 919 }, { "epoch": 0.5924971824182901, "grad_norm": 0.4021278350416659, "learning_rate": 7.126222460151719e-06, "loss": 0.5049, "step": 920 }, { "epoch": 0.5931412010948317, "grad_norm": 0.41079090242726113, "learning_rate": 7.106840011333694e-06, "loss": 0.5188, "step": 921 }, { "epoch": 0.5937852197713733, "grad_norm": 0.40268135332721233, "learning_rate": 7.08746941716232e-06, "loss": 0.5082, "step": 922 }, { "epoch": 0.594429238447915, "grad_norm": 0.40424849642243943, "learning_rate": 7.068110757008095e-06, "loss": 0.5205, "step": 923 }, { "epoch": 0.5950732571244566, "grad_norm": 0.44238171348907684, "learning_rate": 7.048764110192618e-06, "loss": 0.5154, "step": 924 }, { "epoch": 0.5957172758009982, "grad_norm": 0.4334233889572751, "learning_rate": 7.029429555988263e-06, "loss": 0.5088, "step": 925 }, { "epoch": 0.5963612944775398, "grad_norm": 0.42008233251790184, "learning_rate": 7.010107173617857e-06, "loss": 0.5236, "step": 926 }, { "epoch": 0.5970053131540815, "grad_norm": 0.43723983796067734, "learning_rate": 6.990797042254349e-06, "loss": 0.5095, "step": 927 }, { "epoch": 0.5976493318306231, "grad_norm": 0.4151056344821121, "learning_rate": 6.971499241020495e-06, "loss": 0.5109, "step": 928 }, { "epoch": 0.5982933505071647, "grad_norm": 0.4447360081049652, "learning_rate": 6.952213848988522e-06, "loss": 0.5187, "step": 929 }, { "epoch": 0.5989373691837063, "grad_norm": 0.40393399573416877, "learning_rate": 6.932940945179818e-06, "loss": 0.5014, "step": 930 }, { "epoch": 0.599581387860248, "grad_norm": 0.47167715263290083, "learning_rate": 6.913680608564597e-06, "loss": 0.518, "step": 931 }, { "epoch": 0.6002254065367896, "grad_norm": 0.4663210151494882, "learning_rate": 6.894432918061579e-06, "loss": 0.5151, "step": 932 }, { "epoch": 0.6008694252133312, "grad_norm": 0.4186074570604756, "learning_rate": 6.875197952537667e-06, "loss": 0.4921, "step": 933 }, { "epoch": 0.6015134438898728, "grad_norm": 0.4284199223822939, "learning_rate": 6.855975790807623e-06, "loss": 0.5337, "step": 934 }, { "epoch": 0.6021574625664144, "grad_norm": 0.43387442491176564, "learning_rate": 6.836766511633752e-06, "loss": 0.5127, "step": 935 }, { "epoch": 0.6028014812429561, "grad_norm": 0.44464181459424096, "learning_rate": 6.8175701937255645e-06, "loss": 0.537, "step": 936 }, { "epoch": 0.6034454999194977, "grad_norm": 0.4386523919305499, "learning_rate": 6.798386915739466e-06, "loss": 0.4974, "step": 937 }, { "epoch": 0.6040895185960393, "grad_norm": 0.4125354285539581, "learning_rate": 6.77921675627843e-06, "loss": 0.4984, "step": 938 }, { "epoch": 0.604733537272581, "grad_norm": 0.40934627277495084, "learning_rate": 6.760059793891684e-06, "loss": 0.5199, "step": 939 }, { "epoch": 0.6053775559491226, "grad_norm": 0.42364445344335167, "learning_rate": 6.740916107074372e-06, "loss": 0.5132, "step": 940 }, { "epoch": 0.6060215746256642, "grad_norm": 0.3797061961112294, "learning_rate": 6.7217857742672465e-06, "loss": 0.5024, "step": 941 }, { "epoch": 0.6066655933022058, "grad_norm": 0.4713108835006978, "learning_rate": 6.702668873856339e-06, "loss": 0.5122, "step": 942 }, { "epoch": 0.6073096119787473, "grad_norm": 0.4193321553713539, "learning_rate": 6.683565484172643e-06, "loss": 0.5187, "step": 943 }, { "epoch": 0.607953630655289, "grad_norm": 0.4003903111392671, "learning_rate": 6.664475683491797e-06, "loss": 0.5048, "step": 944 }, { "epoch": 0.6085976493318306, "grad_norm": 0.49874624251405986, "learning_rate": 6.645399550033753e-06, "loss": 0.4852, "step": 945 }, { "epoch": 0.6092416680083722, "grad_norm": 0.3743707220383118, "learning_rate": 6.6263371619624615e-06, "loss": 0.5174, "step": 946 }, { "epoch": 0.6098856866849138, "grad_norm": 0.378268412657302, "learning_rate": 6.607288597385561e-06, "loss": 0.5127, "step": 947 }, { "epoch": 0.6105297053614555, "grad_norm": 0.5162329246358469, "learning_rate": 6.588253934354039e-06, "loss": 0.5244, "step": 948 }, { "epoch": 0.6111737240379971, "grad_norm": 0.45634510933515027, "learning_rate": 6.569233250861924e-06, "loss": 0.5248, "step": 949 }, { "epoch": 0.6118177427145387, "grad_norm": 0.3839207690286241, "learning_rate": 6.550226624845961e-06, "loss": 0.5218, "step": 950 }, { "epoch": 0.6124617613910803, "grad_norm": 0.45451037776921377, "learning_rate": 6.531234134185303e-06, "loss": 0.5075, "step": 951 }, { "epoch": 0.613105780067622, "grad_norm": 0.6597561941697501, "learning_rate": 6.5122558567011775e-06, "loss": 0.5154, "step": 952 }, { "epoch": 0.6137497987441636, "grad_norm": 0.3983752690143602, "learning_rate": 6.493291870156575e-06, "loss": 0.5118, "step": 953 }, { "epoch": 0.6143938174207052, "grad_norm": 0.380868388262707, "learning_rate": 6.474342252255927e-06, "loss": 0.5101, "step": 954 }, { "epoch": 0.6150378360972468, "grad_norm": 0.4843665460672314, "learning_rate": 6.455407080644797e-06, "loss": 0.5156, "step": 955 }, { "epoch": 0.6156818547737885, "grad_norm": 0.42222890238845284, "learning_rate": 6.43648643290955e-06, "loss": 0.5134, "step": 956 }, { "epoch": 0.6163258734503301, "grad_norm": 0.39354614002643484, "learning_rate": 6.4175803865770395e-06, "loss": 0.5026, "step": 957 }, { "epoch": 0.6169698921268717, "grad_norm": 0.4510932433717051, "learning_rate": 6.398689019114289e-06, "loss": 0.5147, "step": 958 }, { "epoch": 0.6176139108034133, "grad_norm": 0.4551804466655937, "learning_rate": 6.379812407928178e-06, "loss": 0.5013, "step": 959 }, { "epoch": 0.618257929479955, "grad_norm": 0.39782931582430364, "learning_rate": 6.360950630365126e-06, "loss": 0.5036, "step": 960 }, { "epoch": 0.6189019481564966, "grad_norm": 0.40902431978224496, "learning_rate": 6.342103763710765e-06, "loss": 0.5225, "step": 961 }, { "epoch": 0.6195459668330382, "grad_norm": 0.402484711835745, "learning_rate": 6.323271885189636e-06, "loss": 0.5134, "step": 962 }, { "epoch": 0.6201899855095798, "grad_norm": 0.41465320252277077, "learning_rate": 6.304455071964861e-06, "loss": 0.5002, "step": 963 }, { "epoch": 0.6208340041861214, "grad_norm": 0.41853041004756064, "learning_rate": 6.2856534011378365e-06, "loss": 0.5215, "step": 964 }, { "epoch": 0.6214780228626631, "grad_norm": 0.38842196572890086, "learning_rate": 6.266866949747914e-06, "loss": 0.5164, "step": 965 }, { "epoch": 0.6221220415392046, "grad_norm": 0.41297533187582736, "learning_rate": 6.24809579477208e-06, "loss": 0.5129, "step": 966 }, { "epoch": 0.6227660602157462, "grad_norm": 0.4510683340827707, "learning_rate": 6.229340013124648e-06, "loss": 0.5152, "step": 967 }, { "epoch": 0.6234100788922878, "grad_norm": 0.4366294476996389, "learning_rate": 6.210599681656933e-06, "loss": 0.5063, "step": 968 }, { "epoch": 0.6240540975688295, "grad_norm": 0.36726687138016023, "learning_rate": 6.191874877156956e-06, "loss": 0.5266, "step": 969 }, { "epoch": 0.6246981162453711, "grad_norm": 0.4436728993414706, "learning_rate": 6.173165676349103e-06, "loss": 0.5192, "step": 970 }, { "epoch": 0.6253421349219127, "grad_norm": 3.5202109478571804, "learning_rate": 6.154472155893833e-06, "loss": 0.522, "step": 971 }, { "epoch": 0.6259861535984543, "grad_norm": 0.47574576531650437, "learning_rate": 6.135794392387353e-06, "loss": 0.492, "step": 972 }, { "epoch": 0.626630172274996, "grad_norm": 0.43530110590203974, "learning_rate": 6.1171324623613016e-06, "loss": 0.5045, "step": 973 }, { "epoch": 0.6272741909515376, "grad_norm": 0.41545061978614345, "learning_rate": 6.09848644228245e-06, "loss": 0.527, "step": 974 }, { "epoch": 0.6279182096280792, "grad_norm": 0.4695072399041791, "learning_rate": 6.079856408552368e-06, "loss": 0.5106, "step": 975 }, { "epoch": 0.6285622283046208, "grad_norm": 0.4608348160542928, "learning_rate": 6.061242437507131e-06, "loss": 0.504, "step": 976 }, { "epoch": 0.6292062469811625, "grad_norm": 0.4262356377630694, "learning_rate": 6.042644605416987e-06, "loss": 0.5128, "step": 977 }, { "epoch": 0.6298502656577041, "grad_norm": 0.4591827263942794, "learning_rate": 6.024062988486072e-06, "loss": 0.4966, "step": 978 }, { "epoch": 0.6304942843342457, "grad_norm": 0.3733192739214609, "learning_rate": 6.005497662852063e-06, "loss": 0.5068, "step": 979 }, { "epoch": 0.6311383030107873, "grad_norm": 0.4212660702768915, "learning_rate": 5.986948704585895e-06, "loss": 0.4899, "step": 980 }, { "epoch": 0.631782321687329, "grad_norm": 0.4445117876479244, "learning_rate": 5.968416189691433e-06, "loss": 0.5301, "step": 981 }, { "epoch": 0.6324263403638706, "grad_norm": 0.38560580104857894, "learning_rate": 5.949900194105167e-06, "loss": 0.5022, "step": 982 }, { "epoch": 0.6330703590404122, "grad_norm": 0.40302036537538505, "learning_rate": 5.9314007936959006e-06, "loss": 0.5321, "step": 983 }, { "epoch": 0.6337143777169538, "grad_norm": 0.4194533455785795, "learning_rate": 5.912918064264441e-06, "loss": 0.5, "step": 984 }, { "epoch": 0.6343583963934954, "grad_norm": 0.43062844615955764, "learning_rate": 5.89445208154328e-06, "loss": 0.4933, "step": 985 }, { "epoch": 0.6350024150700371, "grad_norm": 0.34985119371010265, "learning_rate": 5.876002921196296e-06, "loss": 0.4716, "step": 986 }, { "epoch": 0.6356464337465787, "grad_norm": 0.41353669823246714, "learning_rate": 5.857570658818434e-06, "loss": 0.5206, "step": 987 }, { "epoch": 0.6362904524231203, "grad_norm": 0.4148707108233063, "learning_rate": 5.839155369935407e-06, "loss": 0.5187, "step": 988 }, { "epoch": 0.6369344710996618, "grad_norm": 0.3824168568899086, "learning_rate": 5.820757130003369e-06, "loss": 0.5121, "step": 989 }, { "epoch": 0.6375784897762035, "grad_norm": 0.4214263867642053, "learning_rate": 5.802376014408632e-06, "loss": 0.5081, "step": 990 }, { "epoch": 0.6382225084527451, "grad_norm": 0.4002356789884498, "learning_rate": 5.78401209846732e-06, "loss": 0.5158, "step": 991 }, { "epoch": 0.6388665271292867, "grad_norm": 0.39670027292297144, "learning_rate": 5.765665457425102e-06, "loss": 0.5142, "step": 992 }, { "epoch": 0.6395105458058283, "grad_norm": 0.4154813327247274, "learning_rate": 5.747336166456849e-06, "loss": 0.5235, "step": 993 }, { "epoch": 0.64015456448237, "grad_norm": 0.38337105148189404, "learning_rate": 5.729024300666349e-06, "loss": 0.5072, "step": 994 }, { "epoch": 0.6407985831589116, "grad_norm": 0.3764304916054927, "learning_rate": 5.71072993508599e-06, "loss": 0.5227, "step": 995 }, { "epoch": 0.6414426018354532, "grad_norm": 0.3938103663424598, "learning_rate": 5.692453144676451e-06, "loss": 0.5049, "step": 996 }, { "epoch": 0.6420866205119948, "grad_norm": 0.40131102860211465, "learning_rate": 5.674194004326403e-06, "loss": 0.5009, "step": 997 }, { "epoch": 0.6427306391885365, "grad_norm": 0.3942669677857399, "learning_rate": 5.655952588852181e-06, "loss": 0.5331, "step": 998 }, { "epoch": 0.6433746578650781, "grad_norm": 0.47733693106293296, "learning_rate": 5.637728972997514e-06, "loss": 0.5296, "step": 999 }, { "epoch": 0.6440186765416197, "grad_norm": 0.3906373109173302, "learning_rate": 5.619523231433177e-06, "loss": 0.5101, "step": 1000 }, { "epoch": 0.6446626952181613, "grad_norm": 0.3937784023792871, "learning_rate": 5.60133543875672e-06, "loss": 0.5069, "step": 1001 }, { "epoch": 0.645306713894703, "grad_norm": 0.37592828051351873, "learning_rate": 5.5831656694921465e-06, "loss": 0.4892, "step": 1002 }, { "epoch": 0.6459507325712446, "grad_norm": 0.37532005418632636, "learning_rate": 5.5650139980895985e-06, "loss": 0.5123, "step": 1003 }, { "epoch": 0.6465947512477862, "grad_norm": 0.41247733150159194, "learning_rate": 5.546880498925079e-06, "loss": 0.5273, "step": 1004 }, { "epoch": 0.6472387699243278, "grad_norm": 0.4284936229087815, "learning_rate": 5.528765246300114e-06, "loss": 0.5123, "step": 1005 }, { "epoch": 0.6478827886008695, "grad_norm": 0.3589060303636662, "learning_rate": 5.510668314441474e-06, "loss": 0.4751, "step": 1006 }, { "epoch": 0.6485268072774111, "grad_norm": 0.4133419387911692, "learning_rate": 5.492589777500868e-06, "loss": 0.5151, "step": 1007 }, { "epoch": 0.6491708259539527, "grad_norm": 0.4147806142221937, "learning_rate": 5.4745297095546125e-06, "loss": 0.5234, "step": 1008 }, { "epoch": 0.6498148446304943, "grad_norm": 0.39889809343979304, "learning_rate": 5.456488184603366e-06, "loss": 0.5166, "step": 1009 }, { "epoch": 0.650458863307036, "grad_norm": 0.4092866649989099, "learning_rate": 5.438465276571796e-06, "loss": 0.5283, "step": 1010 }, { "epoch": 0.6511028819835776, "grad_norm": 0.3983592227338486, "learning_rate": 5.420461059308293e-06, "loss": 0.4972, "step": 1011 }, { "epoch": 0.6517469006601191, "grad_norm": 0.4355885255744418, "learning_rate": 5.40247560658467e-06, "loss": 0.4968, "step": 1012 }, { "epoch": 0.6523909193366607, "grad_norm": 0.41203482618510706, "learning_rate": 5.38450899209583e-06, "loss": 0.5239, "step": 1013 }, { "epoch": 0.6530349380132023, "grad_norm": 0.4331609118487775, "learning_rate": 5.366561289459512e-06, "loss": 0.5015, "step": 1014 }, { "epoch": 0.653678956689744, "grad_norm": 0.4322766012096395, "learning_rate": 5.3486325722159535e-06, "loss": 0.5205, "step": 1015 }, { "epoch": 0.6543229753662856, "grad_norm": 0.40084706398790476, "learning_rate": 5.330722913827594e-06, "loss": 0.4824, "step": 1016 }, { "epoch": 0.6549669940428272, "grad_norm": 0.3671574177958818, "learning_rate": 5.312832387678793e-06, "loss": 0.509, "step": 1017 }, { "epoch": 0.6556110127193688, "grad_norm": 0.41909250278128046, "learning_rate": 5.2949610670755e-06, "loss": 0.5064, "step": 1018 }, { "epoch": 0.6562550313959105, "grad_norm": 0.42817095417264045, "learning_rate": 5.2771090252449855e-06, "loss": 0.5215, "step": 1019 }, { "epoch": 0.6568990500724521, "grad_norm": 0.3997136359031232, "learning_rate": 5.259276335335522e-06, "loss": 0.5082, "step": 1020 }, { "epoch": 0.6575430687489937, "grad_norm": 0.3803249104317911, "learning_rate": 5.241463070416076e-06, "loss": 0.508, "step": 1021 }, { "epoch": 0.6581870874255353, "grad_norm": 0.3677364360087339, "learning_rate": 5.223669303476041e-06, "loss": 0.4847, "step": 1022 }, { "epoch": 0.658831106102077, "grad_norm": 0.44516737243056365, "learning_rate": 5.2058951074248985e-06, "loss": 0.5435, "step": 1023 }, { "epoch": 0.6594751247786186, "grad_norm": 0.38538792831751695, "learning_rate": 5.18814055509195e-06, "loss": 0.5167, "step": 1024 }, { "epoch": 0.6601191434551602, "grad_norm": 0.4255521556207079, "learning_rate": 5.170405719226009e-06, "loss": 0.5041, "step": 1025 }, { "epoch": 0.6607631621317018, "grad_norm": 0.3824922910864398, "learning_rate": 5.152690672495091e-06, "loss": 0.5236, "step": 1026 }, { "epoch": 0.6614071808082435, "grad_norm": 0.37238226389048984, "learning_rate": 5.134995487486139e-06, "loss": 0.5127, "step": 1027 }, { "epoch": 0.6620511994847851, "grad_norm": 0.376199996411082, "learning_rate": 5.117320236704697e-06, "loss": 0.5195, "step": 1028 }, { "epoch": 0.6626952181613267, "grad_norm": 0.4104657132592845, "learning_rate": 5.099664992574645e-06, "loss": 0.5163, "step": 1029 }, { "epoch": 0.6633392368378683, "grad_norm": 0.37941140792257355, "learning_rate": 5.08202982743788e-06, "loss": 0.526, "step": 1030 }, { "epoch": 0.66398325551441, "grad_norm": 0.36904780762204076, "learning_rate": 5.064414813554022e-06, "loss": 0.527, "step": 1031 }, { "epoch": 0.6646272741909516, "grad_norm": 0.37447999182737973, "learning_rate": 5.046820023100129e-06, "loss": 0.4953, "step": 1032 }, { "epoch": 0.6652712928674932, "grad_norm": 0.41951417682063297, "learning_rate": 5.029245528170383e-06, "loss": 0.5048, "step": 1033 }, { "epoch": 0.6659153115440348, "grad_norm": 0.362162936547233, "learning_rate": 5.01169140077582e-06, "loss": 0.506, "step": 1034 }, { "epoch": 0.6665593302205765, "grad_norm": 0.4272799124668392, "learning_rate": 4.9941577128440144e-06, "loss": 0.5025, "step": 1035 }, { "epoch": 0.667203348897118, "grad_norm": 0.3937441161700201, "learning_rate": 4.976644536218783e-06, "loss": 0.5048, "step": 1036 }, { "epoch": 0.6678473675736596, "grad_norm": 0.37833191342426914, "learning_rate": 4.959151942659911e-06, "loss": 0.512, "step": 1037 }, { "epoch": 0.6684913862502012, "grad_norm": 0.3724234475343195, "learning_rate": 4.9416800038428326e-06, "loss": 0.5166, "step": 1038 }, { "epoch": 0.6691354049267428, "grad_norm": 0.3800672410825139, "learning_rate": 4.924228791358358e-06, "loss": 0.4922, "step": 1039 }, { "epoch": 0.6697794236032845, "grad_norm": 0.459917432219705, "learning_rate": 4.9067983767123736e-06, "loss": 0.5059, "step": 1040 }, { "epoch": 0.6704234422798261, "grad_norm": 0.38247721736103096, "learning_rate": 4.889388831325537e-06, "loss": 0.5012, "step": 1041 }, { "epoch": 0.6710674609563677, "grad_norm": 0.3710038855903467, "learning_rate": 4.872000226533001e-06, "loss": 0.5002, "step": 1042 }, { "epoch": 0.6717114796329093, "grad_norm": 0.41634431614963463, "learning_rate": 4.854632633584118e-06, "loss": 0.5193, "step": 1043 }, { "epoch": 0.672355498309451, "grad_norm": 0.3909835662877898, "learning_rate": 4.837286123642141e-06, "loss": 0.5138, "step": 1044 }, { "epoch": 0.6729995169859926, "grad_norm": 0.3475520460050808, "learning_rate": 4.819960767783939e-06, "loss": 0.5195, "step": 1045 }, { "epoch": 0.6736435356625342, "grad_norm": 0.39999189096904436, "learning_rate": 4.802656636999693e-06, "loss": 0.5172, "step": 1046 }, { "epoch": 0.6742875543390758, "grad_norm": 0.36261802148059064, "learning_rate": 4.7853738021926284e-06, "loss": 0.5039, "step": 1047 }, { "epoch": 0.6749315730156175, "grad_norm": 0.3982679456733602, "learning_rate": 4.7681123341787e-06, "loss": 0.5105, "step": 1048 }, { "epoch": 0.6755755916921591, "grad_norm": 0.35190904069604106, "learning_rate": 4.750872303686317e-06, "loss": 0.492, "step": 1049 }, { "epoch": 0.6762196103687007, "grad_norm": 0.3834189567027512, "learning_rate": 4.733653781356055e-06, "loss": 0.519, "step": 1050 }, { "epoch": 0.6768636290452423, "grad_norm": 0.3688826531168592, "learning_rate": 4.716456837740347e-06, "loss": 0.5196, "step": 1051 }, { "epoch": 0.677507647721784, "grad_norm": 0.4296783330178615, "learning_rate": 4.699281543303222e-06, "loss": 0.5321, "step": 1052 }, { "epoch": 0.6781516663983256, "grad_norm": 0.3677581912046351, "learning_rate": 4.68212796841999e-06, "loss": 0.483, "step": 1053 }, { "epoch": 0.6787956850748672, "grad_norm": 0.33946063596162, "learning_rate": 4.664996183376972e-06, "loss": 0.5113, "step": 1054 }, { "epoch": 0.6794397037514088, "grad_norm": 0.38782577314443323, "learning_rate": 4.6478862583712096e-06, "loss": 0.5142, "step": 1055 }, { "epoch": 0.6800837224279505, "grad_norm": 0.3653875655089823, "learning_rate": 4.630798263510162e-06, "loss": 0.5002, "step": 1056 }, { "epoch": 0.6807277411044921, "grad_norm": 0.40427151687636464, "learning_rate": 4.613732268811444e-06, "loss": 0.5068, "step": 1057 }, { "epoch": 0.6813717597810337, "grad_norm": 0.3782500131720935, "learning_rate": 4.596688344202509e-06, "loss": 0.5294, "step": 1058 }, { "epoch": 0.6820157784575752, "grad_norm": 0.3989257373619616, "learning_rate": 4.579666559520395e-06, "loss": 0.4981, "step": 1059 }, { "epoch": 0.6826597971341168, "grad_norm": 0.37702471164896123, "learning_rate": 4.562666984511416e-06, "loss": 0.5108, "step": 1060 }, { "epoch": 0.6833038158106585, "grad_norm": 0.3428232519257165, "learning_rate": 4.545689688830877e-06, "loss": 0.497, "step": 1061 }, { "epoch": 0.6839478344872001, "grad_norm": 0.36679063615614516, "learning_rate": 4.528734742042803e-06, "loss": 0.5207, "step": 1062 }, { "epoch": 0.6845918531637417, "grad_norm": 0.3887535630859325, "learning_rate": 4.511802213619635e-06, "loss": 0.5235, "step": 1063 }, { "epoch": 0.6852358718402833, "grad_norm": 0.3908045793500379, "learning_rate": 4.494892172941965e-06, "loss": 0.5223, "step": 1064 }, { "epoch": 0.685879890516825, "grad_norm": 0.39013929933705915, "learning_rate": 4.478004689298241e-06, "loss": 0.5225, "step": 1065 }, { "epoch": 0.6865239091933666, "grad_norm": 0.3704716277115631, "learning_rate": 4.461139831884475e-06, "loss": 0.5097, "step": 1066 }, { "epoch": 0.6871679278699082, "grad_norm": 0.39408117095716655, "learning_rate": 4.444297669803981e-06, "loss": 0.5466, "step": 1067 }, { "epoch": 0.6878119465464498, "grad_norm": 0.368852276994511, "learning_rate": 4.427478272067066e-06, "loss": 0.5063, "step": 1068 }, { "epoch": 0.6884559652229915, "grad_norm": 0.34518711357809134, "learning_rate": 4.410681707590774e-06, "loss": 0.5015, "step": 1069 }, { "epoch": 0.6890999838995331, "grad_norm": 0.34767643121450964, "learning_rate": 4.393908045198585e-06, "loss": 0.5035, "step": 1070 }, { "epoch": 0.6897440025760747, "grad_norm": 0.40751559010001726, "learning_rate": 4.3771573536201314e-06, "loss": 0.4933, "step": 1071 }, { "epoch": 0.6903880212526163, "grad_norm": 0.3787889278682592, "learning_rate": 4.360429701490935e-06, "loss": 0.5111, "step": 1072 }, { "epoch": 0.691032039929158, "grad_norm": 0.37085716536512725, "learning_rate": 4.34372515735211e-06, "loss": 0.5194, "step": 1073 }, { "epoch": 0.6916760586056996, "grad_norm": 0.34130707068120697, "learning_rate": 4.327043789650078e-06, "loss": 0.5022, "step": 1074 }, { "epoch": 0.6923200772822412, "grad_norm": 0.3572281100413778, "learning_rate": 4.310385666736311e-06, "loss": 0.4873, "step": 1075 }, { "epoch": 0.6929640959587828, "grad_norm": 0.3742970142894756, "learning_rate": 4.2937508568670194e-06, "loss": 0.5028, "step": 1076 }, { "epoch": 0.6936081146353245, "grad_norm": 0.3823877102925387, "learning_rate": 4.277139428202902e-06, "loss": 0.4887, "step": 1077 }, { "epoch": 0.6942521333118661, "grad_norm": 0.4110079289580979, "learning_rate": 4.260551448808852e-06, "loss": 0.5257, "step": 1078 }, { "epoch": 0.6948961519884077, "grad_norm": 0.44148568934309357, "learning_rate": 4.24398698665367e-06, "loss": 0.5135, "step": 1079 }, { "epoch": 0.6955401706649493, "grad_norm": 0.3488998388639505, "learning_rate": 4.2274461096098085e-06, "loss": 0.4938, "step": 1080 }, { "epoch": 0.696184189341491, "grad_norm": 0.3938008406786717, "learning_rate": 4.210928885453068e-06, "loss": 0.5271, "step": 1081 }, { "epoch": 0.6968282080180325, "grad_norm": 0.3491210056073957, "learning_rate": 4.194435381862343e-06, "loss": 0.4906, "step": 1082 }, { "epoch": 0.6974722266945741, "grad_norm": 0.41359159116626465, "learning_rate": 4.17796566641933e-06, "loss": 0.4992, "step": 1083 }, { "epoch": 0.6981162453711157, "grad_norm": 0.40036442028616337, "learning_rate": 4.1615198066082475e-06, "loss": 0.5167, "step": 1084 }, { "epoch": 0.6987602640476573, "grad_norm": 0.40596425541509396, "learning_rate": 4.145097869815579e-06, "loss": 0.505, "step": 1085 }, { "epoch": 0.699404282724199, "grad_norm": 0.3668306035169273, "learning_rate": 4.12869992332977e-06, "loss": 0.5199, "step": 1086 }, { "epoch": 0.7000483014007406, "grad_norm": 0.3839673132959655, "learning_rate": 4.112326034340975e-06, "loss": 0.5056, "step": 1087 }, { "epoch": 0.7006923200772822, "grad_norm": 0.413522365459507, "learning_rate": 4.095976269940777e-06, "loss": 0.5016, "step": 1088 }, { "epoch": 0.7013363387538238, "grad_norm": 0.45847964140624897, "learning_rate": 4.079650697121895e-06, "loss": 0.4868, "step": 1089 }, { "epoch": 0.7019803574303655, "grad_norm": 0.3602909581472877, "learning_rate": 4.0633493827779425e-06, "loss": 0.4966, "step": 1090 }, { "epoch": 0.7026243761069071, "grad_norm": 0.37610910023992694, "learning_rate": 4.047072393703115e-06, "loss": 0.4898, "step": 1091 }, { "epoch": 0.7032683947834487, "grad_norm": 0.4168288351214389, "learning_rate": 4.03081979659195e-06, "loss": 0.4965, "step": 1092 }, { "epoch": 0.7039124134599903, "grad_norm": 0.4263790885331947, "learning_rate": 4.0145916580390335e-06, "loss": 0.5237, "step": 1093 }, { "epoch": 0.704556432136532, "grad_norm": 0.38309200369207885, "learning_rate": 3.998388044538737e-06, "loss": 0.5028, "step": 1094 }, { "epoch": 0.7052004508130736, "grad_norm": 0.3940650330700949, "learning_rate": 3.98220902248494e-06, "loss": 0.539, "step": 1095 }, { "epoch": 0.7058444694896152, "grad_norm": 0.3532996281721323, "learning_rate": 3.966054658170754e-06, "loss": 0.512, "step": 1096 }, { "epoch": 0.7064884881661568, "grad_norm": 0.3655456740316186, "learning_rate": 3.949925017788261e-06, "loss": 0.5013, "step": 1097 }, { "epoch": 0.7071325068426985, "grad_norm": 0.43165005862917005, "learning_rate": 3.933820167428241e-06, "loss": 0.5134, "step": 1098 }, { "epoch": 0.7077765255192401, "grad_norm": 0.4471382835434644, "learning_rate": 3.917740173079886e-06, "loss": 0.5231, "step": 1099 }, { "epoch": 0.7084205441957817, "grad_norm": 0.3530081192416579, "learning_rate": 3.901685100630554e-06, "loss": 0.4888, "step": 1100 }, { "epoch": 0.7090645628723233, "grad_norm": 0.3421206001903985, "learning_rate": 3.885655015865477e-06, "loss": 0.4952, "step": 1101 }, { "epoch": 0.709708581548865, "grad_norm": 0.38156543828456924, "learning_rate": 3.869649984467504e-06, "loss": 0.5261, "step": 1102 }, { "epoch": 0.7103526002254066, "grad_norm": 0.36818584779598, "learning_rate": 3.853670072016833e-06, "loss": 0.5052, "step": 1103 }, { "epoch": 0.7109966189019482, "grad_norm": 0.357608156935585, "learning_rate": 3.837715343990727e-06, "loss": 0.5089, "step": 1104 }, { "epoch": 0.7116406375784897, "grad_norm": 0.37135212043750754, "learning_rate": 3.821785865763269e-06, "loss": 0.4909, "step": 1105 }, { "epoch": 0.7122846562550313, "grad_norm": 0.34365380988134014, "learning_rate": 3.8058817026050676e-06, "loss": 0.5066, "step": 1106 }, { "epoch": 0.712928674931573, "grad_norm": 0.36867952991337855, "learning_rate": 3.7900029196830167e-06, "loss": 0.5132, "step": 1107 }, { "epoch": 0.7135726936081146, "grad_norm": 0.40252478689336696, "learning_rate": 3.7741495820600128e-06, "loss": 0.502, "step": 1108 }, { "epoch": 0.7142167122846562, "grad_norm": 0.39032443979865866, "learning_rate": 3.7583217546946805e-06, "loss": 0.4943, "step": 1109 }, { "epoch": 0.7148607309611978, "grad_norm": 0.3405554106851177, "learning_rate": 3.742519502441132e-06, "loss": 0.509, "step": 1110 }, { "epoch": 0.7155047496377395, "grad_norm": 0.3750678341067509, "learning_rate": 3.726742890048671e-06, "loss": 0.5022, "step": 1111 }, { "epoch": 0.7161487683142811, "grad_norm": 0.3752498583635196, "learning_rate": 3.7109919821615546e-06, "loss": 0.5146, "step": 1112 }, { "epoch": 0.7167927869908227, "grad_norm": 0.3741366413635315, "learning_rate": 3.6952668433187145e-06, "loss": 0.502, "step": 1113 }, { "epoch": 0.7174368056673643, "grad_norm": 0.4177268987160756, "learning_rate": 3.6795675379534857e-06, "loss": 0.5112, "step": 1114 }, { "epoch": 0.718080824343906, "grad_norm": 0.3468952672595873, "learning_rate": 3.663894130393364e-06, "loss": 0.5006, "step": 1115 }, { "epoch": 0.7187248430204476, "grad_norm": 0.34698672322150537, "learning_rate": 3.6482466848597164e-06, "loss": 0.5023, "step": 1116 }, { "epoch": 0.7193688616969892, "grad_norm": 0.35055900883533986, "learning_rate": 3.63262526546754e-06, "loss": 0.5002, "step": 1117 }, { "epoch": 0.7200128803735308, "grad_norm": 0.37299962803165865, "learning_rate": 3.6170299362251926e-06, "loss": 0.5001, "step": 1118 }, { "epoch": 0.7206568990500725, "grad_norm": 0.3425273884960766, "learning_rate": 3.601460761034117e-06, "loss": 0.5062, "step": 1119 }, { "epoch": 0.7213009177266141, "grad_norm": 0.34247016874764075, "learning_rate": 3.585917803688603e-06, "loss": 0.4987, "step": 1120 }, { "epoch": 0.7219449364031557, "grad_norm": 0.36948322869503697, "learning_rate": 3.5704011278755035e-06, "loss": 0.5139, "step": 1121 }, { "epoch": 0.7225889550796973, "grad_norm": 0.40544354769854685, "learning_rate": 3.5549107971739905e-06, "loss": 0.513, "step": 1122 }, { "epoch": 0.723232973756239, "grad_norm": 0.3616762109375749, "learning_rate": 3.539446875055287e-06, "loss": 0.5262, "step": 1123 }, { "epoch": 0.7238769924327806, "grad_norm": 0.38115033180230123, "learning_rate": 3.5240094248824e-06, "loss": 0.4959, "step": 1124 }, { "epoch": 0.7245210111093222, "grad_norm": 0.35104034200865525, "learning_rate": 3.5085985099098753e-06, "loss": 0.4958, "step": 1125 }, { "epoch": 0.7251650297858638, "grad_norm": 0.3619988939246392, "learning_rate": 3.4932141932835362e-06, "loss": 0.5094, "step": 1126 }, { "epoch": 0.7258090484624055, "grad_norm": 0.3686056371556214, "learning_rate": 3.4778565380402064e-06, "loss": 0.5086, "step": 1127 }, { "epoch": 0.7264530671389471, "grad_norm": 0.3460008721659868, "learning_rate": 3.4625256071074776e-06, "loss": 0.4837, "step": 1128 }, { "epoch": 0.7270970858154886, "grad_norm": 0.36827706750788813, "learning_rate": 3.4472214633034295e-06, "loss": 0.4969, "step": 1129 }, { "epoch": 0.7277411044920302, "grad_norm": 0.3592998015734723, "learning_rate": 3.431944169336391e-06, "loss": 0.4937, "step": 1130 }, { "epoch": 0.7283851231685718, "grad_norm": 0.3566027836199043, "learning_rate": 3.4166937878046723e-06, "loss": 0.4956, "step": 1131 }, { "epoch": 0.7290291418451135, "grad_norm": 0.3311043725865018, "learning_rate": 3.4014703811963024e-06, "loss": 0.4898, "step": 1132 }, { "epoch": 0.7296731605216551, "grad_norm": 0.3753634339859225, "learning_rate": 3.386274011888796e-06, "loss": 0.5048, "step": 1133 }, { "epoch": 0.7303171791981967, "grad_norm": 0.36072414972307354, "learning_rate": 3.3711047421488676e-06, "loss": 0.51, "step": 1134 }, { "epoch": 0.7309611978747383, "grad_norm": 0.35821615470543716, "learning_rate": 3.3559626341322027e-06, "loss": 0.5112, "step": 1135 }, { "epoch": 0.73160521655128, "grad_norm": 0.3492357028098777, "learning_rate": 3.3408477498831917e-06, "loss": 0.4998, "step": 1136 }, { "epoch": 0.7322492352278216, "grad_norm": 0.36589819368755466, "learning_rate": 3.325760151334668e-06, "loss": 0.5131, "step": 1137 }, { "epoch": 0.7328932539043632, "grad_norm": 0.36691217712191776, "learning_rate": 3.3106999003076745e-06, "loss": 0.518, "step": 1138 }, { "epoch": 0.7335372725809048, "grad_norm": 0.3860508815466741, "learning_rate": 3.295667058511186e-06, "loss": 0.5099, "step": 1139 }, { "epoch": 0.7341812912574465, "grad_norm": 0.33203677818610017, "learning_rate": 3.280661687541876e-06, "loss": 0.4948, "step": 1140 }, { "epoch": 0.7348253099339881, "grad_norm": 0.36291705601032676, "learning_rate": 3.265683848883859e-06, "loss": 0.5205, "step": 1141 }, { "epoch": 0.7354693286105297, "grad_norm": 0.3668543561517825, "learning_rate": 3.2507336039084315e-06, "loss": 0.509, "step": 1142 }, { "epoch": 0.7361133472870713, "grad_norm": 0.36681140205537255, "learning_rate": 3.2358110138738297e-06, "loss": 0.4952, "step": 1143 }, { "epoch": 0.736757365963613, "grad_norm": 0.37920402445946233, "learning_rate": 3.2209161399249677e-06, "loss": 0.5001, "step": 1144 }, { "epoch": 0.7374013846401546, "grad_norm": 0.3592372843017544, "learning_rate": 3.2060490430932033e-06, "loss": 0.5057, "step": 1145 }, { "epoch": 0.7380454033166962, "grad_norm": 0.35453362680055256, "learning_rate": 3.1912097842960676e-06, "loss": 0.5158, "step": 1146 }, { "epoch": 0.7386894219932378, "grad_norm": 0.3788440585336443, "learning_rate": 3.176398424337035e-06, "loss": 0.513, "step": 1147 }, { "epoch": 0.7393334406697795, "grad_norm": 0.38497526548450667, "learning_rate": 3.1616150239052647e-06, "loss": 0.513, "step": 1148 }, { "epoch": 0.7399774593463211, "grad_norm": 0.38186010022038885, "learning_rate": 3.1468596435753418e-06, "loss": 0.5159, "step": 1149 }, { "epoch": 0.7406214780228627, "grad_norm": 0.3895136662590793, "learning_rate": 3.132132343807056e-06, "loss": 0.5121, "step": 1150 }, { "epoch": 0.7412654966994043, "grad_norm": 0.33982154330205155, "learning_rate": 3.117433184945121e-06, "loss": 0.4892, "step": 1151 }, { "epoch": 0.7419095153759458, "grad_norm": 0.3533546081343906, "learning_rate": 3.1027622272189572e-06, "loss": 0.5161, "step": 1152 }, { "epoch": 0.7425535340524875, "grad_norm": 0.4057690409378272, "learning_rate": 3.0881195307424282e-06, "loss": 0.5417, "step": 1153 }, { "epoch": 0.7431975527290291, "grad_norm": 0.4045767296063783, "learning_rate": 3.073505155513591e-06, "loss": 0.5117, "step": 1154 }, { "epoch": 0.7438415714055707, "grad_norm": 0.34885769453276483, "learning_rate": 3.058919161414463e-06, "loss": 0.5063, "step": 1155 }, { "epoch": 0.7444855900821123, "grad_norm": 0.3537905798028837, "learning_rate": 3.0443616082107753e-06, "loss": 0.52, "step": 1156 }, { "epoch": 0.745129608758654, "grad_norm": 0.3485208706816209, "learning_rate": 3.0298325555517105e-06, "loss": 0.4994, "step": 1157 }, { "epoch": 0.7457736274351956, "grad_norm": 0.3519440566227377, "learning_rate": 3.015332062969685e-06, "loss": 0.5016, "step": 1158 }, { "epoch": 0.7464176461117372, "grad_norm": 0.3653806307908223, "learning_rate": 3.0008601898800772e-06, "loss": 0.4963, "step": 1159 }, { "epoch": 0.7470616647882788, "grad_norm": 0.339628431304075, "learning_rate": 2.9864169955810085e-06, "loss": 0.5041, "step": 1160 }, { "epoch": 0.7477056834648205, "grad_norm": 0.3519223283178697, "learning_rate": 2.972002539253088e-06, "loss": 0.5138, "step": 1161 }, { "epoch": 0.7483497021413621, "grad_norm": 0.34244533985404707, "learning_rate": 2.9576168799591663e-06, "loss": 0.4891, "step": 1162 }, { "epoch": 0.7489937208179037, "grad_norm": 0.3762909561392004, "learning_rate": 2.9432600766441066e-06, "loss": 0.5006, "step": 1163 }, { "epoch": 0.7496377394944453, "grad_norm": 0.3439953482488337, "learning_rate": 2.9289321881345257e-06, "loss": 0.5187, "step": 1164 }, { "epoch": 0.750281758170987, "grad_norm": 0.34571690081695533, "learning_rate": 2.914633273138572e-06, "loss": 0.5092, "step": 1165 }, { "epoch": 0.7509257768475286, "grad_norm": 0.4067810480526117, "learning_rate": 2.900363390245674e-06, "loss": 0.5346, "step": 1166 }, { "epoch": 0.7515697955240702, "grad_norm": 0.36862151128311627, "learning_rate": 2.886122597926294e-06, "loss": 0.5217, "step": 1167 }, { "epoch": 0.7522138142006118, "grad_norm": 0.3540135161607594, "learning_rate": 2.8719109545317102e-06, "loss": 0.5182, "step": 1168 }, { "epoch": 0.7528578328771535, "grad_norm": 0.3527788095027769, "learning_rate": 2.8577285182937477e-06, "loss": 0.5104, "step": 1169 }, { "epoch": 0.7535018515536951, "grad_norm": 0.3398981795300321, "learning_rate": 2.8435753473245697e-06, "loss": 0.4962, "step": 1170 }, { "epoch": 0.7541458702302367, "grad_norm": 0.361752487303256, "learning_rate": 2.8294514996164224e-06, "loss": 0.5086, "step": 1171 }, { "epoch": 0.7547898889067783, "grad_norm": 0.35686620622096304, "learning_rate": 2.8153570330413925e-06, "loss": 0.5016, "step": 1172 }, { "epoch": 0.75543390758332, "grad_norm": 0.3736878023297454, "learning_rate": 2.8012920053511916e-06, "loss": 0.5209, "step": 1173 }, { "epoch": 0.7560779262598616, "grad_norm": 0.379061118132694, "learning_rate": 2.7872564741768917e-06, "loss": 0.4983, "step": 1174 }, { "epoch": 0.7567219449364031, "grad_norm": 0.3697534658165469, "learning_rate": 2.7732504970287154e-06, "loss": 0.5428, "step": 1175 }, { "epoch": 0.7573659636129447, "grad_norm": 0.38155587766822746, "learning_rate": 2.759274131295787e-06, "loss": 0.4849, "step": 1176 }, { "epoch": 0.7580099822894864, "grad_norm": 0.38833166927464186, "learning_rate": 2.7453274342458903e-06, "loss": 0.5135, "step": 1177 }, { "epoch": 0.758654000966028, "grad_norm": 0.37031690744377993, "learning_rate": 2.7314104630252502e-06, "loss": 0.5167, "step": 1178 }, { "epoch": 0.7592980196425696, "grad_norm": 0.35117406608828455, "learning_rate": 2.7175232746582926e-06, "loss": 0.4833, "step": 1179 }, { "epoch": 0.7599420383191112, "grad_norm": 0.3844592583386798, "learning_rate": 2.7036659260473973e-06, "loss": 0.5142, "step": 1180 }, { "epoch": 0.7605860569956528, "grad_norm": 0.36754348532473335, "learning_rate": 2.6898384739726934e-06, "loss": 0.495, "step": 1181 }, { "epoch": 0.7612300756721945, "grad_norm": 0.3451869977489554, "learning_rate": 2.6760409750917925e-06, "loss": 0.4933, "step": 1182 }, { "epoch": 0.7618740943487361, "grad_norm": 0.3752043809390738, "learning_rate": 2.662273485939586e-06, "loss": 0.498, "step": 1183 }, { "epoch": 0.7625181130252777, "grad_norm": 0.35715276547745844, "learning_rate": 2.648536062927999e-06, "loss": 0.5221, "step": 1184 }, { "epoch": 0.7631621317018193, "grad_norm": 0.3534916677723126, "learning_rate": 2.6348287623457534e-06, "loss": 0.4993, "step": 1185 }, { "epoch": 0.763806150378361, "grad_norm": 0.3660423793387728, "learning_rate": 2.6211516403581585e-06, "loss": 0.5126, "step": 1186 }, { "epoch": 0.7644501690549026, "grad_norm": 0.37158606655160825, "learning_rate": 2.6075047530068544e-06, "loss": 0.493, "step": 1187 }, { "epoch": 0.7650941877314442, "grad_norm": 0.3579318050722728, "learning_rate": 2.593888156209603e-06, "loss": 0.4896, "step": 1188 }, { "epoch": 0.7657382064079858, "grad_norm": 0.34838255418113395, "learning_rate": 2.580301905760052e-06, "loss": 0.5068, "step": 1189 }, { "epoch": 0.7663822250845275, "grad_norm": 0.3738316197968873, "learning_rate": 2.5667460573275028e-06, "loss": 0.5078, "step": 1190 }, { "epoch": 0.7670262437610691, "grad_norm": 0.3696366245416866, "learning_rate": 2.5532206664566907e-06, "loss": 0.522, "step": 1191 }, { "epoch": 0.7676702624376107, "grad_norm": 0.3696710537710691, "learning_rate": 2.5397257885675396e-06, "loss": 0.4941, "step": 1192 }, { "epoch": 0.7683142811141523, "grad_norm": 0.3360046065185081, "learning_rate": 2.5262614789549624e-06, "loss": 0.5063, "step": 1193 }, { "epoch": 0.768958299790694, "grad_norm": 0.383147902418144, "learning_rate": 2.512827792788606e-06, "loss": 0.5109, "step": 1194 }, { "epoch": 0.7696023184672356, "grad_norm": 0.36453139811716473, "learning_rate": 2.4994247851126475e-06, "loss": 0.5128, "step": 1195 }, { "epoch": 0.7702463371437772, "grad_norm": 0.3738704388367858, "learning_rate": 2.48605251084556e-06, "loss": 0.494, "step": 1196 }, { "epoch": 0.7708903558203188, "grad_norm": 0.3479562528690192, "learning_rate": 2.472711024779879e-06, "loss": 0.5022, "step": 1197 }, { "epoch": 0.7715343744968605, "grad_norm": 0.35124217500708393, "learning_rate": 2.459400381581997e-06, "loss": 0.4894, "step": 1198 }, { "epoch": 0.772178393173402, "grad_norm": 0.3444177441787963, "learning_rate": 2.4461206357919154e-06, "loss": 0.5087, "step": 1199 }, { "epoch": 0.7728224118499436, "grad_norm": 0.37437895786899544, "learning_rate": 2.432871841823047e-06, "loss": 0.4876, "step": 1200 }, { "epoch": 0.7734664305264852, "grad_norm": 0.36656974038653617, "learning_rate": 2.4196540539619774e-06, "loss": 0.5119, "step": 1201 }, { "epoch": 0.7741104492030269, "grad_norm": 0.3691921301778148, "learning_rate": 2.406467326368237e-06, "loss": 0.4888, "step": 1202 }, { "epoch": 0.7747544678795685, "grad_norm": 0.34817455893454774, "learning_rate": 2.3933117130741e-06, "loss": 0.4864, "step": 1203 }, { "epoch": 0.7753984865561101, "grad_norm": 0.3326292126595736, "learning_rate": 2.3801872679843384e-06, "loss": 0.5189, "step": 1204 }, { "epoch": 0.7760425052326517, "grad_norm": 0.37446477635059455, "learning_rate": 2.367094044876023e-06, "loss": 0.5091, "step": 1205 }, { "epoch": 0.7766865239091933, "grad_norm": 0.3440839699979203, "learning_rate": 2.3540320973982924e-06, "loss": 0.4892, "step": 1206 }, { "epoch": 0.777330542585735, "grad_norm": 0.3674754739904339, "learning_rate": 2.3410014790721245e-06, "loss": 0.5034, "step": 1207 }, { "epoch": 0.7779745612622766, "grad_norm": 0.36807232956646224, "learning_rate": 2.328002243290138e-06, "loss": 0.5219, "step": 1208 }, { "epoch": 0.7786185799388182, "grad_norm": 0.3513505654900653, "learning_rate": 2.3150344433163617e-06, "loss": 0.4783, "step": 1209 }, { "epoch": 0.7792625986153598, "grad_norm": 0.41657665942954514, "learning_rate": 2.3020981322860057e-06, "loss": 0.5153, "step": 1210 }, { "epoch": 0.7799066172919015, "grad_norm": 0.35318788921918814, "learning_rate": 2.2891933632052697e-06, "loss": 0.4989, "step": 1211 }, { "epoch": 0.7805506359684431, "grad_norm": 0.3535026507038893, "learning_rate": 2.2763201889510987e-06, "loss": 0.5008, "step": 1212 }, { "epoch": 0.7811946546449847, "grad_norm": 0.32999230584856604, "learning_rate": 2.263478662270987e-06, "loss": 0.5005, "step": 1213 }, { "epoch": 0.7818386733215263, "grad_norm": 0.3629735206011998, "learning_rate": 2.2506688357827546e-06, "loss": 0.5064, "step": 1214 }, { "epoch": 0.782482691998068, "grad_norm": 0.3618151651702103, "learning_rate": 2.2378907619743196e-06, "loss": 0.5239, "step": 1215 }, { "epoch": 0.7831267106746096, "grad_norm": 0.3626583171744881, "learning_rate": 2.2251444932035094e-06, "loss": 0.494, "step": 1216 }, { "epoch": 0.7837707293511512, "grad_norm": 0.34092098694893, "learning_rate": 2.212430081697816e-06, "loss": 0.4949, "step": 1217 }, { "epoch": 0.7844147480276928, "grad_norm": 0.3541152666367315, "learning_rate": 2.1997475795542113e-06, "loss": 0.5005, "step": 1218 }, { "epoch": 0.7850587667042345, "grad_norm": 0.34401487073483983, "learning_rate": 2.1870970387389124e-06, "loss": 0.4717, "step": 1219 }, { "epoch": 0.7857027853807761, "grad_norm": 0.36528463643884207, "learning_rate": 2.1744785110871713e-06, "loss": 0.5059, "step": 1220 }, { "epoch": 0.7863468040573177, "grad_norm": 0.3948570755908672, "learning_rate": 2.161892048303078e-06, "loss": 0.5153, "step": 1221 }, { "epoch": 0.7869908227338592, "grad_norm": 0.38017456613909834, "learning_rate": 2.149337701959325e-06, "loss": 0.5247, "step": 1222 }, { "epoch": 0.7876348414104009, "grad_norm": 0.33519617588926603, "learning_rate": 2.136815523497019e-06, "loss": 0.4862, "step": 1223 }, { "epoch": 0.7882788600869425, "grad_norm": 0.3367659657049994, "learning_rate": 2.124325564225458e-06, "loss": 0.5009, "step": 1224 }, { "epoch": 0.7889228787634841, "grad_norm": 0.384395125490022, "learning_rate": 2.1118678753219137e-06, "loss": 0.5051, "step": 1225 }, { "epoch": 0.7895668974400257, "grad_norm": 0.3975667873730337, "learning_rate": 2.099442507831444e-06, "loss": 0.5124, "step": 1226 }, { "epoch": 0.7902109161165674, "grad_norm": 0.3595136217535018, "learning_rate": 2.087049512666658e-06, "loss": 0.5044, "step": 1227 }, { "epoch": 0.790854934793109, "grad_norm": 0.3378959619530086, "learning_rate": 2.074688940607529e-06, "loss": 0.5055, "step": 1228 }, { "epoch": 0.7914989534696506, "grad_norm": 0.3514944744763787, "learning_rate": 2.062360842301178e-06, "loss": 0.5318, "step": 1229 }, { "epoch": 0.7921429721461922, "grad_norm": 0.33334280177614795, "learning_rate": 2.050065268261655e-06, "loss": 0.5055, "step": 1230 }, { "epoch": 0.7927869908227339, "grad_norm": 0.35517036543715474, "learning_rate": 2.0378022688697563e-06, "loss": 0.5053, "step": 1231 }, { "epoch": 0.7934310094992755, "grad_norm": 0.3673497250220277, "learning_rate": 2.025571894372794e-06, "loss": 0.5116, "step": 1232 }, { "epoch": 0.7940750281758171, "grad_norm": 0.3515448243027021, "learning_rate": 2.0133741948844056e-06, "loss": 0.5054, "step": 1233 }, { "epoch": 0.7947190468523587, "grad_norm": 0.33904638061560494, "learning_rate": 2.001209220384346e-06, "loss": 0.4851, "step": 1234 }, { "epoch": 0.7953630655289003, "grad_norm": 0.34986104247148964, "learning_rate": 1.9890770207182706e-06, "loss": 0.5229, "step": 1235 }, { "epoch": 0.796007084205442, "grad_norm": 0.3867308398321565, "learning_rate": 1.976977645597552e-06, "loss": 0.5066, "step": 1236 }, { "epoch": 0.7966511028819836, "grad_norm": 0.35497271693548227, "learning_rate": 1.9649111445990588e-06, "loss": 0.5073, "step": 1237 }, { "epoch": 0.7972951215585252, "grad_norm": 0.3899753461808767, "learning_rate": 1.9528775671649593e-06, "loss": 0.5209, "step": 1238 }, { "epoch": 0.7979391402350668, "grad_norm": 0.3413262706492361, "learning_rate": 1.9408769626025237e-06, "loss": 0.498, "step": 1239 }, { "epoch": 0.7985831589116085, "grad_norm": 0.36718707021369335, "learning_rate": 1.9289093800839067e-06, "loss": 0.5055, "step": 1240 }, { "epoch": 0.7992271775881501, "grad_norm": 0.3581439500052506, "learning_rate": 1.9169748686459655e-06, "loss": 0.5162, "step": 1241 }, { "epoch": 0.7998711962646917, "grad_norm": 0.3843705500931474, "learning_rate": 1.9050734771900414e-06, "loss": 0.5014, "step": 1242 }, { "epoch": 0.8005152149412333, "grad_norm": 0.35558474250949845, "learning_rate": 1.8932052544817747e-06, "loss": 0.4981, "step": 1243 }, { "epoch": 0.801159233617775, "grad_norm": 0.34023606887895824, "learning_rate": 1.8813702491508956e-06, "loss": 0.5198, "step": 1244 }, { "epoch": 0.8018032522943165, "grad_norm": 0.36497982001924045, "learning_rate": 1.869568509691022e-06, "loss": 0.497, "step": 1245 }, { "epoch": 0.8024472709708581, "grad_norm": 0.32715275285542916, "learning_rate": 1.8578000844594746e-06, "loss": 0.4951, "step": 1246 }, { "epoch": 0.8030912896473997, "grad_norm": 0.3393422011052391, "learning_rate": 1.8460650216770604e-06, "loss": 0.4977, "step": 1247 }, { "epoch": 0.8037353083239414, "grad_norm": 0.37954184554736503, "learning_rate": 1.8343633694278895e-06, "loss": 0.508, "step": 1248 }, { "epoch": 0.804379327000483, "grad_norm": 0.3322505424554738, "learning_rate": 1.8226951756591783e-06, "loss": 0.4962, "step": 1249 }, { "epoch": 0.8050233456770246, "grad_norm": 0.3159037632752118, "learning_rate": 1.8110604881810357e-06, "loss": 0.4827, "step": 1250 }, { "epoch": 0.8056673643535662, "grad_norm": 0.32572724145541326, "learning_rate": 1.799459354666293e-06, "loss": 0.4849, "step": 1251 }, { "epoch": 0.8063113830301079, "grad_norm": 0.344730833943389, "learning_rate": 1.7878918226502816e-06, "loss": 0.5113, "step": 1252 }, { "epoch": 0.8069554017066495, "grad_norm": 0.3582455992488915, "learning_rate": 1.776357939530663e-06, "loss": 0.5224, "step": 1253 }, { "epoch": 0.8075994203831911, "grad_norm": 0.3293205905743023, "learning_rate": 1.7648577525672195e-06, "loss": 0.4906, "step": 1254 }, { "epoch": 0.8082434390597327, "grad_norm": 0.32592286531872383, "learning_rate": 1.753391308881659e-06, "loss": 0.4958, "step": 1255 }, { "epoch": 0.8088874577362744, "grad_norm": 0.3976393524246138, "learning_rate": 1.7419586554574364e-06, "loss": 0.5442, "step": 1256 }, { "epoch": 0.809531476412816, "grad_norm": 0.33947424641194485, "learning_rate": 1.7305598391395429e-06, "loss": 0.4928, "step": 1257 }, { "epoch": 0.8101754950893576, "grad_norm": 0.35644035531428503, "learning_rate": 1.7191949066343306e-06, "loss": 0.5292, "step": 1258 }, { "epoch": 0.8108195137658992, "grad_norm": 0.3858896096300513, "learning_rate": 1.7078639045093105e-06, "loss": 0.5177, "step": 1259 }, { "epoch": 0.8114635324424408, "grad_norm": 0.34773032041982627, "learning_rate": 1.69656687919296e-06, "loss": 0.4836, "step": 1260 }, { "epoch": 0.8121075511189825, "grad_norm": 0.3559804808799843, "learning_rate": 1.6853038769745466e-06, "loss": 0.5162, "step": 1261 }, { "epoch": 0.8127515697955241, "grad_norm": 0.34325350635596374, "learning_rate": 1.6740749440039262e-06, "loss": 0.5164, "step": 1262 }, { "epoch": 0.8133955884720657, "grad_norm": 0.3359812628149839, "learning_rate": 1.6628801262913485e-06, "loss": 0.5113, "step": 1263 }, { "epoch": 0.8140396071486073, "grad_norm": 0.3333207532395854, "learning_rate": 1.6517194697072903e-06, "loss": 0.5081, "step": 1264 }, { "epoch": 0.814683625825149, "grad_norm": 0.3245216179774198, "learning_rate": 1.6405930199822406e-06, "loss": 0.5097, "step": 1265 }, { "epoch": 0.8153276445016906, "grad_norm": 0.3520008057340081, "learning_rate": 1.6295008227065367e-06, "loss": 0.4999, "step": 1266 }, { "epoch": 0.8159716631782322, "grad_norm": 0.3500247650466787, "learning_rate": 1.6184429233301669e-06, "loss": 0.4968, "step": 1267 }, { "epoch": 0.8166156818547737, "grad_norm": 0.3513907496249367, "learning_rate": 1.607419367162577e-06, "loss": 0.5015, "step": 1268 }, { "epoch": 0.8172597005313154, "grad_norm": 0.33264214189371044, "learning_rate": 1.5964301993725006e-06, "loss": 0.501, "step": 1269 }, { "epoch": 0.817903719207857, "grad_norm": 0.3428986730711293, "learning_rate": 1.58547546498776e-06, "loss": 0.5174, "step": 1270 }, { "epoch": 0.8185477378843986, "grad_norm": 0.3406425750722631, "learning_rate": 1.5745552088950899e-06, "loss": 0.5032, "step": 1271 }, { "epoch": 0.8191917565609402, "grad_norm": 0.354911028494689, "learning_rate": 1.5636694758399563e-06, "loss": 0.5209, "step": 1272 }, { "epoch": 0.8198357752374819, "grad_norm": 0.5126590105345391, "learning_rate": 1.552818310426356e-06, "loss": 0.5253, "step": 1273 }, { "epoch": 0.8204797939140235, "grad_norm": 0.354364349885467, "learning_rate": 1.542001757116658e-06, "loss": 0.5122, "step": 1274 }, { "epoch": 0.8211238125905651, "grad_norm": 0.32882314587114336, "learning_rate": 1.5312198602314e-06, "loss": 0.5037, "step": 1275 }, { "epoch": 0.8217678312671067, "grad_norm": 0.3354529924728954, "learning_rate": 1.520472663949122e-06, "loss": 0.4962, "step": 1276 }, { "epoch": 0.8224118499436484, "grad_norm": 0.34690234686227484, "learning_rate": 1.5097602123061772e-06, "loss": 0.5077, "step": 1277 }, { "epoch": 0.82305586862019, "grad_norm": 0.35831708333674867, "learning_rate": 1.4990825491965522e-06, "loss": 0.5023, "step": 1278 }, { "epoch": 0.8236998872967316, "grad_norm": 0.3372227835768637, "learning_rate": 1.4884397183716902e-06, "loss": 0.5055, "step": 1279 }, { "epoch": 0.8243439059732732, "grad_norm": 0.3236821511363991, "learning_rate": 1.4778317634403082e-06, "loss": 0.479, "step": 1280 }, { "epoch": 0.8249879246498149, "grad_norm": 0.3419847643896573, "learning_rate": 1.4672587278682228e-06, "loss": 0.4965, "step": 1281 }, { "epoch": 0.8256319433263565, "grad_norm": 0.33622260051138286, "learning_rate": 1.4567206549781699e-06, "loss": 0.4979, "step": 1282 }, { "epoch": 0.8262759620028981, "grad_norm": 0.3514570322790945, "learning_rate": 1.4462175879496198e-06, "loss": 0.5229, "step": 1283 }, { "epoch": 0.8269199806794397, "grad_norm": 0.33072383094290453, "learning_rate": 1.4357495698186186e-06, "loss": 0.4994, "step": 1284 }, { "epoch": 0.8275639993559814, "grad_norm": 0.33406609271749454, "learning_rate": 1.4253166434775867e-06, "loss": 0.5332, "step": 1285 }, { "epoch": 0.828208018032523, "grad_norm": 0.3340526109307583, "learning_rate": 1.41491885167517e-06, "loss": 0.507, "step": 1286 }, { "epoch": 0.8288520367090646, "grad_norm": 0.3537194928689518, "learning_rate": 1.4045562370160426e-06, "loss": 0.5104, "step": 1287 }, { "epoch": 0.8294960553856062, "grad_norm": 0.3347025982980687, "learning_rate": 1.3942288419607476e-06, "loss": 0.4929, "step": 1288 }, { "epoch": 0.8301400740621478, "grad_norm": 0.3261287201357999, "learning_rate": 1.383936708825513e-06, "loss": 0.4979, "step": 1289 }, { "epoch": 0.8307840927386895, "grad_norm": 0.34239192564789883, "learning_rate": 1.3736798797820783e-06, "loss": 0.5111, "step": 1290 }, { "epoch": 0.8314281114152311, "grad_norm": 0.3271470654316128, "learning_rate": 1.3634583968575343e-06, "loss": 0.5017, "step": 1291 }, { "epoch": 0.8320721300917726, "grad_norm": 0.3544281113305755, "learning_rate": 1.3532723019341376e-06, "loss": 0.4813, "step": 1292 }, { "epoch": 0.8327161487683142, "grad_norm": 0.3391263538659487, "learning_rate": 1.3431216367491384e-06, "loss": 0.5241, "step": 1293 }, { "epoch": 0.8333601674448559, "grad_norm": 0.3338966622214297, "learning_rate": 1.3330064428946255e-06, "loss": 0.5067, "step": 1294 }, { "epoch": 0.8340041861213975, "grad_norm": 0.3546998506519645, "learning_rate": 1.3229267618173324e-06, "loss": 0.4966, "step": 1295 }, { "epoch": 0.8346482047979391, "grad_norm": 0.34302334091007014, "learning_rate": 1.3128826348184886e-06, "loss": 0.4891, "step": 1296 }, { "epoch": 0.8352922234744807, "grad_norm": 0.35283803706017125, "learning_rate": 1.3028741030536418e-06, "loss": 0.4913, "step": 1297 }, { "epoch": 0.8359362421510224, "grad_norm": 0.32612718170258687, "learning_rate": 1.2929012075324832e-06, "loss": 0.4921, "step": 1298 }, { "epoch": 0.836580260827564, "grad_norm": 0.3360436741607197, "learning_rate": 1.2829639891186917e-06, "loss": 0.5083, "step": 1299 }, { "epoch": 0.8372242795041056, "grad_norm": 0.3483320907386437, "learning_rate": 1.2730624885297537e-06, "loss": 0.5258, "step": 1300 }, { "epoch": 0.8378682981806472, "grad_norm": 0.3484702545466743, "learning_rate": 1.2631967463368077e-06, "loss": 0.5115, "step": 1301 }, { "epoch": 0.8385123168571889, "grad_norm": 0.34180609625050973, "learning_rate": 1.2533668029644751e-06, "loss": 0.4842, "step": 1302 }, { "epoch": 0.8391563355337305, "grad_norm": 0.3277941204136625, "learning_rate": 1.243572698690685e-06, "loss": 0.5016, "step": 1303 }, { "epoch": 0.8398003542102721, "grad_norm": 0.34420987080159693, "learning_rate": 1.233814473646524e-06, "loss": 0.4991, "step": 1304 }, { "epoch": 0.8404443728868137, "grad_norm": 0.34747702146245557, "learning_rate": 1.224092167816059e-06, "loss": 0.5151, "step": 1305 }, { "epoch": 0.8410883915633554, "grad_norm": 0.33732481177248713, "learning_rate": 1.214405821036182e-06, "loss": 0.5056, "step": 1306 }, { "epoch": 0.841732410239897, "grad_norm": 0.32840945371581, "learning_rate": 1.204755472996445e-06, "loss": 0.5025, "step": 1307 }, { "epoch": 0.8423764289164386, "grad_norm": 0.33214660948367214, "learning_rate": 1.195141163238892e-06, "loss": 0.4903, "step": 1308 }, { "epoch": 0.8430204475929802, "grad_norm": 0.3311263252497031, "learning_rate": 1.1855629311579065e-06, "loss": 0.5185, "step": 1309 }, { "epoch": 0.8436644662695219, "grad_norm": 0.3284453540284584, "learning_rate": 1.1760208160000364e-06, "loss": 0.485, "step": 1310 }, { "epoch": 0.8443084849460635, "grad_norm": 0.3494908973183085, "learning_rate": 1.1665148568638496e-06, "loss": 0.4931, "step": 1311 }, { "epoch": 0.8449525036226051, "grad_norm": 0.35046425371788714, "learning_rate": 1.1570450926997657e-06, "loss": 0.5142, "step": 1312 }, { "epoch": 0.8455965222991467, "grad_norm": 0.308176388269128, "learning_rate": 1.147611562309887e-06, "loss": 0.492, "step": 1313 }, { "epoch": 0.8462405409756883, "grad_norm": 0.3261208379658252, "learning_rate": 1.1382143043478599e-06, "loss": 0.482, "step": 1314 }, { "epoch": 0.8468845596522299, "grad_norm": 0.3648840400086203, "learning_rate": 1.1288533573186976e-06, "loss": 0.5087, "step": 1315 }, { "epoch": 0.8475285783287715, "grad_norm": 0.33458808415765556, "learning_rate": 1.1195287595786352e-06, "loss": 0.4955, "step": 1316 }, { "epoch": 0.8481725970053131, "grad_norm": 0.35398715161490835, "learning_rate": 1.1102405493349676e-06, "loss": 0.4896, "step": 1317 }, { "epoch": 0.8488166156818547, "grad_norm": 0.3331876790531892, "learning_rate": 1.1009887646458862e-06, "loss": 0.5094, "step": 1318 }, { "epoch": 0.8494606343583964, "grad_norm": 0.35131555097679856, "learning_rate": 1.0917734434203365e-06, "loss": 0.5394, "step": 1319 }, { "epoch": 0.850104653034938, "grad_norm": 0.33210667766755725, "learning_rate": 1.0825946234178575e-06, "loss": 0.4957, "step": 1320 }, { "epoch": 0.8507486717114796, "grad_norm": 0.35373484305038555, "learning_rate": 1.0734523422484156e-06, "loss": 0.5034, "step": 1321 }, { "epoch": 0.8513926903880212, "grad_norm": 0.33503135928492866, "learning_rate": 1.064346637372271e-06, "loss": 0.4935, "step": 1322 }, { "epoch": 0.8520367090645629, "grad_norm": 0.31694689445412116, "learning_rate": 1.0552775460998067e-06, "loss": 0.4865, "step": 1323 }, { "epoch": 0.8526807277411045, "grad_norm": 0.3454821165081998, "learning_rate": 1.0462451055913847e-06, "loss": 0.5199, "step": 1324 }, { "epoch": 0.8533247464176461, "grad_norm": 0.3329651961258978, "learning_rate": 1.0372493528571947e-06, "loss": 0.4684, "step": 1325 }, { "epoch": 0.8539687650941877, "grad_norm": 0.33192015341992553, "learning_rate": 1.0282903247570908e-06, "loss": 0.5323, "step": 1326 }, { "epoch": 0.8546127837707294, "grad_norm": 0.33116245954890894, "learning_rate": 1.0193680580004594e-06, "loss": 0.5035, "step": 1327 }, { "epoch": 0.855256802447271, "grad_norm": 0.3253866896112924, "learning_rate": 1.010482589146048e-06, "loss": 0.4888, "step": 1328 }, { "epoch": 0.8559008211238126, "grad_norm": 0.3199883964949596, "learning_rate": 1.0016339546018328e-06, "loss": 0.5023, "step": 1329 }, { "epoch": 0.8565448398003542, "grad_norm": 0.3231782863105249, "learning_rate": 9.928221906248614e-07, "loss": 0.4937, "step": 1330 }, { "epoch": 0.8571888584768959, "grad_norm": 0.3585325773342001, "learning_rate": 9.84047333321102e-07, "loss": 0.4884, "step": 1331 }, { "epoch": 0.8578328771534375, "grad_norm": 0.32503211444421404, "learning_rate": 9.753094186453028e-07, "loss": 0.5242, "step": 1332 }, { "epoch": 0.8584768958299791, "grad_norm": 0.33090171013498754, "learning_rate": 9.666084824008349e-07, "loss": 0.4931, "step": 1333 }, { "epoch": 0.8591209145065207, "grad_norm": 0.32203731020962656, "learning_rate": 9.579445602395577e-07, "loss": 0.4986, "step": 1334 }, { "epoch": 0.8597649331830624, "grad_norm": 0.3322928249325587, "learning_rate": 9.493176876616616e-07, "loss": 0.4925, "step": 1335 }, { "epoch": 0.860408951859604, "grad_norm": 0.3229454348693366, "learning_rate": 9.407279000155311e-07, "loss": 0.4979, "step": 1336 }, { "epoch": 0.8610529705361456, "grad_norm": 0.34544504897596257, "learning_rate": 9.321752324975952e-07, "loss": 0.5081, "step": 1337 }, { "epoch": 0.8616969892126871, "grad_norm": 0.32358163641395393, "learning_rate": 9.23659720152179e-07, "loss": 0.478, "step": 1338 }, { "epoch": 0.8623410078892287, "grad_norm": 0.31993234736544307, "learning_rate": 9.151813978713741e-07, "loss": 0.4783, "step": 1339 }, { "epoch": 0.8629850265657704, "grad_norm": 0.371505848387323, "learning_rate": 9.067403003948783e-07, "loss": 0.5176, "step": 1340 }, { "epoch": 0.863629045242312, "grad_norm": 0.353343917181823, "learning_rate": 8.983364623098678e-07, "loss": 0.504, "step": 1341 }, { "epoch": 0.8642730639188536, "grad_norm": 0.35921824300874944, "learning_rate": 8.89969918050847e-07, "loss": 0.5268, "step": 1342 }, { "epoch": 0.8649170825953952, "grad_norm": 0.31602253843693634, "learning_rate": 8.816407018995088e-07, "loss": 0.5092, "step": 1343 }, { "epoch": 0.8655611012719369, "grad_norm": 0.3268221972155922, "learning_rate": 8.733488479845997e-07, "loss": 0.4968, "step": 1344 }, { "epoch": 0.8662051199484785, "grad_norm": 0.34206340973669097, "learning_rate": 8.650943902817677e-07, "loss": 0.5009, "step": 1345 }, { "epoch": 0.8668491386250201, "grad_norm": 0.31995417622694, "learning_rate": 8.568773626134363e-07, "loss": 0.5132, "step": 1346 }, { "epoch": 0.8674931573015617, "grad_norm": 0.3341465411432408, "learning_rate": 8.486977986486633e-07, "loss": 0.5099, "step": 1347 }, { "epoch": 0.8681371759781034, "grad_norm": 0.3252017246013724, "learning_rate": 8.405557319029911e-07, "loss": 0.5005, "step": 1348 }, { "epoch": 0.868781194654645, "grad_norm": 0.3461955216777936, "learning_rate": 8.324511957383252e-07, "loss": 0.511, "step": 1349 }, { "epoch": 0.8694252133311866, "grad_norm": 0.3320631397448815, "learning_rate": 8.243842233627897e-07, "loss": 0.5228, "step": 1350 }, { "epoch": 0.8700692320077282, "grad_norm": 0.3193423628232718, "learning_rate": 8.163548478305883e-07, "loss": 0.48, "step": 1351 }, { "epoch": 0.8707132506842699, "grad_norm": 0.3365957113512182, "learning_rate": 8.083631020418792e-07, "loss": 0.5199, "step": 1352 }, { "epoch": 0.8713572693608115, "grad_norm": 0.32950477082107593, "learning_rate": 8.004090187426238e-07, "loss": 0.5043, "step": 1353 }, { "epoch": 0.8720012880373531, "grad_norm": 0.34245679144384744, "learning_rate": 7.924926305244729e-07, "loss": 0.4993, "step": 1354 }, { "epoch": 0.8726453067138947, "grad_norm": 0.3176123139073869, "learning_rate": 7.846139698246191e-07, "loss": 0.4894, "step": 1355 }, { "epoch": 0.8732893253904364, "grad_norm": 0.33122416690836365, "learning_rate": 7.767730689256614e-07, "loss": 0.504, "step": 1356 }, { "epoch": 0.873933344066978, "grad_norm": 0.3307533980203514, "learning_rate": 7.689699599554901e-07, "loss": 0.5183, "step": 1357 }, { "epoch": 0.8745773627435196, "grad_norm": 0.34169627977408384, "learning_rate": 7.612046748871327e-07, "loss": 0.519, "step": 1358 }, { "epoch": 0.8752213814200612, "grad_norm": 0.3498637632124744, "learning_rate": 7.53477245538643e-07, "loss": 0.5265, "step": 1359 }, { "epoch": 0.8758654000966029, "grad_norm": 0.34552256626028355, "learning_rate": 7.457877035729588e-07, "loss": 0.5332, "step": 1360 }, { "epoch": 0.8765094187731444, "grad_norm": 0.31951341778969583, "learning_rate": 7.381360804977733e-07, "loss": 0.5, "step": 1361 }, { "epoch": 0.877153437449686, "grad_norm": 0.29700216854208006, "learning_rate": 7.305224076654127e-07, "loss": 0.4855, "step": 1362 }, { "epoch": 0.8777974561262276, "grad_norm": 0.32553657260201724, "learning_rate": 7.229467162726966e-07, "loss": 0.4892, "step": 1363 }, { "epoch": 0.8784414748027692, "grad_norm": 0.32801339133062796, "learning_rate": 7.154090373608236e-07, "loss": 0.5023, "step": 1364 }, { "epoch": 0.8790854934793109, "grad_norm": 0.3260646080187078, "learning_rate": 7.079094018152333e-07, "loss": 0.502, "step": 1365 }, { "epoch": 0.8797295121558525, "grad_norm": 0.3367765733629452, "learning_rate": 7.004478403654835e-07, "loss": 0.5222, "step": 1366 }, { "epoch": 0.8803735308323941, "grad_norm": 0.3250257158295539, "learning_rate": 6.930243835851258e-07, "loss": 0.4986, "step": 1367 }, { "epoch": 0.8810175495089357, "grad_norm": 0.33742827580368107, "learning_rate": 6.856390618915775e-07, "loss": 0.5178, "step": 1368 }, { "epoch": 0.8816615681854774, "grad_norm": 0.31736906007809873, "learning_rate": 6.782919055460002e-07, "loss": 0.4841, "step": 1369 }, { "epoch": 0.882305586862019, "grad_norm": 0.32995320233528447, "learning_rate": 6.709829446531734e-07, "loss": 0.499, "step": 1370 }, { "epoch": 0.8829496055385606, "grad_norm": 0.31797542795009254, "learning_rate": 6.637122091613702e-07, "loss": 0.5092, "step": 1371 }, { "epoch": 0.8835936242151022, "grad_norm": 0.33734409883783506, "learning_rate": 6.564797288622371e-07, "loss": 0.5194, "step": 1372 }, { "epoch": 0.8842376428916439, "grad_norm": 0.3130815882466205, "learning_rate": 6.492855333906733e-07, "loss": 0.5094, "step": 1373 }, { "epoch": 0.8848816615681855, "grad_norm": 0.31213623123887585, "learning_rate": 6.421296522247012e-07, "loss": 0.4968, "step": 1374 }, { "epoch": 0.8855256802447271, "grad_norm": 0.3340723448809502, "learning_rate": 6.350121146853582e-07, "loss": 0.5227, "step": 1375 }, { "epoch": 0.8861696989212687, "grad_norm": 0.33267641155906735, "learning_rate": 6.279329499365649e-07, "loss": 0.4995, "step": 1376 }, { "epoch": 0.8868137175978104, "grad_norm": 0.3665004134366373, "learning_rate": 6.208921869850104e-07, "loss": 0.5213, "step": 1377 }, { "epoch": 0.887457736274352, "grad_norm": 0.33342787545203356, "learning_rate": 6.138898546800398e-07, "loss": 0.4927, "step": 1378 }, { "epoch": 0.8881017549508936, "grad_norm": 0.3493243633809496, "learning_rate": 6.069259817135187e-07, "loss": 0.5027, "step": 1379 }, { "epoch": 0.8887457736274352, "grad_norm": 0.3370101984980294, "learning_rate": 6.000005966197387e-07, "loss": 0.4968, "step": 1380 }, { "epoch": 0.8893897923039769, "grad_norm": 0.33020958513845067, "learning_rate": 5.931137277752764e-07, "loss": 0.5012, "step": 1381 }, { "epoch": 0.8900338109805185, "grad_norm": 0.32049900945847704, "learning_rate": 5.86265403398899e-07, "loss": 0.5096, "step": 1382 }, { "epoch": 0.8906778296570601, "grad_norm": 0.31002160289190733, "learning_rate": 5.794556515514327e-07, "loss": 0.4738, "step": 1383 }, { "epoch": 0.8913218483336017, "grad_norm": 0.3276276689676106, "learning_rate": 5.726845001356573e-07, "loss": 0.5058, "step": 1384 }, { "epoch": 0.8919658670101432, "grad_norm": 0.3570247422173048, "learning_rate": 5.659519768961885e-07, "loss": 0.5044, "step": 1385 }, { "epoch": 0.8926098856866849, "grad_norm": 0.32715495660505395, "learning_rate": 5.592581094193584e-07, "loss": 0.5115, "step": 1386 }, { "epoch": 0.8932539043632265, "grad_norm": 0.33241048349584484, "learning_rate": 5.526029251331155e-07, "loss": 0.511, "step": 1387 }, { "epoch": 0.8938979230397681, "grad_norm": 0.32058008387644016, "learning_rate": 5.459864513068991e-07, "loss": 0.4856, "step": 1388 }, { "epoch": 0.8945419417163097, "grad_norm": 0.33601590343050497, "learning_rate": 5.39408715051537e-07, "loss": 0.5208, "step": 1389 }, { "epoch": 0.8951859603928514, "grad_norm": 0.3119097058491575, "learning_rate": 5.328697433191321e-07, "loss": 0.5043, "step": 1390 }, { "epoch": 0.895829979069393, "grad_norm": 0.3176219832378358, "learning_rate": 5.263695629029452e-07, "loss": 0.4929, "step": 1391 }, { "epoch": 0.8964739977459346, "grad_norm": 0.32246791130897806, "learning_rate": 5.199082004372958e-07, "loss": 0.4937, "step": 1392 }, { "epoch": 0.8971180164224762, "grad_norm": 0.32277537124684547, "learning_rate": 5.134856823974444e-07, "loss": 0.4827, "step": 1393 }, { "epoch": 0.8977620350990179, "grad_norm": 0.3227424579436927, "learning_rate": 5.071020350994893e-07, "loss": 0.5077, "step": 1394 }, { "epoch": 0.8984060537755595, "grad_norm": 0.32558919614105497, "learning_rate": 5.007572847002595e-07, "loss": 0.52, "step": 1395 }, { "epoch": 0.8990500724521011, "grad_norm": 0.3502931560321367, "learning_rate": 4.944514571971981e-07, "loss": 0.4938, "step": 1396 }, { "epoch": 0.8996940911286427, "grad_norm": 0.3262493949361299, "learning_rate": 4.881845784282701e-07, "loss": 0.5018, "step": 1397 }, { "epoch": 0.9003381098051844, "grad_norm": 0.33637857469623905, "learning_rate": 4.81956674071844e-07, "loss": 0.5134, "step": 1398 }, { "epoch": 0.900982128481726, "grad_norm": 0.32280120315286437, "learning_rate": 4.7576776964659344e-07, "loss": 0.495, "step": 1399 }, { "epoch": 0.9016261471582676, "grad_norm": 0.31536694974531215, "learning_rate": 4.696178905113913e-07, "loss": 0.4861, "step": 1400 }, { "epoch": 0.9022701658348092, "grad_norm": 0.3344894280248359, "learning_rate": 4.635070618652049e-07, "loss": 0.5071, "step": 1401 }, { "epoch": 0.9029141845113509, "grad_norm": 0.3242330318426173, "learning_rate": 4.5743530874699293e-07, "loss": 0.5271, "step": 1402 }, { "epoch": 0.9035582031878925, "grad_norm": 0.3404176906726383, "learning_rate": 4.514026560356044e-07, "loss": 0.5141, "step": 1403 }, { "epoch": 0.9042022218644341, "grad_norm": 0.32393401857003584, "learning_rate": 4.454091284496731e-07, "loss": 0.5174, "step": 1404 }, { "epoch": 0.9048462405409757, "grad_norm": 0.3197443931707106, "learning_rate": 4.3945475054752216e-07, "loss": 0.4961, "step": 1405 }, { "epoch": 0.9054902592175174, "grad_norm": 0.3292406410164791, "learning_rate": 4.3353954672705533e-07, "loss": 0.5336, "step": 1406 }, { "epoch": 0.906134277894059, "grad_norm": 0.341120787479547, "learning_rate": 4.2766354122566465e-07, "loss": 0.4974, "step": 1407 }, { "epoch": 0.9067782965706005, "grad_norm": 0.3302722728744402, "learning_rate": 4.218267581201296e-07, "loss": 0.5053, "step": 1408 }, { "epoch": 0.9074223152471421, "grad_norm": 0.3299858763794231, "learning_rate": 4.160292213265116e-07, "loss": 0.5087, "step": 1409 }, { "epoch": 0.9080663339236837, "grad_norm": 0.3405246470017635, "learning_rate": 4.1027095460006715e-07, "loss": 0.4944, "step": 1410 }, { "epoch": 0.9087103526002254, "grad_norm": 0.33702892796797534, "learning_rate": 4.0455198153514064e-07, "loss": 0.5209, "step": 1411 }, { "epoch": 0.909354371276767, "grad_norm": 0.3484602005597187, "learning_rate": 3.988723255650728e-07, "loss": 0.5207, "step": 1412 }, { "epoch": 0.9099983899533086, "grad_norm": 0.31720738808294835, "learning_rate": 3.9323200996210673e-07, "loss": 0.5088, "step": 1413 }, { "epoch": 0.9106424086298502, "grad_norm": 0.3167672155409996, "learning_rate": 3.876310578372833e-07, "loss": 0.4908, "step": 1414 }, { "epoch": 0.9112864273063919, "grad_norm": 0.33333154674639726, "learning_rate": 3.8206949214035785e-07, "loss": 0.4998, "step": 1415 }, { "epoch": 0.9119304459829335, "grad_norm": 0.3277907859845637, "learning_rate": 3.7654733565969826e-07, "loss": 0.4922, "step": 1416 }, { "epoch": 0.9125744646594751, "grad_norm": 0.33014067402525826, "learning_rate": 3.7106461102219495e-07, "loss": 0.5102, "step": 1417 }, { "epoch": 0.9132184833360167, "grad_norm": 0.32698312397902596, "learning_rate": 3.6562134069316857e-07, "loss": 0.5053, "step": 1418 }, { "epoch": 0.9138625020125584, "grad_norm": 0.32466073546297985, "learning_rate": 3.602175469762725e-07, "loss": 0.5232, "step": 1419 }, { "epoch": 0.9145065206891, "grad_norm": 0.3349307698334408, "learning_rate": 3.548532520134129e-07, "loss": 0.4878, "step": 1420 }, { "epoch": 0.9151505393656416, "grad_norm": 0.327281687967225, "learning_rate": 3.4952847778464306e-07, "loss": 0.5082, "step": 1421 }, { "epoch": 0.9157945580421832, "grad_norm": 0.3284215548598372, "learning_rate": 3.442432461080858e-07, "loss": 0.4901, "step": 1422 }, { "epoch": 0.9164385767187249, "grad_norm": 0.3389487851942482, "learning_rate": 3.389975786398403e-07, "loss": 0.4984, "step": 1423 }, { "epoch": 0.9170825953952665, "grad_norm": 0.3308653489231506, "learning_rate": 3.3379149687388866e-07, "loss": 0.5089, "step": 1424 }, { "epoch": 0.9177266140718081, "grad_norm": 0.3213564407246247, "learning_rate": 3.2862502214201396e-07, "loss": 0.5016, "step": 1425 }, { "epoch": 0.9183706327483497, "grad_norm": 0.30780248585137615, "learning_rate": 3.23498175613709e-07, "loss": 0.5019, "step": 1426 }, { "epoch": 0.9190146514248914, "grad_norm": 0.3409157864584587, "learning_rate": 3.1841097829609313e-07, "loss": 0.5057, "step": 1427 }, { "epoch": 0.919658670101433, "grad_norm": 0.32308699682768793, "learning_rate": 3.133634510338235e-07, "loss": 0.4918, "step": 1428 }, { "epoch": 0.9203026887779746, "grad_norm": 0.35148044555025776, "learning_rate": 3.083556145090072e-07, "loss": 0.5162, "step": 1429 }, { "epoch": 0.9209467074545162, "grad_norm": 0.3070923978657446, "learning_rate": 3.0338748924112483e-07, "loss": 0.4961, "step": 1430 }, { "epoch": 0.9215907261310577, "grad_norm": 0.32423627170324054, "learning_rate": 2.9845909558693707e-07, "loss": 0.52, "step": 1431 }, { "epoch": 0.9222347448075994, "grad_norm": 0.33002280915002014, "learning_rate": 2.935704537404083e-07, "loss": 0.5184, "step": 1432 }, { "epoch": 0.922878763484141, "grad_norm": 0.31288885442329545, "learning_rate": 2.8872158373261847e-07, "loss": 0.4891, "step": 1433 }, { "epoch": 0.9235227821606826, "grad_norm": 0.3349783767696664, "learning_rate": 2.839125054316838e-07, "loss": 0.4936, "step": 1434 }, { "epoch": 0.9241668008372242, "grad_norm": 0.34821177205084386, "learning_rate": 2.791432385426762e-07, "loss": 0.4984, "step": 1435 }, { "epoch": 0.9248108195137659, "grad_norm": 0.32282046703004696, "learning_rate": 2.744138026075405e-07, "loss": 0.5105, "step": 1436 }, { "epoch": 0.9254548381903075, "grad_norm": 0.3112788076466935, "learning_rate": 2.697242170050152e-07, "loss": 0.4852, "step": 1437 }, { "epoch": 0.9260988568668491, "grad_norm": 0.31642973725451146, "learning_rate": 2.650745009505562e-07, "loss": 0.5117, "step": 1438 }, { "epoch": 0.9267428755433907, "grad_norm": 0.3377170663114201, "learning_rate": 2.604646734962479e-07, "loss": 0.501, "step": 1439 }, { "epoch": 0.9273868942199324, "grad_norm": 0.3240395409475092, "learning_rate": 2.5589475353073987e-07, "loss": 0.4776, "step": 1440 }, { "epoch": 0.928030912896474, "grad_norm": 0.32736202783291896, "learning_rate": 2.5136475977915686e-07, "loss": 0.5089, "step": 1441 }, { "epoch": 0.9286749315730156, "grad_norm": 0.3174522728391781, "learning_rate": 2.468747108030289e-07, "loss": 0.4996, "step": 1442 }, { "epoch": 0.9293189502495572, "grad_norm": 0.3455576905104296, "learning_rate": 2.424246250002138e-07, "loss": 0.5152, "step": 1443 }, { "epoch": 0.9299629689260989, "grad_norm": 0.3376183422399604, "learning_rate": 2.380145206048201e-07, "loss": 0.5135, "step": 1444 }, { "epoch": 0.9306069876026405, "grad_norm": 0.32351125512309303, "learning_rate": 2.3364441568713424e-07, "loss": 0.496, "step": 1445 }, { "epoch": 0.9312510062791821, "grad_norm": 0.33491600224375784, "learning_rate": 2.2931432815354593e-07, "loss": 0.5061, "step": 1446 }, { "epoch": 0.9318950249557237, "grad_norm": 0.3164661923034047, "learning_rate": 2.2502427574647268e-07, "loss": 0.5109, "step": 1447 }, { "epoch": 0.9325390436322654, "grad_norm": 0.33780686337386273, "learning_rate": 2.2077427604429435e-07, "loss": 0.5192, "step": 1448 }, { "epoch": 0.933183062308807, "grad_norm": 0.335442160073025, "learning_rate": 2.1656434646126878e-07, "loss": 0.504, "step": 1449 }, { "epoch": 0.9338270809853486, "grad_norm": 0.310088074407167, "learning_rate": 2.123945042474751e-07, "loss": 0.4766, "step": 1450 }, { "epoch": 0.9344710996618902, "grad_norm": 0.323072751010256, "learning_rate": 2.082647664887283e-07, "loss": 0.504, "step": 1451 }, { "epoch": 0.9351151183384319, "grad_norm": 0.3325148799614355, "learning_rate": 2.0417515010652032e-07, "loss": 0.4939, "step": 1452 }, { "epoch": 0.9357591370149735, "grad_norm": 0.3254168311456042, "learning_rate": 2.0012567185794808e-07, "loss": 0.5144, "step": 1453 }, { "epoch": 0.936403155691515, "grad_norm": 0.33997422039230435, "learning_rate": 1.9611634833564096e-07, "loss": 0.4979, "step": 1454 }, { "epoch": 0.9370471743680566, "grad_norm": 0.32833315763093884, "learning_rate": 1.921471959676957e-07, "loss": 0.5074, "step": 1455 }, { "epoch": 0.9376911930445982, "grad_norm": 0.33632627811466054, "learning_rate": 1.8821823101760949e-07, "loss": 0.5149, "step": 1456 }, { "epoch": 0.9383352117211399, "grad_norm": 0.31890360991808997, "learning_rate": 1.8432946958421238e-07, "loss": 0.5116, "step": 1457 }, { "epoch": 0.9389792303976815, "grad_norm": 0.32426358381158854, "learning_rate": 1.8048092760160286e-07, "loss": 0.5089, "step": 1458 }, { "epoch": 0.9396232490742231, "grad_norm": 0.31817255548607454, "learning_rate": 1.7667262083907789e-07, "loss": 0.5019, "step": 1459 }, { "epoch": 0.9402672677507647, "grad_norm": 0.30487008769191365, "learning_rate": 1.7290456490107522e-07, "loss": 0.5031, "step": 1460 }, { "epoch": 0.9409112864273064, "grad_norm": 0.3448985490446457, "learning_rate": 1.6917677522710564e-07, "loss": 0.5198, "step": 1461 }, { "epoch": 0.941555305103848, "grad_norm": 0.3121538707817271, "learning_rate": 1.6548926709168634e-07, "loss": 0.5022, "step": 1462 }, { "epoch": 0.9421993237803896, "grad_norm": 0.37219441612042414, "learning_rate": 1.6184205560428655e-07, "loss": 0.5273, "step": 1463 }, { "epoch": 0.9428433424569312, "grad_norm": 0.32208828965496183, "learning_rate": 1.5823515570925763e-07, "loss": 0.5011, "step": 1464 }, { "epoch": 0.9434873611334729, "grad_norm": 0.32684510746778384, "learning_rate": 1.546685821857774e-07, "loss": 0.4987, "step": 1465 }, { "epoch": 0.9441313798100145, "grad_norm": 0.3317277626505544, "learning_rate": 1.5114234964778707e-07, "loss": 0.5033, "step": 1466 }, { "epoch": 0.9447753984865561, "grad_norm": 0.318763614743111, "learning_rate": 1.4765647254393113e-07, "loss": 0.5047, "step": 1467 }, { "epoch": 0.9454194171630977, "grad_norm": 0.32172147121048666, "learning_rate": 1.4421096515749855e-07, "loss": 0.513, "step": 1468 }, { "epoch": 0.9460634358396394, "grad_norm": 0.31508288162424936, "learning_rate": 1.4080584160636402e-07, "loss": 0.507, "step": 1469 }, { "epoch": 0.946707454516181, "grad_norm": 0.3090046048829247, "learning_rate": 1.374411158429323e-07, "loss": 0.4898, "step": 1470 }, { "epoch": 0.9473514731927226, "grad_norm": 0.31255014474472487, "learning_rate": 1.3411680165407948e-07, "loss": 0.4981, "step": 1471 }, { "epoch": 0.9479954918692642, "grad_norm": 0.31518608167301276, "learning_rate": 1.30832912661093e-07, "loss": 0.5037, "step": 1472 }, { "epoch": 0.9486395105458059, "grad_norm": 0.32065446572426687, "learning_rate": 1.2758946231962389e-07, "loss": 0.4888, "step": 1473 }, { "epoch": 0.9492835292223475, "grad_norm": 0.3139010369682072, "learning_rate": 1.243864639196213e-07, "loss": 0.5001, "step": 1474 }, { "epoch": 0.9499275478988891, "grad_norm": 0.3401560232953053, "learning_rate": 1.2122393058528803e-07, "loss": 0.4995, "step": 1475 }, { "epoch": 0.9505715665754307, "grad_norm": 0.33299111463941666, "learning_rate": 1.1810187527502182e-07, "loss": 0.5178, "step": 1476 }, { "epoch": 0.9512155852519724, "grad_norm": 0.3161844703651403, "learning_rate": 1.1502031078136078e-07, "loss": 0.4981, "step": 1477 }, { "epoch": 0.9518596039285139, "grad_norm": 0.3184457902079813, "learning_rate": 1.1197924973093466e-07, "loss": 0.5053, "step": 1478 }, { "epoch": 0.9525036226050555, "grad_norm": 0.31038231789417303, "learning_rate": 1.089787045844093e-07, "loss": 0.4924, "step": 1479 }, { "epoch": 0.9531476412815971, "grad_norm": 0.31655573415034727, "learning_rate": 1.0601868763643997e-07, "loss": 0.5048, "step": 1480 }, { "epoch": 0.9537916599581387, "grad_norm": 0.3254130879149464, "learning_rate": 1.0309921101561926e-07, "loss": 0.5027, "step": 1481 }, { "epoch": 0.9544356786346804, "grad_norm": 0.3398784641631524, "learning_rate": 1.0022028668442374e-07, "loss": 0.505, "step": 1482 }, { "epoch": 0.955079697311222, "grad_norm": 0.3230644519253946, "learning_rate": 9.738192643917066e-08, "loss": 0.5239, "step": 1483 }, { "epoch": 0.9557237159877636, "grad_norm": 0.33945217142130113, "learning_rate": 9.45841419099669e-08, "loss": 0.5259, "step": 1484 }, { "epoch": 0.9563677346643052, "grad_norm": 0.3225837947384017, "learning_rate": 9.1826944560659e-08, "loss": 0.5004, "step": 1485 }, { "epoch": 0.9570117533408469, "grad_norm": 0.31249603415296895, "learning_rate": 8.911034568879207e-08, "loss": 0.4984, "step": 1486 }, { "epoch": 0.9576557720173885, "grad_norm": 0.30356118713226354, "learning_rate": 8.643435642555653e-08, "loss": 0.4894, "step": 1487 }, { "epoch": 0.9582997906939301, "grad_norm": 0.3249939222024513, "learning_rate": 8.379898773574924e-08, "loss": 0.5187, "step": 1488 }, { "epoch": 0.9589438093704717, "grad_norm": 0.3448750267987429, "learning_rate": 8.12042504177224e-08, "loss": 0.5202, "step": 1489 }, { "epoch": 0.9595878280470134, "grad_norm": 0.3309529795326833, "learning_rate": 7.865015510334473e-08, "loss": 0.5035, "step": 1490 }, { "epoch": 0.960231846723555, "grad_norm": 0.32514755805080986, "learning_rate": 7.613671225795371e-08, "loss": 0.4834, "step": 1491 }, { "epoch": 0.9608758654000966, "grad_norm": 0.32646393495858445, "learning_rate": 7.366393218031564e-08, "loss": 0.5068, "step": 1492 }, { "epoch": 0.9615198840766382, "grad_norm": 0.3188399700522663, "learning_rate": 7.123182500258119e-08, "loss": 0.4936, "step": 1493 }, { "epoch": 0.9621639027531799, "grad_norm": 0.31841093863455616, "learning_rate": 6.884040069024434e-08, "loss": 0.5218, "step": 1494 }, { "epoch": 0.9628079214297215, "grad_norm": 0.3183147047489415, "learning_rate": 6.648966904210463e-08, "loss": 0.508, "step": 1495 }, { "epoch": 0.9634519401062631, "grad_norm": 0.3148529904073374, "learning_rate": 6.417963969022389e-08, "loss": 0.5075, "step": 1496 }, { "epoch": 0.9640959587828047, "grad_norm": 0.33712899695485904, "learning_rate": 6.19103220998829e-08, "loss": 0.5229, "step": 1497 }, { "epoch": 0.9647399774593464, "grad_norm": 0.31535868857950927, "learning_rate": 5.968172556955365e-08, "loss": 0.5142, "step": 1498 }, { "epoch": 0.965383996135888, "grad_norm": 0.323013663166573, "learning_rate": 5.749385923084938e-08, "loss": 0.4974, "step": 1499 }, { "epoch": 0.9660280148124296, "grad_norm": 0.31928813969845643, "learning_rate": 5.534673204849572e-08, "loss": 0.4887, "step": 1500 }, { "epoch": 0.9666720334889711, "grad_norm": 0.32789534885510685, "learning_rate": 5.324035282029072e-08, "loss": 0.5166, "step": 1501 }, { "epoch": 0.9673160521655128, "grad_norm": 0.32239982705955633, "learning_rate": 5.1174730177064866e-08, "loss": 0.5135, "step": 1502 }, { "epoch": 0.9679600708420544, "grad_norm": 0.3229614067673402, "learning_rate": 4.914987258265558e-08, "loss": 0.5114, "step": 1503 }, { "epoch": 0.968604089518596, "grad_norm": 0.3187769601713629, "learning_rate": 4.716578833386054e-08, "loss": 0.507, "step": 1504 }, { "epoch": 0.9692481081951376, "grad_norm": 0.3569592976676255, "learning_rate": 4.522248556041331e-08, "loss": 0.513, "step": 1505 }, { "epoch": 0.9698921268716792, "grad_norm": 0.34973977209017737, "learning_rate": 4.331997222494777e-08, "loss": 0.5367, "step": 1506 }, { "epoch": 0.9705361455482209, "grad_norm": 0.32344976250169655, "learning_rate": 4.145825612295928e-08, "loss": 0.5038, "step": 1507 }, { "epoch": 0.9711801642247625, "grad_norm": 0.31917645441339465, "learning_rate": 3.963734488278248e-08, "loss": 0.4909, "step": 1508 }, { "epoch": 0.9718241829013041, "grad_norm": 0.33328145631239886, "learning_rate": 3.7857245965551294e-08, "loss": 0.5136, "step": 1509 }, { "epoch": 0.9724682015778457, "grad_norm": 0.31973307508774307, "learning_rate": 3.6117966665175644e-08, "loss": 0.4887, "step": 1510 }, { "epoch": 0.9731122202543874, "grad_norm": 0.3101125930396609, "learning_rate": 3.4419514108305905e-08, "loss": 0.5005, "step": 1511 }, { "epoch": 0.973756238930929, "grad_norm": 0.3240364414189752, "learning_rate": 3.2761895254306285e-08, "loss": 0.5226, "step": 1512 }, { "epoch": 0.9744002576074706, "grad_norm": 0.32794680273261473, "learning_rate": 3.114511689522592e-08, "loss": 0.4926, "step": 1513 }, { "epoch": 0.9750442762840122, "grad_norm": 0.3158829255288427, "learning_rate": 2.9569185655773382e-08, "loss": 0.5116, "step": 1514 }, { "epoch": 0.9756882949605539, "grad_norm": 0.3279062484898434, "learning_rate": 2.803410799328221e-08, "loss": 0.5198, "step": 1515 }, { "epoch": 0.9763323136370955, "grad_norm": 0.31633217705607497, "learning_rate": 2.6539890197695428e-08, "loss": 0.4851, "step": 1516 }, { "epoch": 0.9769763323136371, "grad_norm": 0.3100431963638588, "learning_rate": 2.5086538391529968e-08, "loss": 0.4834, "step": 1517 }, { "epoch": 0.9776203509901787, "grad_norm": 0.334015640841443, "learning_rate": 2.3674058529855603e-08, "loss": 0.4968, "step": 1518 }, { "epoch": 0.9782643696667204, "grad_norm": 0.3409614377644398, "learning_rate": 2.230245640027273e-08, "loss": 0.5297, "step": 1519 }, { "epoch": 0.978908388343262, "grad_norm": 0.3339401965364627, "learning_rate": 2.0971737622883515e-08, "loss": 0.506, "step": 1520 }, { "epoch": 0.9795524070198036, "grad_norm": 0.33349732144235245, "learning_rate": 1.9681907650274113e-08, "loss": 0.538, "step": 1521 }, { "epoch": 0.9801964256963452, "grad_norm": 0.3101078908544523, "learning_rate": 1.8432971767488038e-08, "loss": 0.5044, "step": 1522 }, { "epoch": 0.9808404443728869, "grad_norm": 0.3116951561961041, "learning_rate": 1.722493509200729e-08, "loss": 0.5156, "step": 1523 }, { "epoch": 0.9814844630494284, "grad_norm": 0.3183368607134099, "learning_rate": 1.605780257373124e-08, "loss": 0.5016, "step": 1524 }, { "epoch": 0.98212848172597, "grad_norm": 0.33093659028418326, "learning_rate": 1.4931578994952235e-08, "loss": 0.5314, "step": 1525 }, { "epoch": 0.9827725004025116, "grad_norm": 0.33870558542795703, "learning_rate": 1.3846268970344467e-08, "loss": 0.5074, "step": 1526 }, { "epoch": 0.9834165190790533, "grad_norm": 0.312779970756693, "learning_rate": 1.2801876946935133e-08, "loss": 0.4966, "step": 1527 }, { "epoch": 0.9840605377555949, "grad_norm": 0.31519948982968526, "learning_rate": 1.179840720409331e-08, "loss": 0.5069, "step": 1528 }, { "epoch": 0.9847045564321365, "grad_norm": 0.30728900007122767, "learning_rate": 1.0835863853509988e-08, "loss": 0.4958, "step": 1529 }, { "epoch": 0.9853485751086781, "grad_norm": 0.3360691794022974, "learning_rate": 9.914250839180296e-09, "loss": 0.4993, "step": 1530 }, { "epoch": 0.9859925937852198, "grad_norm": 0.31184681291801003, "learning_rate": 9.033571937391294e-09, "loss": 0.5025, "step": 1531 }, { "epoch": 0.9866366124617614, "grad_norm": 0.32478255239375275, "learning_rate": 8.193830756699773e-09, "loss": 0.4869, "step": 1532 }, { "epoch": 0.987280631138303, "grad_norm": 0.31789773676079386, "learning_rate": 7.395030737924469e-09, "loss": 0.5061, "step": 1533 }, { "epoch": 0.9879246498148446, "grad_norm": 0.32703052681671896, "learning_rate": 6.6371751541249865e-09, "loss": 0.5017, "step": 1534 }, { "epoch": 0.9885686684913862, "grad_norm": 0.32277001959949897, "learning_rate": 5.920267110597344e-09, "loss": 0.5079, "step": 1535 }, { "epoch": 0.9892126871679279, "grad_norm": 0.3095194106483935, "learning_rate": 5.2443095448506674e-09, "loss": 0.5046, "step": 1536 }, { "epoch": 0.9898567058444695, "grad_norm": 0.3226258224234109, "learning_rate": 4.609305226606076e-09, "loss": 0.5006, "step": 1537 }, { "epoch": 0.9905007245210111, "grad_norm": 0.3117692304166334, "learning_rate": 4.015256757774477e-09, "loss": 0.5038, "step": 1538 }, { "epoch": 0.9911447431975527, "grad_norm": 0.3412765632761588, "learning_rate": 3.462166572454351e-09, "loss": 0.4971, "step": 1539 }, { "epoch": 0.9917887618740944, "grad_norm": 0.3219832614138702, "learning_rate": 2.9500369369195313e-09, "loss": 0.5123, "step": 1540 }, { "epoch": 0.992432780550636, "grad_norm": 0.3172695334255751, "learning_rate": 2.478869949606999e-09, "loss": 0.4935, "step": 1541 }, { "epoch": 0.9930767992271776, "grad_norm": 0.3199077493532577, "learning_rate": 2.0486675411102165e-09, "loss": 0.5035, "step": 1542 }, { "epoch": 0.9937208179037192, "grad_norm": 0.3122506518414418, "learning_rate": 1.6594314741724682e-09, "loss": 0.5092, "step": 1543 }, { "epoch": 0.9943648365802609, "grad_norm": 0.33036094100343394, "learning_rate": 1.3111633436779792e-09, "loss": 0.4953, "step": 1544 }, { "epoch": 0.9950088552568025, "grad_norm": 0.32495070684760774, "learning_rate": 1.003864576647473e-09, "loss": 0.5059, "step": 1545 }, { "epoch": 0.9956528739333441, "grad_norm": 0.332630829179801, "learning_rate": 7.375364322292911e-10, "loss": 0.5199, "step": 1546 }, { "epoch": 0.9962968926098856, "grad_norm": 0.33169943894250803, "learning_rate": 5.121800016949508e-10, "loss": 0.5164, "step": 1547 }, { "epoch": 0.9969409112864273, "grad_norm": 0.33471730916198206, "learning_rate": 3.277962084369257e-10, "loss": 0.4913, "step": 1548 }, { "epoch": 0.9975849299629689, "grad_norm": 0.31221579915474884, "learning_rate": 1.843858079642047e-10, "loss": 0.503, "step": 1549 }, { "epoch": 0.9982289486395105, "grad_norm": 0.3275285751669639, "learning_rate": 8.19493878945199e-11, "loss": 0.5122, "step": 1550 }, { "epoch": 0.9988729673160521, "grad_norm": 0.3355487735592334, "learning_rate": 2.0487367959898253e-11, "loss": 0.5307, "step": 1551 }, { "epoch": 0.9995169859925938, "grad_norm": 0.3388553601045606, "learning_rate": 0.0, "loss": 0.5076, "step": 1552 }, { "epoch": 0.9995169859925938, "step": 1552, "total_flos": 1627220267237376.0, "train_loss": 0.5273892322612792, "train_runtime": 41682.4586, "train_samples_per_second": 4.768, "train_steps_per_second": 0.037 } ], "logging_steps": 1, "max_steps": 1552, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1627220267237376.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }