{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0978085351787774, "eval_steps": 345, "global_step": 1378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.097861769142046e-05, "grad_norm": 1.0071473121643066, "learning_rate": 2e-05, "loss": 1.1744, "step": 1 }, { "epoch": 7.097861769142046e-05, "eval_loss": 1.7213321924209595, "eval_runtime": 316.3421, "eval_samples_per_second": 18.755, "eval_steps_per_second": 9.379, "step": 1 }, { "epoch": 0.00014195723538284091, "grad_norm": 1.1630088090896606, "learning_rate": 4e-05, "loss": 1.4684, "step": 2 }, { "epoch": 0.00021293585307426138, "grad_norm": 1.011231541633606, "learning_rate": 6e-05, "loss": 1.3388, "step": 3 }, { "epoch": 0.00028391447076568183, "grad_norm": 1.1395065784454346, "learning_rate": 8e-05, "loss": 1.4141, "step": 4 }, { "epoch": 0.00035489308845710227, "grad_norm": 0.9518294930458069, "learning_rate": 0.0001, "loss": 1.2138, "step": 5 }, { "epoch": 0.00042587170614852277, "grad_norm": 1.1404623985290527, "learning_rate": 0.00012, "loss": 1.5258, "step": 6 }, { "epoch": 0.0004968503238399432, "grad_norm": 1.1148234605789185, "learning_rate": 0.00014, "loss": 1.5945, "step": 7 }, { "epoch": 0.0005678289415313637, "grad_norm": 0.7065153121948242, "learning_rate": 0.00016, "loss": 1.2237, "step": 8 }, { "epoch": 0.0006388075592227841, "grad_norm": 0.5677412152290344, "learning_rate": 0.00018, "loss": 1.2327, "step": 9 }, { "epoch": 0.0007097861769142045, "grad_norm": 0.7821382284164429, "learning_rate": 0.0002, "loss": 1.3952, "step": 10 }, { "epoch": 0.0007807647946056251, "grad_norm": 0.8164384961128235, "learning_rate": 0.00019999973630775112, "loss": 1.0, "step": 11 }, { "epoch": 0.0008517434122970455, "grad_norm": 0.7434931993484497, "learning_rate": 0.00019999894523239515, "loss": 0.9922, "step": 12 }, { "epoch": 0.000922722029988466, "grad_norm": 0.6980607509613037, "learning_rate": 0.00019999762677810407, "loss": 1.314, "step": 13 }, { "epoch": 0.0009937006476798864, "grad_norm": 0.7483981251716614, "learning_rate": 0.00019999578095183124, "loss": 1.2244, "step": 14 }, { "epoch": 0.001064679265371307, "grad_norm": 0.602606475353241, "learning_rate": 0.00019999340776331127, "loss": 1.0677, "step": 15 }, { "epoch": 0.0011356578830627273, "grad_norm": 0.5919442772865295, "learning_rate": 0.00019999050722505993, "loss": 0.9667, "step": 16 }, { "epoch": 0.0012066365007541479, "grad_norm": 0.4755237102508545, "learning_rate": 0.00019998707935237425, "loss": 1.0105, "step": 17 }, { "epoch": 0.0012776151184455682, "grad_norm": 0.5926672220230103, "learning_rate": 0.00019998312416333227, "loss": 0.8434, "step": 18 }, { "epoch": 0.0013485937361369887, "grad_norm": 1.0684069395065308, "learning_rate": 0.00019997864167879312, "loss": 1.3092, "step": 19 }, { "epoch": 0.001419572353828409, "grad_norm": 0.6111437678337097, "learning_rate": 0.00019997363192239664, "loss": 0.942, "step": 20 }, { "epoch": 0.0014905509715198296, "grad_norm": 0.5219926238059998, "learning_rate": 0.00019996809492056356, "loss": 1.0819, "step": 21 }, { "epoch": 0.0015615295892112502, "grad_norm": 0.4983609914779663, "learning_rate": 0.00019996203070249516, "loss": 1.072, "step": 22 }, { "epoch": 0.0016325082069026705, "grad_norm": 1.860731601715088, "learning_rate": 0.00019995543930017315, "loss": 2.3149, "step": 23 }, { "epoch": 0.001703486824594091, "grad_norm": 0.653619647026062, "learning_rate": 0.00019994832074835963, "loss": 1.1855, "step": 24 }, { "epoch": 0.0017744654422855114, "grad_norm": 0.4535515308380127, "learning_rate": 0.0001999406750845967, "loss": 1.063, "step": 25 }, { "epoch": 0.001845444059976932, "grad_norm": 0.5094876885414124, "learning_rate": 0.00019993250234920636, "loss": 0.9361, "step": 26 }, { "epoch": 0.0019164226776683525, "grad_norm": 0.4468972980976105, "learning_rate": 0.00019992380258529045, "loss": 1.045, "step": 27 }, { "epoch": 0.001987401295359773, "grad_norm": 0.5492545962333679, "learning_rate": 0.0001999145758387301, "loss": 0.8953, "step": 28 }, { "epoch": 0.002058379913051193, "grad_norm": 0.6947111487388611, "learning_rate": 0.0001999048221581858, "loss": 1.1556, "step": 29 }, { "epoch": 0.002129358530742614, "grad_norm": 1.403872013092041, "learning_rate": 0.0001998945415950969, "loss": 0.8239, "step": 30 }, { "epoch": 0.0022003371484340343, "grad_norm": 0.5107519030570984, "learning_rate": 0.00019988373420368148, "loss": 1.0179, "step": 31 }, { "epoch": 0.0022713157661254546, "grad_norm": 0.5256995558738708, "learning_rate": 0.0001998724000409361, "loss": 1.2171, "step": 32 }, { "epoch": 0.002342294383816875, "grad_norm": 0.6153073310852051, "learning_rate": 0.00019986053916663533, "loss": 1.1574, "step": 33 }, { "epoch": 0.0024132730015082957, "grad_norm": 0.7516297101974487, "learning_rate": 0.00019984815164333163, "loss": 1.0181, "step": 34 }, { "epoch": 0.002484251619199716, "grad_norm": 0.5266925692558289, "learning_rate": 0.00019983523753635486, "loss": 0.9523, "step": 35 }, { "epoch": 0.0025552302368911364, "grad_norm": 0.5152684450149536, "learning_rate": 0.000199821796913812, "loss": 0.8336, "step": 36 }, { "epoch": 0.002626208854582557, "grad_norm": 0.4673595726490021, "learning_rate": 0.00019980782984658683, "loss": 0.9766, "step": 37 }, { "epoch": 0.0026971874722739775, "grad_norm": 0.7287444472312927, "learning_rate": 0.00019979333640833947, "loss": 1.4679, "step": 38 }, { "epoch": 0.002768166089965398, "grad_norm": 0.4323437213897705, "learning_rate": 0.00019977831667550611, "loss": 0.9334, "step": 39 }, { "epoch": 0.002839144707656818, "grad_norm": 0.5118696093559265, "learning_rate": 0.00019976277072729845, "loss": 1.0936, "step": 40 }, { "epoch": 0.002910123325348239, "grad_norm": 0.4197646975517273, "learning_rate": 0.00019974669864570346, "loss": 1.0367, "step": 41 }, { "epoch": 0.0029811019430396593, "grad_norm": 0.614962100982666, "learning_rate": 0.00019973010051548275, "loss": 1.2423, "step": 42 }, { "epoch": 0.0030520805607310796, "grad_norm": 0.5005660653114319, "learning_rate": 0.0001997129764241723, "loss": 1.1571, "step": 43 }, { "epoch": 0.0031230591784225004, "grad_norm": 0.5688406229019165, "learning_rate": 0.00019969532646208195, "loss": 1.0885, "step": 44 }, { "epoch": 0.0031940377961139207, "grad_norm": 0.5735238194465637, "learning_rate": 0.00019967715072229485, "loss": 1.1203, "step": 45 }, { "epoch": 0.003265016413805341, "grad_norm": 0.4851870834827423, "learning_rate": 0.000199658449300667, "loss": 0.8962, "step": 46 }, { "epoch": 0.003335995031496762, "grad_norm": 0.5166847109794617, "learning_rate": 0.0001996392222958268, "loss": 0.8109, "step": 47 }, { "epoch": 0.003406973649188182, "grad_norm": 0.5074728727340698, "learning_rate": 0.00019961946980917456, "loss": 0.9288, "step": 48 }, { "epoch": 0.0034779522668796025, "grad_norm": 0.5071332454681396, "learning_rate": 0.00019959919194488177, "loss": 1.0574, "step": 49 }, { "epoch": 0.003548930884571023, "grad_norm": 0.6750898957252502, "learning_rate": 0.00019957838880989078, "loss": 1.4215, "step": 50 }, { "epoch": 0.0036199095022624436, "grad_norm": 0.6220878958702087, "learning_rate": 0.00019955706051391406, "loss": 1.0261, "step": 51 }, { "epoch": 0.003690888119953864, "grad_norm": 0.44693389534950256, "learning_rate": 0.00019953520716943371, "loss": 0.6748, "step": 52 }, { "epoch": 0.0037618667376452843, "grad_norm": 0.6086903214454651, "learning_rate": 0.000199512828891701, "loss": 1.0206, "step": 53 }, { "epoch": 0.003832845355336705, "grad_norm": 0.6346308588981628, "learning_rate": 0.00019948992579873538, "loss": 1.0987, "step": 54 }, { "epoch": 0.0039038239730281254, "grad_norm": 0.6782318949699402, "learning_rate": 0.00019946649801132427, "loss": 1.1718, "step": 55 }, { "epoch": 0.003974802590719546, "grad_norm": 0.5194227695465088, "learning_rate": 0.00019944254565302217, "loss": 0.9065, "step": 56 }, { "epoch": 0.0040457812084109665, "grad_norm": 0.6403017044067383, "learning_rate": 0.00019941806885015012, "loss": 0.8227, "step": 57 }, { "epoch": 0.004116759826102386, "grad_norm": 0.7098397016525269, "learning_rate": 0.00019939306773179497, "loss": 0.9796, "step": 58 }, { "epoch": 0.004187738443793807, "grad_norm": 0.5155645608901978, "learning_rate": 0.00019936754242980874, "loss": 0.6579, "step": 59 }, { "epoch": 0.004258717061485228, "grad_norm": 0.6105719208717346, "learning_rate": 0.00019934149307880791, "loss": 0.995, "step": 60 }, { "epoch": 0.004329695679176648, "grad_norm": 0.4568040370941162, "learning_rate": 0.00019931491981617274, "loss": 0.8595, "step": 61 }, { "epoch": 0.004400674296868069, "grad_norm": 0.5027696490287781, "learning_rate": 0.0001992878227820465, "loss": 0.9482, "step": 62 }, { "epoch": 0.004471652914559489, "grad_norm": 0.7956494092941284, "learning_rate": 0.0001992602021193347, "loss": 1.0026, "step": 63 }, { "epoch": 0.004542631532250909, "grad_norm": 0.5496545433998108, "learning_rate": 0.0001992320579737045, "loss": 1.012, "step": 64 }, { "epoch": 0.00461361014994233, "grad_norm": 0.4921521246433258, "learning_rate": 0.00019920339049358374, "loss": 1.2796, "step": 65 }, { "epoch": 0.00468458876763375, "grad_norm": 0.5527113676071167, "learning_rate": 0.00019917419983016025, "loss": 0.9349, "step": 66 }, { "epoch": 0.004755567385325171, "grad_norm": 0.6795085668563843, "learning_rate": 0.00019914448613738106, "loss": 1.1524, "step": 67 }, { "epoch": 0.0048265460030165915, "grad_norm": 0.8340494632720947, "learning_rate": 0.00019911424957195158, "loss": 1.1957, "step": 68 }, { "epoch": 0.004897524620708011, "grad_norm": 0.47689518332481384, "learning_rate": 0.00019908349029333479, "loss": 0.9924, "step": 69 }, { "epoch": 0.004968503238399432, "grad_norm": 0.5147756338119507, "learning_rate": 0.00019905220846375032, "loss": 1.0138, "step": 70 }, { "epoch": 0.005039481856090853, "grad_norm": 0.5162140727043152, "learning_rate": 0.00019902040424817376, "loss": 1.0271, "step": 71 }, { "epoch": 0.005110460473782273, "grad_norm": 0.6443835496902466, "learning_rate": 0.00019898807781433555, "loss": 1.3248, "step": 72 }, { "epoch": 0.0051814390914736936, "grad_norm": 0.46129754185676575, "learning_rate": 0.00019895522933272028, "loss": 0.9714, "step": 73 }, { "epoch": 0.005252417709165114, "grad_norm": 0.5381733179092407, "learning_rate": 0.00019892185897656578, "loss": 0.9294, "step": 74 }, { "epoch": 0.005323396326856534, "grad_norm": 0.4620009660720825, "learning_rate": 0.00019888796692186218, "loss": 1.0435, "step": 75 }, { "epoch": 0.005394374944547955, "grad_norm": 0.8141055703163147, "learning_rate": 0.00019885355334735082, "loss": 1.143, "step": 76 }, { "epoch": 0.005465353562239376, "grad_norm": 0.5396761894226074, "learning_rate": 0.00019881861843452366, "loss": 1.0376, "step": 77 }, { "epoch": 0.005536332179930796, "grad_norm": 0.4937331974506378, "learning_rate": 0.00019878316236762196, "loss": 0.8089, "step": 78 }, { "epoch": 0.0056073107976222164, "grad_norm": 0.5338419079780579, "learning_rate": 0.0001987471853336355, "loss": 1.1142, "step": 79 }, { "epoch": 0.005678289415313636, "grad_norm": 0.6474658846855164, "learning_rate": 0.00019871068752230162, "loss": 0.9935, "step": 80 }, { "epoch": 0.005749268033005057, "grad_norm": 0.5022076964378357, "learning_rate": 0.0001986736691261041, "loss": 1.1436, "step": 81 }, { "epoch": 0.005820246650696478, "grad_norm": 0.5347083210945129, "learning_rate": 0.00019863613034027224, "loss": 1.1927, "step": 82 }, { "epoch": 0.005891225268387898, "grad_norm": 0.40351274609565735, "learning_rate": 0.00019859807136277978, "loss": 1.3807, "step": 83 }, { "epoch": 0.0059622038860793185, "grad_norm": 0.4644363820552826, "learning_rate": 0.0001985594923943438, "loss": 0.9057, "step": 84 }, { "epoch": 0.006033182503770739, "grad_norm": 0.5400217175483704, "learning_rate": 0.00019852039363842393, "loss": 1.0047, "step": 85 }, { "epoch": 0.006104161121462159, "grad_norm": 0.7487593293190002, "learning_rate": 0.00019848077530122083, "loss": 1.2362, "step": 86 }, { "epoch": 0.00617513973915358, "grad_norm": 0.4599716365337372, "learning_rate": 0.0001984406375916755, "loss": 0.8023, "step": 87 }, { "epoch": 0.006246118356845001, "grad_norm": 0.48803049325942993, "learning_rate": 0.000198399980721468, "loss": 0.9927, "step": 88 }, { "epoch": 0.006317096974536421, "grad_norm": 0.4646725058555603, "learning_rate": 0.0001983588049050164, "loss": 0.8408, "step": 89 }, { "epoch": 0.006388075592227841, "grad_norm": 0.3934456408023834, "learning_rate": 0.0001983171103594755, "loss": 1.0583, "step": 90 }, { "epoch": 0.006459054209919262, "grad_norm": 0.5140405297279358, "learning_rate": 0.00019827489730473596, "loss": 0.9967, "step": 91 }, { "epoch": 0.006530032827610682, "grad_norm": 0.8771049976348877, "learning_rate": 0.0001982321659634228, "loss": 1.134, "step": 92 }, { "epoch": 0.006601011445302103, "grad_norm": 0.479004442691803, "learning_rate": 0.00019818891656089453, "loss": 1.0823, "step": 93 }, { "epoch": 0.006671990062993524, "grad_norm": 0.5401038527488708, "learning_rate": 0.0001981451493252418, "loss": 1.0357, "step": 94 }, { "epoch": 0.0067429686806849435, "grad_norm": 0.5647906064987183, "learning_rate": 0.0001981008644872862, "loss": 1.033, "step": 95 }, { "epoch": 0.006813947298376364, "grad_norm": 0.44292977452278137, "learning_rate": 0.00019805606228057916, "loss": 0.8557, "step": 96 }, { "epoch": 0.006884925916067784, "grad_norm": 0.44759151339530945, "learning_rate": 0.00019801074294140047, "loss": 0.7499, "step": 97 }, { "epoch": 0.006955904533759205, "grad_norm": 0.5166530013084412, "learning_rate": 0.0001979649067087574, "loss": 0.9405, "step": 98 }, { "epoch": 0.007026883151450626, "grad_norm": 0.38778871297836304, "learning_rate": 0.00019791855382438308, "loss": 0.7693, "step": 99 }, { "epoch": 0.007097861769142046, "grad_norm": 0.4173828065395355, "learning_rate": 0.00019787168453273544, "loss": 0.924, "step": 100 }, { "epoch": 0.007168840386833466, "grad_norm": 0.5038108825683594, "learning_rate": 0.0001978242990809959, "loss": 1.2093, "step": 101 }, { "epoch": 0.007239819004524887, "grad_norm": 4.205886363983154, "learning_rate": 0.00019777639771906795, "loss": 1.183, "step": 102 }, { "epoch": 0.007310797622216307, "grad_norm": 0.45464202761650085, "learning_rate": 0.00019772798069957594, "loss": 0.7971, "step": 103 }, { "epoch": 0.007381776239907728, "grad_norm": 0.46295204758644104, "learning_rate": 0.00019767904827786375, "loss": 0.9121, "step": 104 }, { "epoch": 0.007452754857599149, "grad_norm": 0.7446455359458923, "learning_rate": 0.00019762960071199333, "loss": 1.1137, "step": 105 }, { "epoch": 0.0075237334752905685, "grad_norm": 0.702106773853302, "learning_rate": 0.00019757963826274357, "loss": 1.3439, "step": 106 }, { "epoch": 0.007594712092981989, "grad_norm": 0.5121258497238159, "learning_rate": 0.00019752916119360862, "loss": 0.8035, "step": 107 }, { "epoch": 0.00766569071067341, "grad_norm": 0.5357794165611267, "learning_rate": 0.00019747816977079671, "loss": 0.7856, "step": 108 }, { "epoch": 0.00773666932836483, "grad_norm": 0.519943118095398, "learning_rate": 0.00019742666426322876, "loss": 0.8407, "step": 109 }, { "epoch": 0.007807647946056251, "grad_norm": 0.5052663683891296, "learning_rate": 0.0001973746449425368, "loss": 1.2201, "step": 110 }, { "epoch": 0.007878626563747671, "grad_norm": 0.6436232924461365, "learning_rate": 0.0001973221120830626, "loss": 1.0023, "step": 111 }, { "epoch": 0.007949605181439091, "grad_norm": 0.5646064877510071, "learning_rate": 0.0001972690659618564, "loss": 0.9883, "step": 112 }, { "epoch": 0.008020583799130511, "grad_norm": 0.40956825017929077, "learning_rate": 0.0001972155068586752, "loss": 1.0297, "step": 113 }, { "epoch": 0.008091562416821933, "grad_norm": 0.5277307629585266, "learning_rate": 0.0001971614350559814, "loss": 1.0068, "step": 114 }, { "epoch": 0.008162541034513353, "grad_norm": 0.6302029490470886, "learning_rate": 0.0001971068508389413, "loss": 0.8764, "step": 115 }, { "epoch": 0.008233519652204773, "grad_norm": 1.1476373672485352, "learning_rate": 0.00019705175449542358, "loss": 1.5357, "step": 116 }, { "epoch": 0.008304498269896194, "grad_norm": 0.4832446575164795, "learning_rate": 0.00019699614631599787, "loss": 0.8828, "step": 117 }, { "epoch": 0.008375476887587614, "grad_norm": 0.6070902347564697, "learning_rate": 0.00019694002659393305, "loss": 1.0711, "step": 118 }, { "epoch": 0.008446455505279034, "grad_norm": 0.51983642578125, "learning_rate": 0.00019688339562519584, "loss": 0.8965, "step": 119 }, { "epoch": 0.008517434122970456, "grad_norm": 0.5980839729309082, "learning_rate": 0.00019682625370844918, "loss": 0.9181, "step": 120 }, { "epoch": 0.008588412740661876, "grad_norm": 0.5174521803855896, "learning_rate": 0.0001967686011450507, "loss": 0.8876, "step": 121 }, { "epoch": 0.008659391358353296, "grad_norm": 0.6685493588447571, "learning_rate": 0.0001967104382390511, "loss": 0.8606, "step": 122 }, { "epoch": 0.008730369976044717, "grad_norm": 0.5910232067108154, "learning_rate": 0.00019665176529719248, "loss": 1.062, "step": 123 }, { "epoch": 0.008801348593736137, "grad_norm": 0.5033608675003052, "learning_rate": 0.00019659258262890683, "loss": 0.8579, "step": 124 }, { "epoch": 0.008872327211427557, "grad_norm": 0.48769769072532654, "learning_rate": 0.00019653289054631445, "loss": 0.9893, "step": 125 }, { "epoch": 0.008943305829118979, "grad_norm": 0.44114089012145996, "learning_rate": 0.00019647268936422206, "loss": 0.9559, "step": 126 }, { "epoch": 0.009014284446810399, "grad_norm": 0.4468735456466675, "learning_rate": 0.00019641197940012137, "loss": 0.8747, "step": 127 }, { "epoch": 0.009085263064501818, "grad_norm": 0.4759454131126404, "learning_rate": 0.00019635076097418734, "loss": 0.8759, "step": 128 }, { "epoch": 0.00915624168219324, "grad_norm": 0.40357035398483276, "learning_rate": 0.00019628903440927647, "loss": 0.7243, "step": 129 }, { "epoch": 0.00922722029988466, "grad_norm": 0.6857088804244995, "learning_rate": 0.00019622680003092503, "loss": 1.1519, "step": 130 }, { "epoch": 0.00929819891757608, "grad_norm": 0.7141540050506592, "learning_rate": 0.00019616405816734757, "loss": 1.4631, "step": 131 }, { "epoch": 0.0093691775352675, "grad_norm": 0.6907796859741211, "learning_rate": 0.00019610080914943492, "loss": 1.0705, "step": 132 }, { "epoch": 0.009440156152958921, "grad_norm": 0.44966191053390503, "learning_rate": 0.00019603705331075255, "loss": 0.8862, "step": 133 }, { "epoch": 0.009511134770650341, "grad_norm": 0.48853984475135803, "learning_rate": 0.0001959727909875389, "loss": 0.9516, "step": 134 }, { "epoch": 0.009582113388341761, "grad_norm": 0.5312148928642273, "learning_rate": 0.00019590802251870354, "loss": 1.2261, "step": 135 }, { "epoch": 0.009653092006033183, "grad_norm": 0.6372950077056885, "learning_rate": 0.0001958427482458253, "loss": 1.0486, "step": 136 }, { "epoch": 0.009724070623724603, "grad_norm": 0.4442826211452484, "learning_rate": 0.00019577696851315053, "loss": 0.9093, "step": 137 }, { "epoch": 0.009795049241416023, "grad_norm": 0.4945155382156372, "learning_rate": 0.00019571068366759143, "loss": 1.0575, "step": 138 }, { "epoch": 0.009866027859107444, "grad_norm": 0.41773444414138794, "learning_rate": 0.00019564389405872393, "loss": 1.0511, "step": 139 }, { "epoch": 0.009937006476798864, "grad_norm": 0.43992042541503906, "learning_rate": 0.00019557660003878614, "loss": 0.8827, "step": 140 }, { "epoch": 0.010007985094490284, "grad_norm": 0.5190751552581787, "learning_rate": 0.00019550880196267622, "loss": 0.9591, "step": 141 }, { "epoch": 0.010078963712181706, "grad_norm": 0.377345472574234, "learning_rate": 0.00019544050018795075, "loss": 0.7227, "step": 142 }, { "epoch": 0.010149942329873126, "grad_norm": 0.4752659499645233, "learning_rate": 0.0001953716950748227, "loss": 0.7934, "step": 143 }, { "epoch": 0.010220920947564546, "grad_norm": 0.39363810420036316, "learning_rate": 0.00019530238698615957, "loss": 0.8836, "step": 144 }, { "epoch": 0.010291899565255967, "grad_norm": 0.6112295389175415, "learning_rate": 0.00019523257628748146, "loss": 1.0871, "step": 145 }, { "epoch": 0.010362878182947387, "grad_norm": 0.5272983312606812, "learning_rate": 0.0001951622633469592, "loss": 0.9483, "step": 146 }, { "epoch": 0.010433856800638807, "grad_norm": 0.5348608493804932, "learning_rate": 0.0001950914485354123, "loss": 1.1011, "step": 147 }, { "epoch": 0.010504835418330229, "grad_norm": 0.47906118631362915, "learning_rate": 0.00019502013222630712, "loss": 1.1606, "step": 148 }, { "epoch": 0.010575814036021649, "grad_norm": 0.5874256491661072, "learning_rate": 0.00019494831479575483, "loss": 0.9251, "step": 149 }, { "epoch": 0.010646792653713068, "grad_norm": 0.5095938444137573, "learning_rate": 0.00019487599662250943, "loss": 0.9448, "step": 150 }, { "epoch": 0.01071777127140449, "grad_norm": 0.7566680312156677, "learning_rate": 0.00019480317808796572, "loss": 1.1951, "step": 151 }, { "epoch": 0.01078874988909591, "grad_norm": 0.6020771861076355, "learning_rate": 0.0001947298595761574, "loss": 1.1239, "step": 152 }, { "epoch": 0.01085972850678733, "grad_norm": 0.47523820400238037, "learning_rate": 0.0001946560414737549, "loss": 0.8064, "step": 153 }, { "epoch": 0.010930707124478752, "grad_norm": 1.1158400774002075, "learning_rate": 0.00019458172417006347, "loss": 1.4332, "step": 154 }, { "epoch": 0.011001685742170171, "grad_norm": 0.6161775588989258, "learning_rate": 0.00019450690805702107, "loss": 1.1659, "step": 155 }, { "epoch": 0.011072664359861591, "grad_norm": 0.43671852350234985, "learning_rate": 0.00019443159352919623, "loss": 1.1158, "step": 156 }, { "epoch": 0.011143642977553013, "grad_norm": 0.4232232868671417, "learning_rate": 0.0001943557809837861, "loss": 1.0479, "step": 157 }, { "epoch": 0.011214621595244433, "grad_norm": 0.36087891459465027, "learning_rate": 0.00019427947082061432, "loss": 0.7191, "step": 158 }, { "epoch": 0.011285600212935853, "grad_norm": 0.505713939666748, "learning_rate": 0.0001942026634421289, "loss": 0.779, "step": 159 }, { "epoch": 0.011356578830627273, "grad_norm": 0.5202808976173401, "learning_rate": 0.00019412535925339997, "loss": 1.0448, "step": 160 }, { "epoch": 0.011427557448318694, "grad_norm": 0.6393616795539856, "learning_rate": 0.00019404755866211785, "loss": 1.2844, "step": 161 }, { "epoch": 0.011498536066010114, "grad_norm": 0.5086619853973389, "learning_rate": 0.00019396926207859084, "loss": 0.904, "step": 162 }, { "epoch": 0.011569514683701534, "grad_norm": 0.47012782096862793, "learning_rate": 0.00019389046991574298, "loss": 0.7615, "step": 163 }, { "epoch": 0.011640493301392956, "grad_norm": 0.6485052704811096, "learning_rate": 0.00019381118258911186, "loss": 1.0581, "step": 164 }, { "epoch": 0.011711471919084376, "grad_norm": 0.44959667325019836, "learning_rate": 0.0001937314005168466, "loss": 1.1482, "step": 165 }, { "epoch": 0.011782450536775796, "grad_norm": 0.4789312779903412, "learning_rate": 0.0001936511241197055, "loss": 0.9073, "step": 166 }, { "epoch": 0.011853429154467217, "grad_norm": 0.3625926077365875, "learning_rate": 0.0001935703538210538, "loss": 0.7853, "step": 167 }, { "epoch": 0.011924407772158637, "grad_norm": 0.6628193855285645, "learning_rate": 0.00019348909004686152, "loss": 1.41, "step": 168 }, { "epoch": 0.011995386389850057, "grad_norm": 0.6075107455253601, "learning_rate": 0.00019340733322570122, "loss": 1.0838, "step": 169 }, { "epoch": 0.012066365007541479, "grad_norm": 0.6164256930351257, "learning_rate": 0.0001933250837887457, "loss": 1.0956, "step": 170 }, { "epoch": 0.012137343625232899, "grad_norm": 0.5837414860725403, "learning_rate": 0.0001932423421697658, "loss": 1.0298, "step": 171 }, { "epoch": 0.012208322242924318, "grad_norm": 0.43682530522346497, "learning_rate": 0.0001931591088051279, "loss": 0.7275, "step": 172 }, { "epoch": 0.01227930086061574, "grad_norm": 0.5301129221916199, "learning_rate": 0.0001930753841337919, "loss": 0.8661, "step": 173 }, { "epoch": 0.01235027947830716, "grad_norm": 0.6144097447395325, "learning_rate": 0.0001929911685973088, "loss": 0.9047, "step": 174 }, { "epoch": 0.01242125809599858, "grad_norm": 0.508905291557312, "learning_rate": 0.00019290646263981815, "loss": 1.0019, "step": 175 }, { "epoch": 0.012492236713690002, "grad_norm": 0.4849667251110077, "learning_rate": 0.00019282126670804614, "loss": 0.8546, "step": 176 }, { "epoch": 0.012563215331381421, "grad_norm": 0.6729165315628052, "learning_rate": 0.0001927355812513029, "loss": 1.1535, "step": 177 }, { "epoch": 0.012634193949072841, "grad_norm": 0.5970641374588013, "learning_rate": 0.00019264940672148018, "loss": 1.0079, "step": 178 }, { "epoch": 0.012705172566764263, "grad_norm": 0.6463990211486816, "learning_rate": 0.00019256274357304918, "loss": 1.0836, "step": 179 }, { "epoch": 0.012776151184455683, "grad_norm": 0.5064119696617126, "learning_rate": 0.00019247559226305785, "loss": 0.7732, "step": 180 }, { "epoch": 0.012847129802147103, "grad_norm": 0.4314514696598053, "learning_rate": 0.0001923879532511287, "loss": 1.1146, "step": 181 }, { "epoch": 0.012918108419838524, "grad_norm": 0.4748472571372986, "learning_rate": 0.0001922998269994563, "loss": 1.0797, "step": 182 }, { "epoch": 0.012989087037529944, "grad_norm": 0.4918375313282013, "learning_rate": 0.00019221121397280483, "loss": 0.9177, "step": 183 }, { "epoch": 0.013060065655221364, "grad_norm": 0.548943042755127, "learning_rate": 0.00019212211463850567, "loss": 1.0342, "step": 184 }, { "epoch": 0.013131044272912786, "grad_norm": 0.4742969274520874, "learning_rate": 0.00019203252946645489, "loss": 0.9173, "step": 185 }, { "epoch": 0.013202022890604206, "grad_norm": 0.5800470113754272, "learning_rate": 0.0001919424589291108, "loss": 1.3675, "step": 186 }, { "epoch": 0.013273001508295626, "grad_norm": 0.484326034784317, "learning_rate": 0.00019185190350149144, "loss": 0.9727, "step": 187 }, { "epoch": 0.013343980125987047, "grad_norm": 0.6512810587882996, "learning_rate": 0.00019176086366117211, "loss": 0.9219, "step": 188 }, { "epoch": 0.013414958743678467, "grad_norm": 0.5059779286384583, "learning_rate": 0.00019166933988828283, "loss": 1.2346, "step": 189 }, { "epoch": 0.013485937361369887, "grad_norm": 0.5709842443466187, "learning_rate": 0.00019157733266550575, "loss": 1.4333, "step": 190 }, { "epoch": 0.013556915979061307, "grad_norm": 0.3898125886917114, "learning_rate": 0.00019148484247807272, "loss": 0.7785, "step": 191 }, { "epoch": 0.013627894596752729, "grad_norm": 0.6490575075149536, "learning_rate": 0.00019139186981376267, "loss": 1.275, "step": 192 }, { "epoch": 0.013698873214444149, "grad_norm": 0.4250531792640686, "learning_rate": 0.00019129841516289902, "loss": 0.8174, "step": 193 }, { "epoch": 0.013769851832135568, "grad_norm": 0.4925726354122162, "learning_rate": 0.00019120447901834706, "loss": 0.9447, "step": 194 }, { "epoch": 0.01384083044982699, "grad_norm": 0.3922591209411621, "learning_rate": 0.0001911100618755115, "loss": 0.7983, "step": 195 }, { "epoch": 0.01391180906751841, "grad_norm": 0.4286252558231354, "learning_rate": 0.00019101516423233368, "loss": 1.4458, "step": 196 }, { "epoch": 0.01398278768520983, "grad_norm": 0.38011500239372253, "learning_rate": 0.0001909197865892891, "loss": 0.9919, "step": 197 }, { "epoch": 0.014053766302901251, "grad_norm": 0.6599341034889221, "learning_rate": 0.00019082392944938466, "loss": 1.1749, "step": 198 }, { "epoch": 0.014124744920592671, "grad_norm": 0.5216343402862549, "learning_rate": 0.000190727593318156, "loss": 1.0529, "step": 199 }, { "epoch": 0.014195723538284091, "grad_norm": 0.3849651515483856, "learning_rate": 0.000190630778703665, "loss": 0.888, "step": 200 }, { "epoch": 0.014266702155975513, "grad_norm": 0.48573601245880127, "learning_rate": 0.0001905334861164969, "loss": 0.7641, "step": 201 }, { "epoch": 0.014337680773666933, "grad_norm": 0.4658781886100769, "learning_rate": 0.00019043571606975777, "loss": 0.9595, "step": 202 }, { "epoch": 0.014408659391358353, "grad_norm": 0.5626301765441895, "learning_rate": 0.0001903374690790716, "loss": 0.9871, "step": 203 }, { "epoch": 0.014479638009049774, "grad_norm": 0.47993823885917664, "learning_rate": 0.00019023874566257784, "loss": 0.8728, "step": 204 }, { "epoch": 0.014550616626741194, "grad_norm": 0.5967445373535156, "learning_rate": 0.00019013954634092847, "loss": 1.0216, "step": 205 }, { "epoch": 0.014621595244432614, "grad_norm": 0.7762762308120728, "learning_rate": 0.00019003987163728535, "loss": 1.3867, "step": 206 }, { "epoch": 0.014692573862124036, "grad_norm": 0.5508975982666016, "learning_rate": 0.0001899397220773174, "loss": 0.9216, "step": 207 }, { "epoch": 0.014763552479815456, "grad_norm": 0.6259493827819824, "learning_rate": 0.0001898390981891979, "loss": 1.0652, "step": 208 }, { "epoch": 0.014834531097506876, "grad_norm": 0.5097838044166565, "learning_rate": 0.0001897380005036016, "loss": 1.0916, "step": 209 }, { "epoch": 0.014905509715198297, "grad_norm": 0.45562729239463806, "learning_rate": 0.00018963642955370201, "loss": 0.8647, "step": 210 }, { "epoch": 0.014976488332889717, "grad_norm": 0.44729146361351013, "learning_rate": 0.00018953438587516864, "loss": 0.8334, "step": 211 }, { "epoch": 0.015047466950581137, "grad_norm": 0.5578773021697998, "learning_rate": 0.00018943187000616395, "loss": 1.1954, "step": 212 }, { "epoch": 0.015118445568272559, "grad_norm": 0.7742463946342468, "learning_rate": 0.00018932888248734083, "loss": 1.4558, "step": 213 }, { "epoch": 0.015189424185963979, "grad_norm": 0.4087100625038147, "learning_rate": 0.0001892254238618394, "loss": 0.732, "step": 214 }, { "epoch": 0.015260402803655398, "grad_norm": 0.4343758523464203, "learning_rate": 0.00018912149467528448, "loss": 0.9388, "step": 215 }, { "epoch": 0.01533138142134682, "grad_norm": 0.6047740578651428, "learning_rate": 0.00018901709547578245, "loss": 1.2839, "step": 216 }, { "epoch": 0.01540236003903824, "grad_norm": 0.4531131982803345, "learning_rate": 0.00018891222681391851, "loss": 0.9324, "step": 217 }, { "epoch": 0.01547333865672966, "grad_norm": 0.4474259912967682, "learning_rate": 0.00018880688924275378, "loss": 0.9529, "step": 218 }, { "epoch": 0.01554431727442108, "grad_norm": 0.48609498143196106, "learning_rate": 0.00018870108331782217, "loss": 0.97, "step": 219 }, { "epoch": 0.015615295892112501, "grad_norm": 0.4625166058540344, "learning_rate": 0.0001885948095971278, "loss": 0.8479, "step": 220 }, { "epoch": 0.01568627450980392, "grad_norm": 0.5676214694976807, "learning_rate": 0.00018848806864114184, "loss": 0.8292, "step": 221 }, { "epoch": 0.015757253127495343, "grad_norm": 0.5348947048187256, "learning_rate": 0.00018838086101279945, "loss": 0.8647, "step": 222 }, { "epoch": 0.01582823174518676, "grad_norm": 0.615378201007843, "learning_rate": 0.00018827318727749706, "loss": 1.0789, "step": 223 }, { "epoch": 0.015899210362878183, "grad_norm": 0.47168368101119995, "learning_rate": 0.00018816504800308934, "loss": 0.6722, "step": 224 }, { "epoch": 0.015970188980569604, "grad_norm": 0.4130800664424896, "learning_rate": 0.00018805644375988596, "loss": 0.8984, "step": 225 }, { "epoch": 0.016041167598261023, "grad_norm": 0.3562272787094116, "learning_rate": 0.0001879473751206489, "loss": 0.9989, "step": 226 }, { "epoch": 0.016112146215952444, "grad_norm": 0.5637345910072327, "learning_rate": 0.0001878378426605893, "loss": 1.1139, "step": 227 }, { "epoch": 0.016183124833643866, "grad_norm": 0.49348559975624084, "learning_rate": 0.0001877278469573643, "loss": 0.9959, "step": 228 }, { "epoch": 0.016254103451335284, "grad_norm": 0.43761688470840454, "learning_rate": 0.00018761738859107422, "loss": 0.7426, "step": 229 }, { "epoch": 0.016325082069026706, "grad_norm": 0.4174177646636963, "learning_rate": 0.00018750646814425938, "loss": 0.7377, "step": 230 }, { "epoch": 0.016396060686718127, "grad_norm": 0.3912721872329712, "learning_rate": 0.000187395086201897, "loss": 0.7679, "step": 231 }, { "epoch": 0.016467039304409545, "grad_norm": 0.4349600672721863, "learning_rate": 0.00018728324335139814, "loss": 0.8902, "step": 232 }, { "epoch": 0.016538017922100967, "grad_norm": 0.427292138338089, "learning_rate": 0.00018717094018260474, "loss": 1.0749, "step": 233 }, { "epoch": 0.01660899653979239, "grad_norm": 0.38427236676216125, "learning_rate": 0.00018705817728778624, "loss": 0.7116, "step": 234 }, { "epoch": 0.016679975157483807, "grad_norm": 0.550449550151825, "learning_rate": 0.0001869449552616367, "loss": 0.7448, "step": 235 }, { "epoch": 0.01675095377517523, "grad_norm": 0.8102374076843262, "learning_rate": 0.0001868312747012715, "loss": 1.1299, "step": 236 }, { "epoch": 0.01682193239286665, "grad_norm": 0.5067493915557861, "learning_rate": 0.00018671713620622434, "loss": 0.9349, "step": 237 }, { "epoch": 0.01689291101055807, "grad_norm": 0.45576345920562744, "learning_rate": 0.00018660254037844388, "loss": 1.1479, "step": 238 }, { "epoch": 0.01696388962824949, "grad_norm": 0.44452664256095886, "learning_rate": 0.0001864874878222908, "loss": 1.1202, "step": 239 }, { "epoch": 0.01703486824594091, "grad_norm": 0.570928156375885, "learning_rate": 0.00018637197914453445, "loss": 0.8098, "step": 240 }, { "epoch": 0.01710584686363233, "grad_norm": 0.49109429121017456, "learning_rate": 0.00018625601495434965, "loss": 1.1551, "step": 241 }, { "epoch": 0.01717682548132375, "grad_norm": 0.5388101935386658, "learning_rate": 0.00018613959586331362, "loss": 0.9084, "step": 242 }, { "epoch": 0.017247804099015173, "grad_norm": 0.39581820368766785, "learning_rate": 0.00018602272248540252, "loss": 0.7484, "step": 243 }, { "epoch": 0.01731878271670659, "grad_norm": 0.4832267463207245, "learning_rate": 0.00018590539543698854, "loss": 0.8394, "step": 244 }, { "epoch": 0.017389761334398013, "grad_norm": 0.6111173629760742, "learning_rate": 0.00018578761533683623, "loss": 0.9625, "step": 245 }, { "epoch": 0.017460739952089434, "grad_norm": 0.4213511645793915, "learning_rate": 0.00018566938280609966, "loss": 0.7921, "step": 246 }, { "epoch": 0.017531718569780853, "grad_norm": 0.48381322622299194, "learning_rate": 0.0001855506984683188, "loss": 0.7962, "step": 247 }, { "epoch": 0.017602697187472274, "grad_norm": 0.3988328278064728, "learning_rate": 0.0001854315629494165, "loss": 0.8441, "step": 248 }, { "epoch": 0.017673675805163696, "grad_norm": 0.4115743935108185, "learning_rate": 0.00018531197687769502, "loss": 0.742, "step": 249 }, { "epoch": 0.017744654422855114, "grad_norm": 0.4145933985710144, "learning_rate": 0.00018519194088383273, "loss": 0.7554, "step": 250 }, { "epoch": 0.017815633040546536, "grad_norm": 0.4477425515651703, "learning_rate": 0.00018507145560088086, "loss": 0.8286, "step": 251 }, { "epoch": 0.017886611658237957, "grad_norm": 0.5898309946060181, "learning_rate": 0.00018495052166426015, "loss": 1.0697, "step": 252 }, { "epoch": 0.017957590275929376, "grad_norm": 0.540810227394104, "learning_rate": 0.00018482913971175737, "loss": 0.8028, "step": 253 }, { "epoch": 0.018028568893620797, "grad_norm": 0.3993106782436371, "learning_rate": 0.0001847073103835222, "loss": 0.8614, "step": 254 }, { "epoch": 0.01809954751131222, "grad_norm": 0.6189174056053162, "learning_rate": 0.00018458503432206358, "loss": 0.7428, "step": 255 }, { "epoch": 0.018170526129003637, "grad_norm": 1.559867024421692, "learning_rate": 0.0001844623121722465, "loss": 1.0642, "step": 256 }, { "epoch": 0.01824150474669506, "grad_norm": 0.4580833315849304, "learning_rate": 0.0001843391445812886, "loss": 0.9229, "step": 257 }, { "epoch": 0.01831248336438648, "grad_norm": 0.37486153841018677, "learning_rate": 0.00018421553219875658, "loss": 0.7479, "step": 258 }, { "epoch": 0.0183834619820779, "grad_norm": 0.3566621243953705, "learning_rate": 0.00018409147567656305, "loss": 0.9224, "step": 259 }, { "epoch": 0.01845444059976932, "grad_norm": 0.4004257917404175, "learning_rate": 0.00018396697566896286, "loss": 1.0313, "step": 260 }, { "epoch": 0.018525419217460738, "grad_norm": 0.683728039264679, "learning_rate": 0.00018384203283254975, "loss": 0.9659, "step": 261 }, { "epoch": 0.01859639783515216, "grad_norm": 0.4134996235370636, "learning_rate": 0.00018371664782625287, "loss": 0.7003, "step": 262 }, { "epoch": 0.01866737645284358, "grad_norm": 0.36783143877983093, "learning_rate": 0.00018359082131133328, "loss": 0.7422, "step": 263 }, { "epoch": 0.018738355070535, "grad_norm": 0.5012601613998413, "learning_rate": 0.00018346455395138058, "loss": 0.7691, "step": 264 }, { "epoch": 0.01880933368822642, "grad_norm": 0.4422801733016968, "learning_rate": 0.0001833378464123092, "loss": 1.1521, "step": 265 }, { "epoch": 0.018880312305917843, "grad_norm": 0.4929892420768738, "learning_rate": 0.00018321069936235503, "loss": 0.7935, "step": 266 }, { "epoch": 0.01895129092360926, "grad_norm": 0.47165802121162415, "learning_rate": 0.00018308311347207194, "loss": 1.0542, "step": 267 }, { "epoch": 0.019022269541300683, "grad_norm": 0.5729721188545227, "learning_rate": 0.00018295508941432815, "loss": 0.9696, "step": 268 }, { "epoch": 0.019093248158992104, "grad_norm": 0.49791082739830017, "learning_rate": 0.00018282662786430268, "loss": 0.9136, "step": 269 }, { "epoch": 0.019164226776683523, "grad_norm": 0.6356544494628906, "learning_rate": 0.00018269772949948182, "loss": 0.6927, "step": 270 }, { "epoch": 0.019235205394374944, "grad_norm": 0.4809679090976715, "learning_rate": 0.0001825683949996556, "loss": 1.0192, "step": 271 }, { "epoch": 0.019306184012066366, "grad_norm": 0.8236749172210693, "learning_rate": 0.00018243862504691407, "loss": 1.1338, "step": 272 }, { "epoch": 0.019377162629757784, "grad_norm": 0.4069054424762726, "learning_rate": 0.00018230842032564387, "loss": 0.7399, "step": 273 }, { "epoch": 0.019448141247449206, "grad_norm": 0.43730130791664124, "learning_rate": 0.0001821777815225245, "loss": 0.8541, "step": 274 }, { "epoch": 0.019519119865140627, "grad_norm": 0.6162770986557007, "learning_rate": 0.00018204670932652482, "loss": 0.9537, "step": 275 }, { "epoch": 0.019590098482832045, "grad_norm": 0.6175110340118408, "learning_rate": 0.0001819152044288992, "loss": 1.1993, "step": 276 }, { "epoch": 0.019661077100523467, "grad_norm": 1.2285798788070679, "learning_rate": 0.00018178326752318408, "loss": 0.7926, "step": 277 }, { "epoch": 0.01973205571821489, "grad_norm": 0.4019002616405487, "learning_rate": 0.0001816508993051943, "loss": 0.6177, "step": 278 }, { "epoch": 0.019803034335906307, "grad_norm": 0.7058013677597046, "learning_rate": 0.0001815181004730193, "loss": 1.4354, "step": 279 }, { "epoch": 0.01987401295359773, "grad_norm": 0.5000433921813965, "learning_rate": 0.0001813848717270195, "loss": 0.9086, "step": 280 }, { "epoch": 0.01994499157128915, "grad_norm": 0.4633633494377136, "learning_rate": 0.0001812512137698227, "loss": 1.0117, "step": 281 }, { "epoch": 0.02001597018898057, "grad_norm": 0.4790281057357788, "learning_rate": 0.00018111712730632022, "loss": 0.9698, "step": 282 }, { "epoch": 0.02008694880667199, "grad_norm": 0.42838573455810547, "learning_rate": 0.00018098261304366333, "loss": 0.7362, "step": 283 }, { "epoch": 0.02015792742436341, "grad_norm": 0.43362554907798767, "learning_rate": 0.00018084767169125932, "loss": 0.9521, "step": 284 }, { "epoch": 0.02022890604205483, "grad_norm": 0.6145448684692383, "learning_rate": 0.00018071230396076801, "loss": 0.855, "step": 285 }, { "epoch": 0.02029988465974625, "grad_norm": 0.4333368241786957, "learning_rate": 0.00018057651056609784, "loss": 0.7893, "step": 286 }, { "epoch": 0.020370863277437673, "grad_norm": 0.5832470059394836, "learning_rate": 0.00018044029222340205, "loss": 1.1255, "step": 287 }, { "epoch": 0.02044184189512909, "grad_norm": 0.4958140254020691, "learning_rate": 0.0001803036496510752, "loss": 0.9768, "step": 288 }, { "epoch": 0.020512820512820513, "grad_norm": 0.6027165651321411, "learning_rate": 0.00018016658356974884, "loss": 0.9163, "step": 289 }, { "epoch": 0.020583799130511934, "grad_norm": 0.6058111190795898, "learning_rate": 0.00018002909470228842, "loss": 1.1876, "step": 290 }, { "epoch": 0.020654777748203353, "grad_norm": 0.422075092792511, "learning_rate": 0.0001798911837737888, "loss": 0.85, "step": 291 }, { "epoch": 0.020725756365894774, "grad_norm": 0.43910181522369385, "learning_rate": 0.0001797528515115709, "loss": 0.8274, "step": 292 }, { "epoch": 0.020796734983586196, "grad_norm": 0.665794849395752, "learning_rate": 0.00017961409864517756, "loss": 1.5863, "step": 293 }, { "epoch": 0.020867713601277614, "grad_norm": 0.5460638403892517, "learning_rate": 0.00017947492590637, "loss": 1.1745, "step": 294 }, { "epoch": 0.020938692218969036, "grad_norm": 0.64398592710495, "learning_rate": 0.00017933533402912354, "loss": 1.3274, "step": 295 }, { "epoch": 0.021009670836660457, "grad_norm": 0.5168805122375488, "learning_rate": 0.00017919532374962416, "loss": 1.1043, "step": 296 }, { "epoch": 0.021080649454351875, "grad_norm": 0.5605073571205139, "learning_rate": 0.00017905489580626437, "loss": 1.303, "step": 297 }, { "epoch": 0.021151628072043297, "grad_norm": 0.6793704032897949, "learning_rate": 0.00017891405093963938, "loss": 1.2249, "step": 298 }, { "epoch": 0.02122260668973472, "grad_norm": 0.7593535780906677, "learning_rate": 0.00017877278989254317, "loss": 1.0362, "step": 299 }, { "epoch": 0.021293585307426137, "grad_norm": 0.5288353562355042, "learning_rate": 0.00017863111340996458, "loss": 1.0863, "step": 300 }, { "epoch": 0.02136456392511756, "grad_norm": 0.6947289109230042, "learning_rate": 0.00017848902223908345, "loss": 1.3717, "step": 301 }, { "epoch": 0.02143554254280898, "grad_norm": 0.6794065833091736, "learning_rate": 0.00017834651712926662, "loss": 0.8269, "step": 302 }, { "epoch": 0.0215065211605004, "grad_norm": 0.4852735698223114, "learning_rate": 0.00017820359883206387, "loss": 0.8961, "step": 303 }, { "epoch": 0.02157749977819182, "grad_norm": 0.5103346705436707, "learning_rate": 0.00017806026810120423, "loss": 0.848, "step": 304 }, { "epoch": 0.02164847839588324, "grad_norm": 0.37829336524009705, "learning_rate": 0.00017791652569259164, "loss": 0.8279, "step": 305 }, { "epoch": 0.02171945701357466, "grad_norm": 0.4773477613925934, "learning_rate": 0.0001777723723643014, "loss": 1.1607, "step": 306 }, { "epoch": 0.02179043563126608, "grad_norm": 0.4693739116191864, "learning_rate": 0.00017762780887657574, "loss": 1.2594, "step": 307 }, { "epoch": 0.021861414248957503, "grad_norm": 0.5650644898414612, "learning_rate": 0.00017748283599182014, "loss": 1.3446, "step": 308 }, { "epoch": 0.02193239286664892, "grad_norm": 0.4361521303653717, "learning_rate": 0.00017733745447459905, "loss": 0.8959, "step": 309 }, { "epoch": 0.022003371484340343, "grad_norm": 0.6202030777931213, "learning_rate": 0.0001771916650916321, "loss": 1.0443, "step": 310 }, { "epoch": 0.022074350102031765, "grad_norm": 0.6412432789802551, "learning_rate": 0.0001770454686117899, "loss": 1.2219, "step": 311 }, { "epoch": 0.022145328719723183, "grad_norm": 0.5516712069511414, "learning_rate": 0.00017689886580608998, "loss": 1.1871, "step": 312 }, { "epoch": 0.022216307337414604, "grad_norm": 0.4452298879623413, "learning_rate": 0.00017675185744769287, "loss": 0.8516, "step": 313 }, { "epoch": 0.022287285955106026, "grad_norm": 0.7187719941139221, "learning_rate": 0.0001766044443118978, "loss": 1.3354, "step": 314 }, { "epoch": 0.022358264572797444, "grad_norm": 0.5408218502998352, "learning_rate": 0.00017645662717613884, "loss": 1.1112, "step": 315 }, { "epoch": 0.022429243190488866, "grad_norm": 0.348739892244339, "learning_rate": 0.00017630840681998066, "loss": 1.0297, "step": 316 }, { "epoch": 0.022500221808180287, "grad_norm": 0.44230809807777405, "learning_rate": 0.0001761597840251144, "loss": 0.8506, "step": 317 }, { "epoch": 0.022571200425871706, "grad_norm": 0.4578515589237213, "learning_rate": 0.00017601075957535364, "loss": 0.8387, "step": 318 }, { "epoch": 0.022642179043563127, "grad_norm": 0.628984808921814, "learning_rate": 0.00017586133425663027, "loss": 1.3108, "step": 319 }, { "epoch": 0.022713157661254545, "grad_norm": 0.5663823485374451, "learning_rate": 0.00017571150885699023, "loss": 1.2816, "step": 320 }, { "epoch": 0.022784136278945967, "grad_norm": 0.36888179183006287, "learning_rate": 0.00017556128416658942, "loss": 0.9029, "step": 321 }, { "epoch": 0.02285511489663739, "grad_norm": 0.5662537813186646, "learning_rate": 0.00017541066097768963, "loss": 1.0153, "step": 322 }, { "epoch": 0.022926093514328807, "grad_norm": 0.5493496656417847, "learning_rate": 0.00017525964008465418, "loss": 1.169, "step": 323 }, { "epoch": 0.02299707213202023, "grad_norm": 0.5340925455093384, "learning_rate": 0.00017510822228394385, "loss": 0.7783, "step": 324 }, { "epoch": 0.02306805074971165, "grad_norm": 0.4661863446235657, "learning_rate": 0.0001749564083741126, "loss": 0.6926, "step": 325 }, { "epoch": 0.023139029367403068, "grad_norm": 0.4688137471675873, "learning_rate": 0.00017480419915580356, "loss": 0.9131, "step": 326 }, { "epoch": 0.02321000798509449, "grad_norm": 0.5179314017295837, "learning_rate": 0.00017465159543174444, "loss": 0.9808, "step": 327 }, { "epoch": 0.02328098660278591, "grad_norm": 0.4717937111854553, "learning_rate": 0.00017449859800674371, "loss": 0.8589, "step": 328 }, { "epoch": 0.02335196522047733, "grad_norm": 0.5106039643287659, "learning_rate": 0.00017434520768768602, "loss": 1.004, "step": 329 }, { "epoch": 0.02342294383816875, "grad_norm": 0.5373323559761047, "learning_rate": 0.00017419142528352817, "loss": 0.8033, "step": 330 }, { "epoch": 0.023493922455860173, "grad_norm": 0.5274369120597839, "learning_rate": 0.0001740372516052947, "loss": 1.1962, "step": 331 }, { "epoch": 0.02356490107355159, "grad_norm": 0.4952099621295929, "learning_rate": 0.0001738826874660737, "loss": 0.7968, "step": 332 }, { "epoch": 0.023635879691243013, "grad_norm": 0.3695271909236908, "learning_rate": 0.0001737277336810124, "loss": 0.5438, "step": 333 }, { "epoch": 0.023706858308934434, "grad_norm": 0.492368221282959, "learning_rate": 0.00017357239106731317, "loss": 1.1246, "step": 334 }, { "epoch": 0.023777836926625853, "grad_norm": 0.6120978593826294, "learning_rate": 0.0001734166604442288, "loss": 0.9406, "step": 335 }, { "epoch": 0.023848815544317274, "grad_norm": 0.4556725025177002, "learning_rate": 0.00017326054263305847, "loss": 0.8216, "step": 336 }, { "epoch": 0.023919794162008696, "grad_norm": 0.4907512068748474, "learning_rate": 0.0001731040384571433, "loss": 0.7905, "step": 337 }, { "epoch": 0.023990772779700114, "grad_norm": 0.5129969120025635, "learning_rate": 0.0001729471487418621, "loss": 0.8803, "step": 338 }, { "epoch": 0.024061751397391536, "grad_norm": 0.503262996673584, "learning_rate": 0.00017278987431462684, "loss": 0.8183, "step": 339 }, { "epoch": 0.024132730015082957, "grad_norm": 0.4988924264907837, "learning_rate": 0.00017263221600487852, "loss": 1.0483, "step": 340 }, { "epoch": 0.024203708632774375, "grad_norm": 0.6246508359909058, "learning_rate": 0.00017247417464408258, "loss": 1.1449, "step": 341 }, { "epoch": 0.024274687250465797, "grad_norm": 0.4745483994483948, "learning_rate": 0.00017231575106572467, "loss": 1.0851, "step": 342 }, { "epoch": 0.02434566586815722, "grad_norm": 0.4907170534133911, "learning_rate": 0.0001721569461053062, "loss": 0.8655, "step": 343 }, { "epoch": 0.024416644485848637, "grad_norm": 0.502733588218689, "learning_rate": 0.00017199776060033997, "loss": 0.9557, "step": 344 }, { "epoch": 0.02448762310354006, "grad_norm": 0.4516993761062622, "learning_rate": 0.00017183819539034554, "loss": 0.9933, "step": 345 }, { "epoch": 0.02448762310354006, "eval_loss": 1.17430579662323, "eval_runtime": 314.5705, "eval_samples_per_second": 18.861, "eval_steps_per_second": 9.432, "step": 345 }, { "epoch": 0.02455860172123148, "grad_norm": 0.40883591771125793, "learning_rate": 0.00017167825131684513, "loss": 0.8141, "step": 346 }, { "epoch": 0.0246295803389229, "grad_norm": 0.35272955894470215, "learning_rate": 0.00017151792922335903, "loss": 0.9703, "step": 347 }, { "epoch": 0.02470055895661432, "grad_norm": 0.4392254948616028, "learning_rate": 0.00017135722995540107, "loss": 0.8744, "step": 348 }, { "epoch": 0.02477153757430574, "grad_norm": 0.5151838660240173, "learning_rate": 0.0001711961543604743, "loss": 0.8671, "step": 349 }, { "epoch": 0.02484251619199716, "grad_norm": 0.5394666790962219, "learning_rate": 0.0001710347032880664, "loss": 0.9802, "step": 350 }, { "epoch": 0.02491349480968858, "grad_norm": 0.356200248003006, "learning_rate": 0.00017087287758964538, "loss": 1.3239, "step": 351 }, { "epoch": 0.024984473427380003, "grad_norm": 0.8448062539100647, "learning_rate": 0.00017071067811865476, "loss": 1.3531, "step": 352 }, { "epoch": 0.02505545204507142, "grad_norm": 0.7906566262245178, "learning_rate": 0.0001705481057305095, "loss": 1.2485, "step": 353 }, { "epoch": 0.025126430662762843, "grad_norm": 0.4532487094402313, "learning_rate": 0.00017038516128259115, "loss": 0.9028, "step": 354 }, { "epoch": 0.025197409280454264, "grad_norm": 0.4616987705230713, "learning_rate": 0.00017022184563424348, "loss": 0.8315, "step": 355 }, { "epoch": 0.025268387898145683, "grad_norm": 0.5591869950294495, "learning_rate": 0.00017005815964676787, "loss": 1.0024, "step": 356 }, { "epoch": 0.025339366515837104, "grad_norm": 0.5752520561218262, "learning_rate": 0.00016989410418341886, "loss": 0.7582, "step": 357 }, { "epoch": 0.025410345133528526, "grad_norm": 0.616784393787384, "learning_rate": 0.00016972968010939954, "loss": 1.2454, "step": 358 }, { "epoch": 0.025481323751219944, "grad_norm": 0.3129301071166992, "learning_rate": 0.000169564888291857, "loss": 0.77, "step": 359 }, { "epoch": 0.025552302368911366, "grad_norm": 0.7636944055557251, "learning_rate": 0.0001693997295998777, "loss": 1.3112, "step": 360 }, { "epoch": 0.025623280986602787, "grad_norm": 0.4161663055419922, "learning_rate": 0.00016923420490448296, "loss": 0.9419, "step": 361 }, { "epoch": 0.025694259604294205, "grad_norm": 0.5600863099098206, "learning_rate": 0.00016906831507862443, "loss": 0.8739, "step": 362 }, { "epoch": 0.025765238221985627, "grad_norm": 0.5836548805236816, "learning_rate": 0.00016890206099717933, "loss": 0.8755, "step": 363 }, { "epoch": 0.02583621683967705, "grad_norm": 0.6149469017982483, "learning_rate": 0.00016873544353694588, "loss": 1.1966, "step": 364 }, { "epoch": 0.025907195457368467, "grad_norm": 0.7984243035316467, "learning_rate": 0.00016856846357663874, "loss": 1.1527, "step": 365 }, { "epoch": 0.02597817407505989, "grad_norm": 0.5544553399085999, "learning_rate": 0.00016840112199688432, "loss": 0.9552, "step": 366 }, { "epoch": 0.02604915269275131, "grad_norm": 0.5867147445678711, "learning_rate": 0.0001682334196802162, "loss": 1.3768, "step": 367 }, { "epoch": 0.02612013131044273, "grad_norm": 0.49774566292762756, "learning_rate": 0.00016806535751107037, "loss": 0.8269, "step": 368 }, { "epoch": 0.02619110992813415, "grad_norm": 0.44314420223236084, "learning_rate": 0.0001678969363757807, "loss": 1.0084, "step": 369 }, { "epoch": 0.02626208854582557, "grad_norm": 0.5219785571098328, "learning_rate": 0.00016772815716257412, "loss": 1.0319, "step": 370 }, { "epoch": 0.02633306716351699, "grad_norm": 0.4038362503051758, "learning_rate": 0.00016755902076156604, "loss": 0.7497, "step": 371 }, { "epoch": 0.02640404578120841, "grad_norm": 0.5352762341499329, "learning_rate": 0.0001673895280647556, "loss": 1.0418, "step": 372 }, { "epoch": 0.026475024398899833, "grad_norm": 0.6628060340881348, "learning_rate": 0.00016721967996602107, "loss": 0.9296, "step": 373 }, { "epoch": 0.02654600301659125, "grad_norm": 0.48506560921669006, "learning_rate": 0.00016704947736111492, "loss": 0.9518, "step": 374 }, { "epoch": 0.026616981634282673, "grad_norm": 0.5067095160484314, "learning_rate": 0.00016687892114765934, "loss": 1.0566, "step": 375 }, { "epoch": 0.026687960251974095, "grad_norm": 0.7099199295043945, "learning_rate": 0.00016670801222514134, "loss": 0.9235, "step": 376 }, { "epoch": 0.026758938869665513, "grad_norm": 0.44219255447387695, "learning_rate": 0.00016653675149490808, "loss": 1.1732, "step": 377 }, { "epoch": 0.026829917487356934, "grad_norm": 0.5791195631027222, "learning_rate": 0.00016636513986016213, "loss": 1.0645, "step": 378 }, { "epoch": 0.026900896105048352, "grad_norm": 0.3665172755718231, "learning_rate": 0.00016619317822595667, "loss": 0.9059, "step": 379 }, { "epoch": 0.026971874722739774, "grad_norm": 0.5197924971580505, "learning_rate": 0.00016602086749919063, "loss": 0.6917, "step": 380 }, { "epoch": 0.027042853340431196, "grad_norm": 0.47570809721946716, "learning_rate": 0.00016584820858860411, "loss": 0.7951, "step": 381 }, { "epoch": 0.027113831958122614, "grad_norm": 0.43381422758102417, "learning_rate": 0.00016567520240477344, "loss": 0.9001, "step": 382 }, { "epoch": 0.027184810575814036, "grad_norm": 0.3908829092979431, "learning_rate": 0.00016550184986010642, "loss": 0.7585, "step": 383 }, { "epoch": 0.027255789193505457, "grad_norm": 0.6746481657028198, "learning_rate": 0.00016532815186883748, "loss": 1.3776, "step": 384 }, { "epoch": 0.027326767811196875, "grad_norm": 0.7884753346443176, "learning_rate": 0.0001651541093470229, "loss": 1.2894, "step": 385 }, { "epoch": 0.027397746428888297, "grad_norm": 0.5592153072357178, "learning_rate": 0.000164979723212536, "loss": 1.1928, "step": 386 }, { "epoch": 0.02746872504657972, "grad_norm": 0.6361520290374756, "learning_rate": 0.00016480499438506212, "loss": 1.0849, "step": 387 }, { "epoch": 0.027539703664271137, "grad_norm": 0.5614412426948547, "learning_rate": 0.00016462992378609407, "loss": 1.0129, "step": 388 }, { "epoch": 0.02761068228196256, "grad_norm": 0.5978992581367493, "learning_rate": 0.00016445451233892714, "loss": 1.1642, "step": 389 }, { "epoch": 0.02768166089965398, "grad_norm": 0.43469658493995667, "learning_rate": 0.00016427876096865394, "loss": 0.9207, "step": 390 }, { "epoch": 0.027752639517345398, "grad_norm": 0.4649278521537781, "learning_rate": 0.00016410267060216007, "loss": 0.8428, "step": 391 }, { "epoch": 0.02782361813503682, "grad_norm": 0.656232476234436, "learning_rate": 0.00016392624216811879, "loss": 1.0931, "step": 392 }, { "epoch": 0.02789459675272824, "grad_norm": 0.4414312541484833, "learning_rate": 0.00016374947659698628, "loss": 0.9547, "step": 393 }, { "epoch": 0.02796557537041966, "grad_norm": 0.4081096947193146, "learning_rate": 0.00016357237482099684, "loss": 1.026, "step": 394 }, { "epoch": 0.02803655398811108, "grad_norm": 0.5185043215751648, "learning_rate": 0.00016339493777415767, "loss": 0.8768, "step": 395 }, { "epoch": 0.028107532605802503, "grad_norm": 0.5498956441879272, "learning_rate": 0.00016321716639224434, "loss": 0.65, "step": 396 }, { "epoch": 0.02817851122349392, "grad_norm": 0.5188130736351013, "learning_rate": 0.0001630390616127955, "loss": 1.0789, "step": 397 }, { "epoch": 0.028249489841185343, "grad_norm": 0.7840229868888855, "learning_rate": 0.0001628606243751082, "loss": 1.0992, "step": 398 }, { "epoch": 0.028320468458876764, "grad_norm": 0.5934768319129944, "learning_rate": 0.00016268185562023278, "loss": 1.4807, "step": 399 }, { "epoch": 0.028391447076568183, "grad_norm": 0.46113017201423645, "learning_rate": 0.00016250275629096786, "loss": 0.996, "step": 400 }, { "epoch": 0.028462425694259604, "grad_norm": 0.43795326352119446, "learning_rate": 0.00016232332733185557, "loss": 0.9035, "step": 401 }, { "epoch": 0.028533404311951026, "grad_norm": 0.5044823884963989, "learning_rate": 0.00016214356968917648, "loss": 0.856, "step": 402 }, { "epoch": 0.028604382929642444, "grad_norm": 0.5347551703453064, "learning_rate": 0.00016196348431094448, "loss": 1.0638, "step": 403 }, { "epoch": 0.028675361547333866, "grad_norm": 0.5653433799743652, "learning_rate": 0.00016178307214690193, "loss": 1.0553, "step": 404 }, { "epoch": 0.028746340165025287, "grad_norm": 0.984065592288971, "learning_rate": 0.00016160233414851463, "loss": 1.6807, "step": 405 }, { "epoch": 0.028817318782716705, "grad_norm": 0.5554366111755371, "learning_rate": 0.0001614212712689668, "loss": 1.0986, "step": 406 }, { "epoch": 0.028888297400408127, "grad_norm": 0.447272926568985, "learning_rate": 0.00016123988446315593, "loss": 0.934, "step": 407 }, { "epoch": 0.02895927601809955, "grad_norm": 0.38044825196266174, "learning_rate": 0.00016105817468768798, "loss": 0.9175, "step": 408 }, { "epoch": 0.029030254635790967, "grad_norm": 0.48493674397468567, "learning_rate": 0.00016087614290087208, "loss": 0.9096, "step": 409 }, { "epoch": 0.02910123325348239, "grad_norm": 0.7507244348526001, "learning_rate": 0.00016069379006271566, "loss": 1.2098, "step": 410 }, { "epoch": 0.02917221187117381, "grad_norm": 0.4828748106956482, "learning_rate": 0.0001605111171349194, "loss": 1.3215, "step": 411 }, { "epoch": 0.02924319048886523, "grad_norm": 0.43988776206970215, "learning_rate": 0.0001603281250808719, "loss": 0.8359, "step": 412 }, { "epoch": 0.02931416910655665, "grad_norm": 0.7423245906829834, "learning_rate": 0.00016014481486564493, "loss": 1.2535, "step": 413 }, { "epoch": 0.02938514772424807, "grad_norm": 0.4142375886440277, "learning_rate": 0.00015996118745598817, "loss": 0.839, "step": 414 }, { "epoch": 0.02945612634193949, "grad_norm": 0.5178642272949219, "learning_rate": 0.0001597772438203241, "loss": 1.2574, "step": 415 }, { "epoch": 0.02952710495963091, "grad_norm": 0.4112253189086914, "learning_rate": 0.00015959298492874288, "loss": 0.8162, "step": 416 }, { "epoch": 0.029598083577322333, "grad_norm": 0.44102925062179565, "learning_rate": 0.00015940841175299744, "loss": 0.8183, "step": 417 }, { "epoch": 0.02966906219501375, "grad_norm": 0.42544183135032654, "learning_rate": 0.00015922352526649803, "loss": 1.0383, "step": 418 }, { "epoch": 0.029740040812705173, "grad_norm": 0.4051035940647125, "learning_rate": 0.00015903832644430729, "loss": 0.7855, "step": 419 }, { "epoch": 0.029811019430396594, "grad_norm": 0.5159337520599365, "learning_rate": 0.00015885281626313517, "loss": 0.954, "step": 420 }, { "epoch": 0.029881998048088013, "grad_norm": 0.5515350103378296, "learning_rate": 0.0001586669957013336, "loss": 1.0254, "step": 421 }, { "epoch": 0.029952976665779434, "grad_norm": 0.5302817821502686, "learning_rate": 0.00015848086573889137, "loss": 1.0386, "step": 422 }, { "epoch": 0.030023955283470856, "grad_norm": 0.5249578952789307, "learning_rate": 0.00015829442735742908, "loss": 0.9024, "step": 423 }, { "epoch": 0.030094933901162274, "grad_norm": 0.5785447955131531, "learning_rate": 0.00015810768154019385, "loss": 1.4716, "step": 424 }, { "epoch": 0.030165912518853696, "grad_norm": 0.48202839493751526, "learning_rate": 0.00015792062927205412, "loss": 0.6959, "step": 425 }, { "epoch": 0.030236891136545117, "grad_norm": 0.4296504855155945, "learning_rate": 0.00015773327153949465, "loss": 0.9469, "step": 426 }, { "epoch": 0.030307869754236536, "grad_norm": 0.3921097218990326, "learning_rate": 0.00015754560933061102, "loss": 0.7949, "step": 427 }, { "epoch": 0.030378848371927957, "grad_norm": 0.33943453431129456, "learning_rate": 0.0001573576436351046, "loss": 0.564, "step": 428 }, { "epoch": 0.03044982698961938, "grad_norm": 0.41947242617607117, "learning_rate": 0.00015716937544427742, "loss": 0.7209, "step": 429 }, { "epoch": 0.030520805607310797, "grad_norm": 0.38738954067230225, "learning_rate": 0.00015698080575102661, "loss": 0.7105, "step": 430 }, { "epoch": 0.03059178422500222, "grad_norm": 0.4998133182525635, "learning_rate": 0.00015679193554983962, "loss": 0.8116, "step": 431 }, { "epoch": 0.03066276284269364, "grad_norm": 0.4546727240085602, "learning_rate": 0.00015660276583678853, "loss": 0.5981, "step": 432 }, { "epoch": 0.03073374146038506, "grad_norm": 0.5597979426383972, "learning_rate": 0.00015641329760952513, "loss": 0.96, "step": 433 }, { "epoch": 0.03080472007807648, "grad_norm": 0.38752710819244385, "learning_rate": 0.00015622353186727544, "loss": 0.7463, "step": 434 }, { "epoch": 0.030875698695767898, "grad_norm": 0.3666408061981201, "learning_rate": 0.00015603346961083457, "loss": 0.8058, "step": 435 }, { "epoch": 0.03094667731345932, "grad_norm": 0.4385254979133606, "learning_rate": 0.0001558431118425614, "loss": 0.8161, "step": 436 }, { "epoch": 0.03101765593115074, "grad_norm": 0.42853856086730957, "learning_rate": 0.00015565245956637337, "loss": 0.7179, "step": 437 }, { "epoch": 0.03108863454884216, "grad_norm": 0.4087078273296356, "learning_rate": 0.00015546151378774086, "loss": 0.809, "step": 438 }, { "epoch": 0.03115961316653358, "grad_norm": 0.49349793791770935, "learning_rate": 0.0001552702755136825, "loss": 0.8627, "step": 439 }, { "epoch": 0.031230591784225003, "grad_norm": 0.588937520980835, "learning_rate": 0.00015507874575275917, "loss": 1.1767, "step": 440 }, { "epoch": 0.031301570401916425, "grad_norm": 0.39766088128089905, "learning_rate": 0.00015488692551506915, "loss": 0.6619, "step": 441 }, { "epoch": 0.03137254901960784, "grad_norm": 0.469530314207077, "learning_rate": 0.00015469481581224272, "loss": 0.8312, "step": 442 }, { "epoch": 0.03144352763729926, "grad_norm": 0.5216483473777771, "learning_rate": 0.00015450241765743657, "loss": 0.9357, "step": 443 }, { "epoch": 0.031514506254990686, "grad_norm": 0.5060241222381592, "learning_rate": 0.00015430973206532878, "loss": 1.0924, "step": 444 }, { "epoch": 0.031585484872682104, "grad_norm": 0.5776031613349915, "learning_rate": 0.0001541167600521133, "loss": 1.1481, "step": 445 }, { "epoch": 0.03165646349037352, "grad_norm": 0.39201876521110535, "learning_rate": 0.0001539235026354946, "loss": 0.9276, "step": 446 }, { "epoch": 0.03172744210806495, "grad_norm": 0.6131280660629272, "learning_rate": 0.0001537299608346824, "loss": 1.1048, "step": 447 }, { "epoch": 0.031798420725756366, "grad_norm": 0.5436589121818542, "learning_rate": 0.00015353613567038607, "loss": 1.1316, "step": 448 }, { "epoch": 0.031869399343447784, "grad_norm": 0.44587400555610657, "learning_rate": 0.00015334202816480948, "loss": 0.652, "step": 449 }, { "epoch": 0.03194037796113921, "grad_norm": 0.602884829044342, "learning_rate": 0.0001531476393416456, "loss": 1.1943, "step": 450 }, { "epoch": 0.03201135657883063, "grad_norm": 0.49723750352859497, "learning_rate": 0.00015295297022607088, "loss": 0.9081, "step": 451 }, { "epoch": 0.032082335196522045, "grad_norm": 0.3994935154914856, "learning_rate": 0.0001527580218447401, "loss": 0.8847, "step": 452 }, { "epoch": 0.03215331381421347, "grad_norm": 0.41622379422187805, "learning_rate": 0.00015256279522578075, "loss": 0.7827, "step": 453 }, { "epoch": 0.03222429243190489, "grad_norm": 0.5180255174636841, "learning_rate": 0.00015236729139878782, "loss": 1.1626, "step": 454 }, { "epoch": 0.03229527104959631, "grad_norm": 0.43589210510253906, "learning_rate": 0.0001521715113948181, "loss": 1.0843, "step": 455 }, { "epoch": 0.03236624966728773, "grad_norm": 0.4477526843547821, "learning_rate": 0.00015197545624638504, "loss": 0.8436, "step": 456 }, { "epoch": 0.03243722828497915, "grad_norm": 0.4171714782714844, "learning_rate": 0.00015177912698745313, "loss": 0.9367, "step": 457 }, { "epoch": 0.03250820690267057, "grad_norm": 0.4946620762348175, "learning_rate": 0.00015158252465343242, "loss": 1.0863, "step": 458 }, { "epoch": 0.03257918552036199, "grad_norm": 0.42572730779647827, "learning_rate": 0.0001513856502811731, "loss": 0.9448, "step": 459 }, { "epoch": 0.03265016413805341, "grad_norm": 0.5282134413719177, "learning_rate": 0.00015118850490896012, "loss": 0.9628, "step": 460 }, { "epoch": 0.03272114275574483, "grad_norm": 0.5403441786766052, "learning_rate": 0.0001509910895765076, "loss": 1.2888, "step": 461 }, { "epoch": 0.032792121373436255, "grad_norm": 0.49020805954933167, "learning_rate": 0.00015079340532495343, "loss": 0.9225, "step": 462 }, { "epoch": 0.03286309999112767, "grad_norm": 0.3833196461200714, "learning_rate": 0.0001505954531968537, "loss": 1.0136, "step": 463 }, { "epoch": 0.03293407860881909, "grad_norm": 0.42853808403015137, "learning_rate": 0.0001503972342361772, "loss": 0.8872, "step": 464 }, { "epoch": 0.033005057226510516, "grad_norm": 0.5518404245376587, "learning_rate": 0.00015019874948830002, "loss": 0.969, "step": 465 }, { "epoch": 0.033076035844201934, "grad_norm": 0.4969305098056793, "learning_rate": 0.00015000000000000001, "loss": 1.0048, "step": 466 }, { "epoch": 0.03314701446189335, "grad_norm": 0.8780049085617065, "learning_rate": 0.0001498009868194511, "loss": 1.1976, "step": 467 }, { "epoch": 0.03321799307958478, "grad_norm": 0.5217463374137878, "learning_rate": 0.00014960171099621795, "loss": 1.1335, "step": 468 }, { "epoch": 0.033288971697276196, "grad_norm": 0.5528985857963562, "learning_rate": 0.00014940217358125042, "loss": 0.9732, "step": 469 }, { "epoch": 0.033359950314967614, "grad_norm": 0.4432036876678467, "learning_rate": 0.00014920237562687785, "loss": 1.3563, "step": 470 }, { "epoch": 0.03343092893265904, "grad_norm": 0.4422716200351715, "learning_rate": 0.00014900231818680367, "loss": 1.2711, "step": 471 }, { "epoch": 0.03350190755035046, "grad_norm": 0.4755992889404297, "learning_rate": 0.00014880200231609983, "loss": 1.0836, "step": 472 }, { "epoch": 0.033572886168041875, "grad_norm": 0.4565562903881073, "learning_rate": 0.00014860142907120117, "loss": 0.8823, "step": 473 }, { "epoch": 0.0336438647857333, "grad_norm": 0.6401200890541077, "learning_rate": 0.0001484005995098999, "loss": 1.0755, "step": 474 }, { "epoch": 0.03371484340342472, "grad_norm": 0.6158058047294617, "learning_rate": 0.00014819951469133996, "loss": 1.0649, "step": 475 }, { "epoch": 0.03378582202111614, "grad_norm": 0.424187034368515, "learning_rate": 0.00014799817567601157, "loss": 0.9622, "step": 476 }, { "epoch": 0.03385680063880756, "grad_norm": 0.473598837852478, "learning_rate": 0.00014779658352574547, "loss": 0.9844, "step": 477 }, { "epoch": 0.03392777925649898, "grad_norm": 0.7252926230430603, "learning_rate": 0.00014759473930370736, "loss": 0.9465, "step": 478 }, { "epoch": 0.0339987578741904, "grad_norm": 0.6348116993904114, "learning_rate": 0.0001473926440743924, "loss": 1.1721, "step": 479 }, { "epoch": 0.03406973649188182, "grad_norm": 0.4119262397289276, "learning_rate": 0.00014719029890361955, "loss": 0.8751, "step": 480 }, { "epoch": 0.03414071510957324, "grad_norm": 0.35753583908081055, "learning_rate": 0.0001469877048585258, "loss": 0.9371, "step": 481 }, { "epoch": 0.03421169372726466, "grad_norm": 0.5053995251655579, "learning_rate": 0.0001467848630075608, "loss": 1.1227, "step": 482 }, { "epoch": 0.034282672344956085, "grad_norm": 0.5652085542678833, "learning_rate": 0.000146581774420481, "loss": 0.953, "step": 483 }, { "epoch": 0.0343536509626475, "grad_norm": 0.573786199092865, "learning_rate": 0.00014637844016834406, "loss": 0.8142, "step": 484 }, { "epoch": 0.03442462958033892, "grad_norm": 0.48114916682243347, "learning_rate": 0.00014617486132350343, "loss": 0.8498, "step": 485 }, { "epoch": 0.034495608198030346, "grad_norm": 0.4647403955459595, "learning_rate": 0.00014597103895960226, "loss": 1.1702, "step": 486 }, { "epoch": 0.034566586815721764, "grad_norm": 0.5446001887321472, "learning_rate": 0.00014576697415156817, "loss": 1.1223, "step": 487 }, { "epoch": 0.03463756543341318, "grad_norm": 0.5652422904968262, "learning_rate": 0.00014556266797560732, "loss": 1.0664, "step": 488 }, { "epoch": 0.03470854405110461, "grad_norm": 0.7315934300422668, "learning_rate": 0.00014535812150919878, "loss": 0.9459, "step": 489 }, { "epoch": 0.034779522668796026, "grad_norm": 0.5323507785797119, "learning_rate": 0.00014515333583108896, "loss": 1.1672, "step": 490 }, { "epoch": 0.034850501286487444, "grad_norm": 0.4387359619140625, "learning_rate": 0.0001449483120212857, "loss": 0.8472, "step": 491 }, { "epoch": 0.03492147990417887, "grad_norm": 0.4899459779262543, "learning_rate": 0.00014474305116105284, "loss": 0.8763, "step": 492 }, { "epoch": 0.03499245852187029, "grad_norm": 0.5132838487625122, "learning_rate": 0.00014453755433290434, "loss": 1.0211, "step": 493 }, { "epoch": 0.035063437139561705, "grad_norm": 0.44308844208717346, "learning_rate": 0.0001443318226205986, "loss": 0.7512, "step": 494 }, { "epoch": 0.03513441575725313, "grad_norm": 0.4143776297569275, "learning_rate": 0.00014412585710913277, "loss": 1.0512, "step": 495 }, { "epoch": 0.03520539437494455, "grad_norm": 0.39649519324302673, "learning_rate": 0.00014391965888473703, "loss": 0.974, "step": 496 }, { "epoch": 0.03527637299263597, "grad_norm": 1.8992846012115479, "learning_rate": 0.00014371322903486888, "loss": 0.8682, "step": 497 }, { "epoch": 0.03534735161032739, "grad_norm": 0.415995717048645, "learning_rate": 0.00014350656864820733, "loss": 1.0104, "step": 498 }, { "epoch": 0.03541833022801881, "grad_norm": 0.5461487174034119, "learning_rate": 0.0001432996788146472, "loss": 1.0534, "step": 499 }, { "epoch": 0.03548930884571023, "grad_norm": 0.4715401530265808, "learning_rate": 0.00014309256062529344, "loss": 0.8942, "step": 500 }, { "epoch": 0.03556028746340165, "grad_norm": 0.4451175630092621, "learning_rate": 0.00014288521517245525, "loss": 0.8776, "step": 501 }, { "epoch": 0.03563126608109307, "grad_norm": 0.5925554633140564, "learning_rate": 0.00014267764354964038, "loss": 1.3517, "step": 502 }, { "epoch": 0.03570224469878449, "grad_norm": 0.3424084186553955, "learning_rate": 0.00014246984685154944, "loss": 0.8207, "step": 503 }, { "epoch": 0.035773223316475915, "grad_norm": 0.4916914701461792, "learning_rate": 0.00014226182617406996, "loss": 0.9486, "step": 504 }, { "epoch": 0.03584420193416733, "grad_norm": 0.4123711884021759, "learning_rate": 0.00014205358261427074, "loss": 0.646, "step": 505 }, { "epoch": 0.03591518055185875, "grad_norm": 0.5164076089859009, "learning_rate": 0.00014184511727039612, "loss": 0.8733, "step": 506 }, { "epoch": 0.035986159169550176, "grad_norm": 0.7842580676078796, "learning_rate": 0.0001416364312418599, "loss": 1.1441, "step": 507 }, { "epoch": 0.036057137787241594, "grad_norm": 0.5408650040626526, "learning_rate": 0.00014142752562923988, "loss": 1.3061, "step": 508 }, { "epoch": 0.03612811640493301, "grad_norm": 0.35656940937042236, "learning_rate": 0.00014121840153427193, "loss": 0.7223, "step": 509 }, { "epoch": 0.03619909502262444, "grad_norm": 0.46929752826690674, "learning_rate": 0.00014100906005984403, "loss": 0.879, "step": 510 }, { "epoch": 0.036270073640315856, "grad_norm": 0.5965684056282043, "learning_rate": 0.00014079950230999069, "loss": 1.0023, "step": 511 }, { "epoch": 0.036341052258007274, "grad_norm": 0.5516963601112366, "learning_rate": 0.000140589729389887, "loss": 0.9672, "step": 512 }, { "epoch": 0.0364120308756987, "grad_norm": 0.8615989089012146, "learning_rate": 0.00014037974240584282, "loss": 0.8098, "step": 513 }, { "epoch": 0.03648300949339012, "grad_norm": 0.3987727761268616, "learning_rate": 0.00014016954246529696, "loss": 0.8316, "step": 514 }, { "epoch": 0.036553988111081535, "grad_norm": 0.6568902134895325, "learning_rate": 0.0001399591306768113, "loss": 1.5254, "step": 515 }, { "epoch": 0.03662496672877296, "grad_norm": 0.8896635174751282, "learning_rate": 0.00013974850815006503, "loss": 0.7569, "step": 516 }, { "epoch": 0.03669594534646438, "grad_norm": 0.631305992603302, "learning_rate": 0.00013953767599584867, "loss": 0.9355, "step": 517 }, { "epoch": 0.0367669239641558, "grad_norm": 0.49458688497543335, "learning_rate": 0.0001393266353260583, "loss": 0.7543, "step": 518 }, { "epoch": 0.03683790258184722, "grad_norm": 0.41726812720298767, "learning_rate": 0.00013911538725368977, "loss": 0.7159, "step": 519 }, { "epoch": 0.03690888119953864, "grad_norm": 0.4371448755264282, "learning_rate": 0.0001389039328928326, "loss": 1.0797, "step": 520 }, { "epoch": 0.03697985981723006, "grad_norm": 0.525700032711029, "learning_rate": 0.00013869227335866434, "loss": 1.0843, "step": 521 }, { "epoch": 0.037050838434921476, "grad_norm": 0.4877499043941498, "learning_rate": 0.00013848040976744457, "loss": 0.8283, "step": 522 }, { "epoch": 0.0371218170526129, "grad_norm": 0.5594969391822815, "learning_rate": 0.000138268343236509, "loss": 1.0556, "step": 523 }, { "epoch": 0.03719279567030432, "grad_norm": 0.5910261273384094, "learning_rate": 0.00013805607488426362, "loss": 1.2204, "step": 524 }, { "epoch": 0.03726377428799574, "grad_norm": 0.5748972296714783, "learning_rate": 0.00013784360583017887, "loss": 0.734, "step": 525 }, { "epoch": 0.03733475290568716, "grad_norm": 0.6297950744628906, "learning_rate": 0.00013763093719478358, "loss": 0.8886, "step": 526 }, { "epoch": 0.03740573152337858, "grad_norm": 0.47521424293518066, "learning_rate": 0.00013741807009965911, "loss": 0.8331, "step": 527 }, { "epoch": 0.03747671014107, "grad_norm": 0.43686890602111816, "learning_rate": 0.00013720500566743362, "loss": 1.1703, "step": 528 }, { "epoch": 0.037547688758761424, "grad_norm": 0.6339263319969177, "learning_rate": 0.00013699174502177582, "loss": 1.084, "step": 529 }, { "epoch": 0.03761866737645284, "grad_norm": 0.8485615253448486, "learning_rate": 0.00013677828928738934, "loss": 0.9832, "step": 530 }, { "epoch": 0.03768964599414426, "grad_norm": 0.710398256778717, "learning_rate": 0.0001365646395900066, "loss": 1.1895, "step": 531 }, { "epoch": 0.037760624611835686, "grad_norm": 0.5375249981880188, "learning_rate": 0.00013635079705638298, "loss": 1.0867, "step": 532 }, { "epoch": 0.037831603229527104, "grad_norm": 0.4536316692829132, "learning_rate": 0.0001361367628142909, "loss": 1.0096, "step": 533 }, { "epoch": 0.03790258184721852, "grad_norm": 0.5407841801643372, "learning_rate": 0.00013592253799251376, "loss": 1.1234, "step": 534 }, { "epoch": 0.03797356046490995, "grad_norm": 0.41910848021507263, "learning_rate": 0.00013570812372083998, "loss": 0.7564, "step": 535 }, { "epoch": 0.038044539082601365, "grad_norm": 0.5602056980133057, "learning_rate": 0.00013549352113005728, "loss": 1.2408, "step": 536 }, { "epoch": 0.038115517700292784, "grad_norm": 0.4589637815952301, "learning_rate": 0.00013527873135194644, "loss": 0.9453, "step": 537 }, { "epoch": 0.03818649631798421, "grad_norm": 0.531792163848877, "learning_rate": 0.00013506375551927547, "loss": 1.281, "step": 538 }, { "epoch": 0.03825747493567563, "grad_norm": 0.7548795342445374, "learning_rate": 0.0001348485947657935, "loss": 1.3972, "step": 539 }, { "epoch": 0.038328453553367045, "grad_norm": 0.5263999700546265, "learning_rate": 0.00013463325022622507, "loss": 1.1422, "step": 540 }, { "epoch": 0.03839943217105847, "grad_norm": 0.9256210923194885, "learning_rate": 0.00013441772303626387, "loss": 1.3945, "step": 541 }, { "epoch": 0.03847041078874989, "grad_norm": 0.5024957656860352, "learning_rate": 0.00013420201433256689, "loss": 0.9851, "step": 542 }, { "epoch": 0.038541389406441307, "grad_norm": 0.5249804258346558, "learning_rate": 0.0001339861252527484, "loss": 1.0699, "step": 543 }, { "epoch": 0.03861236802413273, "grad_norm": 0.5011900067329407, "learning_rate": 0.0001337700569353739, "loss": 0.9208, "step": 544 }, { "epoch": 0.03868334664182415, "grad_norm": 0.501251757144928, "learning_rate": 0.00013355381051995428, "loss": 1.0151, "step": 545 }, { "epoch": 0.03875432525951557, "grad_norm": 0.5241694450378418, "learning_rate": 0.00013333738714693956, "loss": 0.9274, "step": 546 }, { "epoch": 0.03882530387720699, "grad_norm": 0.5149634480476379, "learning_rate": 0.00013312078795771305, "loss": 0.995, "step": 547 }, { "epoch": 0.03889628249489841, "grad_norm": 0.44491928815841675, "learning_rate": 0.00013290401409458532, "loss": 0.8227, "step": 548 }, { "epoch": 0.03896726111258983, "grad_norm": 0.682807207107544, "learning_rate": 0.00013268706670078813, "loss": 0.9766, "step": 549 }, { "epoch": 0.039038239730281254, "grad_norm": 0.528838574886322, "learning_rate": 0.00013246994692046836, "loss": 0.9775, "step": 550 }, { "epoch": 0.03910921834797267, "grad_norm": 0.43190521001815796, "learning_rate": 0.0001322526558986821, "loss": 1.0157, "step": 551 }, { "epoch": 0.03918019696566409, "grad_norm": 0.6751129031181335, "learning_rate": 0.00013203519478138852, "loss": 1.1538, "step": 552 }, { "epoch": 0.039251175583355516, "grad_norm": 0.570706307888031, "learning_rate": 0.00013181756471544382, "loss": 0.9594, "step": 553 }, { "epoch": 0.039322154201046934, "grad_norm": 0.5315188765525818, "learning_rate": 0.00013159976684859527, "loss": 1.0076, "step": 554 }, { "epoch": 0.03939313281873835, "grad_norm": 0.5034574866294861, "learning_rate": 0.00013138180232947502, "loss": 1.0435, "step": 555 }, { "epoch": 0.03946411143642978, "grad_norm": 0.4046081304550171, "learning_rate": 0.00013116367230759415, "loss": 0.7837, "step": 556 }, { "epoch": 0.039535090054121196, "grad_norm": 0.6341990232467651, "learning_rate": 0.00013094537793333658, "loss": 1.2637, "step": 557 }, { "epoch": 0.039606068671812614, "grad_norm": 0.718720555305481, "learning_rate": 0.00013072692035795305, "loss": 1.044, "step": 558 }, { "epoch": 0.03967704728950404, "grad_norm": 0.5487260818481445, "learning_rate": 0.00013050830073355488, "loss": 0.8928, "step": 559 }, { "epoch": 0.03974802590719546, "grad_norm": 0.6338328719139099, "learning_rate": 0.00013028952021310812, "loss": 1.1605, "step": 560 }, { "epoch": 0.039819004524886875, "grad_norm": 0.6349433064460754, "learning_rate": 0.00013007057995042732, "loss": 1.1042, "step": 561 }, { "epoch": 0.0398899831425783, "grad_norm": 0.5027518272399902, "learning_rate": 0.00012985148110016947, "loss": 1.0426, "step": 562 }, { "epoch": 0.03996096176026972, "grad_norm": 0.5513244271278381, "learning_rate": 0.00012963222481782792, "loss": 1.1094, "step": 563 }, { "epoch": 0.04003194037796114, "grad_norm": 0.49723491072654724, "learning_rate": 0.00012941281225972636, "loss": 1.1326, "step": 564 }, { "epoch": 0.04010291899565256, "grad_norm": 0.4548230469226837, "learning_rate": 0.00012919324458301258, "loss": 0.9863, "step": 565 }, { "epoch": 0.04017389761334398, "grad_norm": 0.4577331840991974, "learning_rate": 0.0001289735229456525, "loss": 1.2053, "step": 566 }, { "epoch": 0.0402448762310354, "grad_norm": 0.4611161947250366, "learning_rate": 0.0001287536485064239, "loss": 0.815, "step": 567 }, { "epoch": 0.04031585484872682, "grad_norm": 0.6065758466720581, "learning_rate": 0.00012853362242491053, "loss": 1.277, "step": 568 }, { "epoch": 0.04038683346641824, "grad_norm": 0.49618154764175415, "learning_rate": 0.00012831344586149585, "loss": 1.0705, "step": 569 }, { "epoch": 0.04045781208410966, "grad_norm": 0.4948149025440216, "learning_rate": 0.00012809311997735696, "loss": 0.9454, "step": 570 }, { "epoch": 0.040528790701801085, "grad_norm": 0.5248857140541077, "learning_rate": 0.0001278726459344583, "loss": 0.7728, "step": 571 }, { "epoch": 0.0405997693194925, "grad_norm": 0.482176810503006, "learning_rate": 0.0001276520248955459, "loss": 1.0844, "step": 572 }, { "epoch": 0.04067074793718392, "grad_norm": 0.5663802623748779, "learning_rate": 0.0001274312580241409, "loss": 1.1882, "step": 573 }, { "epoch": 0.040741726554875346, "grad_norm": 0.46108278632164, "learning_rate": 0.00012721034648453353, "loss": 1.0908, "step": 574 }, { "epoch": 0.040812705172566764, "grad_norm": 0.39203080534935, "learning_rate": 0.00012698929144177698, "loss": 0.8045, "step": 575 }, { "epoch": 0.04088368379025818, "grad_norm": 0.5454275608062744, "learning_rate": 0.00012676809406168133, "loss": 0.9215, "step": 576 }, { "epoch": 0.04095466240794961, "grad_norm": 0.4310530424118042, "learning_rate": 0.00012654675551080724, "loss": 1.0449, "step": 577 }, { "epoch": 0.041025641025641026, "grad_norm": 0.383184015750885, "learning_rate": 0.00012632527695645993, "loss": 0.6255, "step": 578 }, { "epoch": 0.041096619643332444, "grad_norm": 0.43439266085624695, "learning_rate": 0.00012610365956668295, "loss": 0.6654, "step": 579 }, { "epoch": 0.04116759826102387, "grad_norm": 0.560525119304657, "learning_rate": 0.00012588190451025207, "loss": 0.9753, "step": 580 }, { "epoch": 0.04123857687871529, "grad_norm": 0.5516296029090881, "learning_rate": 0.0001256600129566691, "loss": 1.0959, "step": 581 }, { "epoch": 0.041309555496406705, "grad_norm": 0.523314356803894, "learning_rate": 0.00012543798607615565, "loss": 1.3729, "step": 582 }, { "epoch": 0.04138053411409813, "grad_norm": 0.5294370055198669, "learning_rate": 0.0001252158250396471, "loss": 0.8585, "step": 583 }, { "epoch": 0.04145151273178955, "grad_norm": 0.4960081875324249, "learning_rate": 0.0001249935310187863, "loss": 0.9557, "step": 584 }, { "epoch": 0.04152249134948097, "grad_norm": 0.4074767529964447, "learning_rate": 0.00012477110518591743, "loss": 0.8855, "step": 585 }, { "epoch": 0.04159346996717239, "grad_norm": 0.522270917892456, "learning_rate": 0.00012454854871407994, "loss": 1.0237, "step": 586 }, { "epoch": 0.04166444858486381, "grad_norm": 0.5770000219345093, "learning_rate": 0.00012432586277700208, "loss": 1.1054, "step": 587 }, { "epoch": 0.04173542720255523, "grad_norm": 0.4341234564781189, "learning_rate": 0.00012410304854909495, "loss": 0.69, "step": 588 }, { "epoch": 0.04180640582024665, "grad_norm": 0.667077898979187, "learning_rate": 0.00012388010720544633, "loss": 1.1115, "step": 589 }, { "epoch": 0.04187738443793807, "grad_norm": 0.40901777148246765, "learning_rate": 0.00012365703992181425, "loss": 0.7777, "step": 590 }, { "epoch": 0.04194836305562949, "grad_norm": 0.39315634965896606, "learning_rate": 0.00012343384787462099, "loss": 0.7453, "step": 591 }, { "epoch": 0.042019341673320915, "grad_norm": 0.5069630742073059, "learning_rate": 0.0001232105322409468, "loss": 0.8962, "step": 592 }, { "epoch": 0.04209032029101233, "grad_norm": 0.5047445893287659, "learning_rate": 0.0001229870941985237, "loss": 1.1736, "step": 593 }, { "epoch": 0.04216129890870375, "grad_norm": 0.7503096461296082, "learning_rate": 0.00012276353492572935, "loss": 1.165, "step": 594 }, { "epoch": 0.042232277526395176, "grad_norm": 0.49574917554855347, "learning_rate": 0.00012253985560158062, "loss": 1.0954, "step": 595 }, { "epoch": 0.042303256144086594, "grad_norm": 0.4608153998851776, "learning_rate": 0.00012231605740572766, "loss": 0.5896, "step": 596 }, { "epoch": 0.04237423476177801, "grad_norm": 0.5130946636199951, "learning_rate": 0.00012209214151844743, "loss": 0.8495, "step": 597 }, { "epoch": 0.04244521337946944, "grad_norm": 1.1176502704620361, "learning_rate": 0.0001218681091206376, "loss": 1.0988, "step": 598 }, { "epoch": 0.042516191997160856, "grad_norm": 0.7175559997558594, "learning_rate": 0.00012164396139381029, "loss": 1.105, "step": 599 }, { "epoch": 0.042587170614852274, "grad_norm": 0.3643674850463867, "learning_rate": 0.00012141969952008591, "loss": 0.7453, "step": 600 }, { "epoch": 0.0426581492325437, "grad_norm": 0.4029672145843506, "learning_rate": 0.00012119532468218677, "loss": 0.6966, "step": 601 }, { "epoch": 0.04272912785023512, "grad_norm": 0.4581654965877533, "learning_rate": 0.00012097083806343103, "loss": 0.8579, "step": 602 }, { "epoch": 0.042800106467926535, "grad_norm": 0.4180246889591217, "learning_rate": 0.00012074624084772628, "loss": 1.1826, "step": 603 }, { "epoch": 0.04287108508561796, "grad_norm": 0.4940919578075409, "learning_rate": 0.00012052153421956342, "loss": 1.0777, "step": 604 }, { "epoch": 0.04294206370330938, "grad_norm": 0.5713196992874146, "learning_rate": 0.00012029671936401042, "loss": 0.9822, "step": 605 }, { "epoch": 0.0430130423210008, "grad_norm": 0.43694865703582764, "learning_rate": 0.00012007179746670592, "loss": 0.9203, "step": 606 }, { "epoch": 0.04308402093869222, "grad_norm": 0.9349303245544434, "learning_rate": 0.00011984676971385314, "loss": 1.0257, "step": 607 }, { "epoch": 0.04315499955638364, "grad_norm": 0.42085903882980347, "learning_rate": 0.0001196216372922136, "loss": 0.7858, "step": 608 }, { "epoch": 0.04322597817407506, "grad_norm": 0.48192501068115234, "learning_rate": 0.00011939640138910073, "loss": 1.1237, "step": 609 }, { "epoch": 0.04329695679176648, "grad_norm": 0.6980161070823669, "learning_rate": 0.00011917106319237386, "loss": 1.2193, "step": 610 }, { "epoch": 0.0433679354094579, "grad_norm": 0.44299203157424927, "learning_rate": 0.00011894562389043162, "loss": 1.3598, "step": 611 }, { "epoch": 0.04343891402714932, "grad_norm": 0.4817625880241394, "learning_rate": 0.00011872008467220599, "loss": 0.9416, "step": 612 }, { "epoch": 0.043509892644840745, "grad_norm": 0.5078029036521912, "learning_rate": 0.00011849444672715586, "loss": 1.1766, "step": 613 }, { "epoch": 0.04358087126253216, "grad_norm": 0.666621208190918, "learning_rate": 0.00011826871124526071, "loss": 1.1444, "step": 614 }, { "epoch": 0.04365184988022358, "grad_norm": 1.1335780620574951, "learning_rate": 0.0001180428794170145, "loss": 1.5676, "step": 615 }, { "epoch": 0.043722828497915006, "grad_norm": 0.6950260996818542, "learning_rate": 0.00011781695243341932, "loss": 1.1453, "step": 616 }, { "epoch": 0.043793807115606424, "grad_norm": 0.509067952632904, "learning_rate": 0.000117590931485979, "loss": 0.8675, "step": 617 }, { "epoch": 0.04386478573329784, "grad_norm": 0.7531905770301819, "learning_rate": 0.00011736481776669306, "loss": 1.4806, "step": 618 }, { "epoch": 0.04393576435098927, "grad_norm": 0.4752219617366791, "learning_rate": 0.00011713861246805011, "loss": 0.8707, "step": 619 }, { "epoch": 0.044006742968680686, "grad_norm": 0.5162742137908936, "learning_rate": 0.00011691231678302187, "loss": 0.894, "step": 620 }, { "epoch": 0.044077721586372104, "grad_norm": 0.47614169120788574, "learning_rate": 0.00011668593190505674, "loss": 1.0056, "step": 621 }, { "epoch": 0.04414870020406353, "grad_norm": 0.56108158826828, "learning_rate": 0.00011645945902807341, "loss": 0.9745, "step": 622 }, { "epoch": 0.04421967882175495, "grad_norm": 0.6655295491218567, "learning_rate": 0.00011623289934645474, "loss": 1.084, "step": 623 }, { "epoch": 0.044290657439446365, "grad_norm": 0.48683473467826843, "learning_rate": 0.0001160062540550414, "loss": 0.8182, "step": 624 }, { "epoch": 0.04436163605713779, "grad_norm": 0.4132605791091919, "learning_rate": 0.00011577952434912548, "loss": 0.8302, "step": 625 }, { "epoch": 0.04443261467482921, "grad_norm": 0.47813522815704346, "learning_rate": 0.00011555271142444433, "loss": 1.0622, "step": 626 }, { "epoch": 0.04450359329252063, "grad_norm": 0.4718089997768402, "learning_rate": 0.00011532581647717413, "loss": 1.0672, "step": 627 }, { "epoch": 0.04457457191021205, "grad_norm": 0.5046697854995728, "learning_rate": 0.00011509884070392369, "loss": 0.7276, "step": 628 }, { "epoch": 0.04464555052790347, "grad_norm": 0.5806900262832642, "learning_rate": 0.00011487178530172804, "loss": 1.0545, "step": 629 }, { "epoch": 0.04471652914559489, "grad_norm": 0.9328553676605225, "learning_rate": 0.00011464465146804217, "loss": 1.0426, "step": 630 }, { "epoch": 0.04478750776328631, "grad_norm": 0.4498421549797058, "learning_rate": 0.00011441744040073468, "loss": 0.8228, "step": 631 }, { "epoch": 0.04485848638097773, "grad_norm": 0.4342452585697174, "learning_rate": 0.00011419015329808157, "loss": 0.8865, "step": 632 }, { "epoch": 0.04492946499866915, "grad_norm": 0.7273264527320862, "learning_rate": 0.00011396279135875976, "loss": 1.013, "step": 633 }, { "epoch": 0.045000443616360575, "grad_norm": 0.7325798869132996, "learning_rate": 0.00011373535578184082, "loss": 1.3961, "step": 634 }, { "epoch": 0.04507142223405199, "grad_norm": 0.4335000216960907, "learning_rate": 0.0001135078477667848, "loss": 0.8541, "step": 635 }, { "epoch": 0.04514240085174341, "grad_norm": 0.5893487930297852, "learning_rate": 0.00011328026851343367, "loss": 1.4421, "step": 636 }, { "epoch": 0.045213379469434836, "grad_norm": 0.5885086059570312, "learning_rate": 0.00011305261922200519, "loss": 0.8343, "step": 637 }, { "epoch": 0.045284358087126254, "grad_norm": 0.4759581387042999, "learning_rate": 0.00011282490109308633, "loss": 0.8707, "step": 638 }, { "epoch": 0.04535533670481767, "grad_norm": 0.49139389395713806, "learning_rate": 0.00011259711532762724, "loss": 1.473, "step": 639 }, { "epoch": 0.04542631532250909, "grad_norm": 0.42390480637550354, "learning_rate": 0.00011236926312693479, "loss": 1.0461, "step": 640 }, { "epoch": 0.045497293940200516, "grad_norm": 0.475140243768692, "learning_rate": 0.00011214134569266607, "loss": 0.887, "step": 641 }, { "epoch": 0.045568272557891934, "grad_norm": 0.5448624491691589, "learning_rate": 0.00011191336422682237, "loss": 0.8352, "step": 642 }, { "epoch": 0.04563925117558335, "grad_norm": 0.6726250648498535, "learning_rate": 0.00011168531993174258, "loss": 1.2589, "step": 643 }, { "epoch": 0.04571022979327478, "grad_norm": 0.78367680311203, "learning_rate": 0.00011145721401009694, "loss": 1.3666, "step": 644 }, { "epoch": 0.045781208410966195, "grad_norm": 0.4396587014198303, "learning_rate": 0.00011122904766488078, "loss": 0.6807, "step": 645 }, { "epoch": 0.045852187028657614, "grad_norm": 0.4255082905292511, "learning_rate": 0.00011100082209940795, "loss": 0.9493, "step": 646 }, { "epoch": 0.04592316564634904, "grad_norm": 0.42374178767204285, "learning_rate": 0.00011077253851730474, "loss": 0.8472, "step": 647 }, { "epoch": 0.04599414426404046, "grad_norm": 0.4219643771648407, "learning_rate": 0.00011054419812250338, "loss": 0.736, "step": 648 }, { "epoch": 0.046065122881731875, "grad_norm": 0.6243124604225159, "learning_rate": 0.00011031580211923571, "loss": 1.2868, "step": 649 }, { "epoch": 0.0461361014994233, "grad_norm": 0.42096689343452454, "learning_rate": 0.00011008735171202684, "loss": 0.9855, "step": 650 }, { "epoch": 0.04620708011711472, "grad_norm": 0.5067743062973022, "learning_rate": 0.00010985884810568878, "loss": 0.7375, "step": 651 }, { "epoch": 0.046278058734806136, "grad_norm": 0.4720652997493744, "learning_rate": 0.00010963029250531418, "loss": 0.722, "step": 652 }, { "epoch": 0.04634903735249756, "grad_norm": 0.47726547718048096, "learning_rate": 0.00010940168611626984, "loss": 1.0371, "step": 653 }, { "epoch": 0.04642001597018898, "grad_norm": 0.5624931454658508, "learning_rate": 0.00010917303014419036, "loss": 1.0886, "step": 654 }, { "epoch": 0.0464909945878804, "grad_norm": 0.5794550776481628, "learning_rate": 0.00010894432579497191, "loss": 0.8423, "step": 655 }, { "epoch": 0.04656197320557182, "grad_norm": 0.45380592346191406, "learning_rate": 0.00010871557427476583, "loss": 0.9962, "step": 656 }, { "epoch": 0.04663295182326324, "grad_norm": 0.6655727028846741, "learning_rate": 0.00010848677678997213, "loss": 1.5437, "step": 657 }, { "epoch": 0.04670393044095466, "grad_norm": 0.4097822606563568, "learning_rate": 0.00010825793454723325, "loss": 0.7659, "step": 658 }, { "epoch": 0.046774909058646084, "grad_norm": 0.46168258786201477, "learning_rate": 0.00010802904875342775, "loss": 0.8298, "step": 659 }, { "epoch": 0.0468458876763375, "grad_norm": 0.45900896191596985, "learning_rate": 0.00010780012061566378, "loss": 0.8446, "step": 660 }, { "epoch": 0.04691686629402892, "grad_norm": 0.6090603470802307, "learning_rate": 0.00010757115134127292, "loss": 1.437, "step": 661 }, { "epoch": 0.046987844911720346, "grad_norm": 0.4104398190975189, "learning_rate": 0.00010734214213780354, "loss": 0.7109, "step": 662 }, { "epoch": 0.047058823529411764, "grad_norm": 0.5128499269485474, "learning_rate": 0.00010711309421301474, "loss": 0.8878, "step": 663 }, { "epoch": 0.04712980214710318, "grad_norm": 0.4654010832309723, "learning_rate": 0.00010688400877486978, "loss": 1.1085, "step": 664 }, { "epoch": 0.04720078076479461, "grad_norm": 0.438040167093277, "learning_rate": 0.00010665488703152966, "loss": 0.6826, "step": 665 }, { "epoch": 0.047271759382486025, "grad_norm": 0.3760967552661896, "learning_rate": 0.00010642573019134703, "loss": 1.1014, "step": 666 }, { "epoch": 0.047342738000177444, "grad_norm": 0.5885511040687561, "learning_rate": 0.00010619653946285947, "loss": 0.784, "step": 667 }, { "epoch": 0.04741371661786887, "grad_norm": 0.46597081422805786, "learning_rate": 0.0001059673160547834, "loss": 1.171, "step": 668 }, { "epoch": 0.04748469523556029, "grad_norm": 0.4472491145133972, "learning_rate": 0.00010573806117600755, "loss": 1.1062, "step": 669 }, { "epoch": 0.047555673853251705, "grad_norm": 0.5407065153121948, "learning_rate": 0.00010550877603558655, "loss": 1.1002, "step": 670 }, { "epoch": 0.04762665247094313, "grad_norm": 0.5669470429420471, "learning_rate": 0.00010527946184273474, "loss": 1.0241, "step": 671 }, { "epoch": 0.04769763108863455, "grad_norm": 0.5017465353012085, "learning_rate": 0.00010505011980681962, "loss": 0.8614, "step": 672 }, { "epoch": 0.04776860970632597, "grad_norm": 0.5284566879272461, "learning_rate": 0.0001048207511373555, "loss": 1.0367, "step": 673 }, { "epoch": 0.04783958832401739, "grad_norm": 0.7074596285820007, "learning_rate": 0.00010459135704399718, "loss": 0.8528, "step": 674 }, { "epoch": 0.04791056694170881, "grad_norm": 0.5193062424659729, "learning_rate": 0.00010436193873653361, "loss": 0.863, "step": 675 }, { "epoch": 0.04798154555940023, "grad_norm": 0.365856409072876, "learning_rate": 0.00010413249742488131, "loss": 0.708, "step": 676 }, { "epoch": 0.04805252417709165, "grad_norm": 0.44047170877456665, "learning_rate": 0.00010390303431907826, "loss": 0.7627, "step": 677 }, { "epoch": 0.04812350279478307, "grad_norm": 0.5452557802200317, "learning_rate": 0.00010367355062927726, "loss": 1.2639, "step": 678 }, { "epoch": 0.04819448141247449, "grad_norm": 0.9847500324249268, "learning_rate": 0.00010344404756573971, "loss": 1.3433, "step": 679 }, { "epoch": 0.048265460030165915, "grad_norm": 0.3580925166606903, "learning_rate": 0.00010321452633882922, "loss": 0.8454, "step": 680 }, { "epoch": 0.04833643864785733, "grad_norm": 0.504849910736084, "learning_rate": 0.00010298498815900513, "loss": 0.7675, "step": 681 }, { "epoch": 0.04840741726554875, "grad_norm": 0.49430397152900696, "learning_rate": 0.00010275543423681621, "loss": 0.9612, "step": 682 }, { "epoch": 0.048478395883240176, "grad_norm": 0.5871439576148987, "learning_rate": 0.0001025258657828943, "loss": 1.0146, "step": 683 }, { "epoch": 0.048549374500931594, "grad_norm": 0.43843525648117065, "learning_rate": 0.0001022962840079478, "loss": 0.839, "step": 684 }, { "epoch": 0.04862035311862301, "grad_norm": 0.5134842395782471, "learning_rate": 0.00010206669012275545, "loss": 1.2938, "step": 685 }, { "epoch": 0.04869133173631444, "grad_norm": 0.5673609972000122, "learning_rate": 0.00010183708533815974, "loss": 1.3012, "step": 686 }, { "epoch": 0.048762310354005856, "grad_norm": 0.5483171343803406, "learning_rate": 0.00010160747086506077, "loss": 0.932, "step": 687 }, { "epoch": 0.048833288971697274, "grad_norm": 0.485774964094162, "learning_rate": 0.00010137784791440965, "loss": 1.1575, "step": 688 }, { "epoch": 0.0489042675893887, "grad_norm": 0.5414448380470276, "learning_rate": 0.00010114821769720221, "loss": 1.0061, "step": 689 }, { "epoch": 0.04897524620708012, "grad_norm": 0.3982224762439728, "learning_rate": 0.00010091858142447265, "loss": 0.9397, "step": 690 }, { "epoch": 0.04897524620708012, "eval_loss": 1.1494126319885254, "eval_runtime": 314.4169, "eval_samples_per_second": 18.87, "eval_steps_per_second": 9.437, "step": 690 }, { "epoch": 0.049046224824771535, "grad_norm": 0.4022897779941559, "learning_rate": 0.00010068894030728704, "loss": 0.8083, "step": 691 }, { "epoch": 0.04911720344246296, "grad_norm": 0.5586432814598083, "learning_rate": 0.00010045929555673705, "loss": 1.358, "step": 692 }, { "epoch": 0.04918818206015438, "grad_norm": 0.47804561257362366, "learning_rate": 0.00010022964838393354, "loss": 1.0319, "step": 693 }, { "epoch": 0.0492591606778458, "grad_norm": 0.440791517496109, "learning_rate": 0.0001, "loss": 0.7665, "step": 694 }, { "epoch": 0.04933013929553722, "grad_norm": 0.413766473531723, "learning_rate": 9.977035161606648e-05, "loss": 0.8927, "step": 695 }, { "epoch": 0.04940111791322864, "grad_norm": 0.5470471978187561, "learning_rate": 9.954070444326293e-05, "loss": 0.9975, "step": 696 }, { "epoch": 0.04947209653092006, "grad_norm": 0.3963073492050171, "learning_rate": 9.931105969271298e-05, "loss": 0.924, "step": 697 }, { "epoch": 0.04954307514861148, "grad_norm": 0.786769688129425, "learning_rate": 9.908141857552737e-05, "loss": 0.9929, "step": 698 }, { "epoch": 0.0496140537663029, "grad_norm": 0.6488879919052124, "learning_rate": 9.88517823027978e-05, "loss": 1.1, "step": 699 }, { "epoch": 0.04968503238399432, "grad_norm": 0.7211329340934753, "learning_rate": 9.862215208559037e-05, "loss": 1.3579, "step": 700 }, { "epoch": 0.049756011001685745, "grad_norm": 0.4899226725101471, "learning_rate": 9.839252913493924e-05, "loss": 0.8371, "step": 701 }, { "epoch": 0.04982698961937716, "grad_norm": 0.4601897597312927, "learning_rate": 9.816291466184026e-05, "loss": 0.7999, "step": 702 }, { "epoch": 0.04989796823706858, "grad_norm": 0.43111488223075867, "learning_rate": 9.793330987724459e-05, "loss": 0.7538, "step": 703 }, { "epoch": 0.049968946854760006, "grad_norm": 0.4941050708293915, "learning_rate": 9.770371599205222e-05, "loss": 1.026, "step": 704 }, { "epoch": 0.050039925472451424, "grad_norm": 0.4672418534755707, "learning_rate": 9.747413421710573e-05, "loss": 0.8018, "step": 705 }, { "epoch": 0.05011090409014284, "grad_norm": 0.4009042978286743, "learning_rate": 9.724456576318381e-05, "loss": 1.1407, "step": 706 }, { "epoch": 0.05018188270783427, "grad_norm": 0.701394259929657, "learning_rate": 9.70150118409949e-05, "loss": 1.077, "step": 707 }, { "epoch": 0.050252861325525686, "grad_norm": 0.4240935146808624, "learning_rate": 9.678547366117083e-05, "loss": 0.6823, "step": 708 }, { "epoch": 0.050323839943217104, "grad_norm": 0.45193445682525635, "learning_rate": 9.655595243426032e-05, "loss": 1.1346, "step": 709 }, { "epoch": 0.05039481856090853, "grad_norm": 0.5613667368888855, "learning_rate": 9.632644937072277e-05, "loss": 0.8667, "step": 710 }, { "epoch": 0.05046579717859995, "grad_norm": 0.4414259195327759, "learning_rate": 9.609696568092175e-05, "loss": 0.6781, "step": 711 }, { "epoch": 0.050536775796291365, "grad_norm": 0.6090078949928284, "learning_rate": 9.586750257511867e-05, "loss": 0.7027, "step": 712 }, { "epoch": 0.05060775441398279, "grad_norm": 0.3751627206802368, "learning_rate": 9.563806126346642e-05, "loss": 0.6676, "step": 713 }, { "epoch": 0.05067873303167421, "grad_norm": 0.5507296919822693, "learning_rate": 9.540864295600283e-05, "loss": 0.9018, "step": 714 }, { "epoch": 0.05074971164936563, "grad_norm": 0.3942962884902954, "learning_rate": 9.517924886264453e-05, "loss": 0.7887, "step": 715 }, { "epoch": 0.05082069026705705, "grad_norm": 0.5177544355392456, "learning_rate": 9.49498801931804e-05, "loss": 1.0513, "step": 716 }, { "epoch": 0.05089166888474847, "grad_norm": 0.419549822807312, "learning_rate": 9.472053815726527e-05, "loss": 0.782, "step": 717 }, { "epoch": 0.05096264750243989, "grad_norm": 0.4804348647594452, "learning_rate": 9.449122396441345e-05, "loss": 0.8969, "step": 718 }, { "epoch": 0.05103362612013131, "grad_norm": 0.7644201517105103, "learning_rate": 9.42619388239925e-05, "loss": 1.3905, "step": 719 }, { "epoch": 0.05110460473782273, "grad_norm": 0.509852409362793, "learning_rate": 9.403268394521662e-05, "loss": 0.9453, "step": 720 }, { "epoch": 0.05117558335551415, "grad_norm": 0.4332434833049774, "learning_rate": 9.380346053714055e-05, "loss": 0.9153, "step": 721 }, { "epoch": 0.051246561973205575, "grad_norm": 0.45807182788848877, "learning_rate": 9.357426980865301e-05, "loss": 0.9324, "step": 722 }, { "epoch": 0.05131754059089699, "grad_norm": 0.4809603691101074, "learning_rate": 9.334511296847035e-05, "loss": 0.9034, "step": 723 }, { "epoch": 0.05138851920858841, "grad_norm": 0.49270740151405334, "learning_rate": 9.311599122513029e-05, "loss": 0.8644, "step": 724 }, { "epoch": 0.051459497826279836, "grad_norm": 0.8549937605857849, "learning_rate": 9.288690578698528e-05, "loss": 1.3285, "step": 725 }, { "epoch": 0.051530476443971254, "grad_norm": 0.6012173891067505, "learning_rate": 9.265785786219647e-05, "loss": 0.8291, "step": 726 }, { "epoch": 0.05160145506166267, "grad_norm": 0.4957113265991211, "learning_rate": 9.24288486587271e-05, "loss": 0.9983, "step": 727 }, { "epoch": 0.0516724336793541, "grad_norm": 0.4758509695529938, "learning_rate": 9.219987938433621e-05, "loss": 1.1677, "step": 728 }, { "epoch": 0.051743412297045516, "grad_norm": 0.6273649334907532, "learning_rate": 9.197095124657232e-05, "loss": 0.9646, "step": 729 }, { "epoch": 0.051814390914736934, "grad_norm": 0.40931689739227295, "learning_rate": 9.174206545276677e-05, "loss": 0.7198, "step": 730 }, { "epoch": 0.05188536953242836, "grad_norm": 0.44549432396888733, "learning_rate": 9.151322321002791e-05, "loss": 0.7917, "step": 731 }, { "epoch": 0.05195634815011978, "grad_norm": 0.5102499127388, "learning_rate": 9.128442572523417e-05, "loss": 1.2428, "step": 732 }, { "epoch": 0.052027326767811195, "grad_norm": 0.43024900555610657, "learning_rate": 9.105567420502807e-05, "loss": 0.7804, "step": 733 }, { "epoch": 0.05209830538550262, "grad_norm": 0.6325342655181885, "learning_rate": 9.082696985580964e-05, "loss": 0.9901, "step": 734 }, { "epoch": 0.05216928400319404, "grad_norm": 0.6966437101364136, "learning_rate": 9.059831388373021e-05, "loss": 1.2194, "step": 735 }, { "epoch": 0.05224026262088546, "grad_norm": 0.5250086188316345, "learning_rate": 9.036970749468584e-05, "loss": 0.7614, "step": 736 }, { "epoch": 0.05231124123857688, "grad_norm": 0.4323217272758484, "learning_rate": 9.014115189431123e-05, "loss": 1.0261, "step": 737 }, { "epoch": 0.0523822198562683, "grad_norm": 0.4656195044517517, "learning_rate": 8.991264828797319e-05, "loss": 0.9253, "step": 738 }, { "epoch": 0.05245319847395972, "grad_norm": 0.4614455997943878, "learning_rate": 8.968419788076431e-05, "loss": 0.8205, "step": 739 }, { "epoch": 0.05252417709165114, "grad_norm": 0.5283876061439514, "learning_rate": 8.945580187749666e-05, "loss": 0.86, "step": 740 }, { "epoch": 0.05259515570934256, "grad_norm": 0.5251266360282898, "learning_rate": 8.922746148269528e-05, "loss": 1.0184, "step": 741 }, { "epoch": 0.05266613432703398, "grad_norm": 0.48121723532676697, "learning_rate": 8.899917790059208e-05, "loss": 0.6964, "step": 742 }, { "epoch": 0.052737112944725405, "grad_norm": 0.4094066917896271, "learning_rate": 8.877095233511924e-05, "loss": 0.7444, "step": 743 }, { "epoch": 0.05280809156241682, "grad_norm": 0.5368256568908691, "learning_rate": 8.854278598990305e-05, "loss": 0.8453, "step": 744 }, { "epoch": 0.05287907018010824, "grad_norm": 0.41687527298927307, "learning_rate": 8.831468006825745e-05, "loss": 1.0663, "step": 745 }, { "epoch": 0.052950048797799666, "grad_norm": 0.5173694491386414, "learning_rate": 8.808663577317764e-05, "loss": 0.9384, "step": 746 }, { "epoch": 0.053021027415491084, "grad_norm": 0.4728773236274719, "learning_rate": 8.785865430733394e-05, "loss": 0.7385, "step": 747 }, { "epoch": 0.0530920060331825, "grad_norm": 0.5745211243629456, "learning_rate": 8.763073687306524e-05, "loss": 1.0491, "step": 748 }, { "epoch": 0.05316298465087393, "grad_norm": 0.5553246736526489, "learning_rate": 8.740288467237275e-05, "loss": 1.0816, "step": 749 }, { "epoch": 0.053233963268565346, "grad_norm": 1.0538278818130493, "learning_rate": 8.717509890691368e-05, "loss": 1.7472, "step": 750 }, { "epoch": 0.053304941886256764, "grad_norm": 0.4369787573814392, "learning_rate": 8.694738077799488e-05, "loss": 0.9145, "step": 751 }, { "epoch": 0.05337592050394819, "grad_norm": 0.5557453632354736, "learning_rate": 8.671973148656634e-05, "loss": 0.9107, "step": 752 }, { "epoch": 0.05344689912163961, "grad_norm": 0.5311267375946045, "learning_rate": 8.649215223321521e-05, "loss": 0.6837, "step": 753 }, { "epoch": 0.053517877739331025, "grad_norm": 0.8827739357948303, "learning_rate": 8.626464421815919e-05, "loss": 1.1713, "step": 754 }, { "epoch": 0.053588856357022444, "grad_norm": 0.4869166910648346, "learning_rate": 8.603720864124027e-05, "loss": 0.9842, "step": 755 }, { "epoch": 0.05365983497471387, "grad_norm": 0.3187915086746216, "learning_rate": 8.580984670191848e-05, "loss": 0.6011, "step": 756 }, { "epoch": 0.05373081359240529, "grad_norm": 0.5452215075492859, "learning_rate": 8.558255959926533e-05, "loss": 1.0099, "step": 757 }, { "epoch": 0.053801792210096705, "grad_norm": 0.5802979469299316, "learning_rate": 8.535534853195786e-05, "loss": 0.7361, "step": 758 }, { "epoch": 0.05387277082778813, "grad_norm": 0.4334274232387543, "learning_rate": 8.512821469827197e-05, "loss": 0.8313, "step": 759 }, { "epoch": 0.05394374944547955, "grad_norm": 0.5790685415267944, "learning_rate": 8.490115929607631e-05, "loss": 1.1261, "step": 760 }, { "epoch": 0.054014728063170966, "grad_norm": 0.584993839263916, "learning_rate": 8.46741835228259e-05, "loss": 0.8985, "step": 761 }, { "epoch": 0.05408570668086239, "grad_norm": 0.44616758823394775, "learning_rate": 8.444728857555572e-05, "loss": 0.6972, "step": 762 }, { "epoch": 0.05415668529855381, "grad_norm": 0.5474051237106323, "learning_rate": 8.422047565087454e-05, "loss": 0.926, "step": 763 }, { "epoch": 0.05422766391624523, "grad_norm": 0.3344883620738983, "learning_rate": 8.399374594495861e-05, "loss": 0.7286, "step": 764 }, { "epoch": 0.05429864253393665, "grad_norm": 0.7142869830131531, "learning_rate": 8.376710065354526e-05, "loss": 1.3952, "step": 765 }, { "epoch": 0.05436962115162807, "grad_norm": 0.47985586524009705, "learning_rate": 8.35405409719266e-05, "loss": 0.7974, "step": 766 }, { "epoch": 0.05444059976931949, "grad_norm": 0.4016041159629822, "learning_rate": 8.331406809494331e-05, "loss": 0.7404, "step": 767 }, { "epoch": 0.054511578387010914, "grad_norm": 0.9050955772399902, "learning_rate": 8.308768321697815e-05, "loss": 1.227, "step": 768 }, { "epoch": 0.05458255700470233, "grad_norm": 0.4450608193874359, "learning_rate": 8.286138753194992e-05, "loss": 0.6029, "step": 769 }, { "epoch": 0.05465353562239375, "grad_norm": 0.4160042107105255, "learning_rate": 8.263518223330697e-05, "loss": 0.8783, "step": 770 }, { "epoch": 0.054724514240085176, "grad_norm": 0.5563350319862366, "learning_rate": 8.2409068514021e-05, "loss": 0.8592, "step": 771 }, { "epoch": 0.054795492857776594, "grad_norm": 0.41437390446662903, "learning_rate": 8.218304756658072e-05, "loss": 1.0336, "step": 772 }, { "epoch": 0.05486647147546801, "grad_norm": 0.42581912875175476, "learning_rate": 8.195712058298552e-05, "loss": 0.8626, "step": 773 }, { "epoch": 0.05493745009315944, "grad_norm": 0.4045267105102539, "learning_rate": 8.173128875473932e-05, "loss": 0.738, "step": 774 }, { "epoch": 0.055008428710850855, "grad_norm": 0.39921483397483826, "learning_rate": 8.150555327284417e-05, "loss": 0.8155, "step": 775 }, { "epoch": 0.055079407328542274, "grad_norm": 0.4529501497745514, "learning_rate": 8.127991532779401e-05, "loss": 1.0299, "step": 776 }, { "epoch": 0.0551503859462337, "grad_norm": 0.4552212655544281, "learning_rate": 8.105437610956842e-05, "loss": 1.0856, "step": 777 }, { "epoch": 0.05522136456392512, "grad_norm": 0.34472283720970154, "learning_rate": 8.082893680762619e-05, "loss": 0.5842, "step": 778 }, { "epoch": 0.055292343181616535, "grad_norm": 0.42668965458869934, "learning_rate": 8.06035986108993e-05, "loss": 0.9446, "step": 779 }, { "epoch": 0.05536332179930796, "grad_norm": 0.3614772856235504, "learning_rate": 8.037836270778642e-05, "loss": 0.6807, "step": 780 }, { "epoch": 0.05543430041699938, "grad_norm": 0.45221051573753357, "learning_rate": 8.015323028614687e-05, "loss": 0.7521, "step": 781 }, { "epoch": 0.055505279034690796, "grad_norm": 0.4714120030403137, "learning_rate": 7.992820253329409e-05, "loss": 0.8073, "step": 782 }, { "epoch": 0.05557625765238222, "grad_norm": 0.4666607975959778, "learning_rate": 7.970328063598962e-05, "loss": 0.7753, "step": 783 }, { "epoch": 0.05564723627007364, "grad_norm": 0.41531217098236084, "learning_rate": 7.947846578043659e-05, "loss": 0.7738, "step": 784 }, { "epoch": 0.05571821488776506, "grad_norm": 0.4434370696544647, "learning_rate": 7.925375915227373e-05, "loss": 1.0818, "step": 785 }, { "epoch": 0.05578919350545648, "grad_norm": 0.4669349789619446, "learning_rate": 7.902916193656898e-05, "loss": 0.8903, "step": 786 }, { "epoch": 0.0558601721231479, "grad_norm": 0.8342474102973938, "learning_rate": 7.880467531781323e-05, "loss": 1.1674, "step": 787 }, { "epoch": 0.05593115074083932, "grad_norm": 0.4424683749675751, "learning_rate": 7.858030047991411e-05, "loss": 0.704, "step": 788 }, { "epoch": 0.056002129358530744, "grad_norm": 0.5986254215240479, "learning_rate": 7.835603860618972e-05, "loss": 0.9534, "step": 789 }, { "epoch": 0.05607310797622216, "grad_norm": 0.47736671566963196, "learning_rate": 7.813189087936243e-05, "loss": 1.3392, "step": 790 }, { "epoch": 0.05614408659391358, "grad_norm": 0.4386388659477234, "learning_rate": 7.790785848155258e-05, "loss": 0.7381, "step": 791 }, { "epoch": 0.056215065211605006, "grad_norm": 0.4492368698120117, "learning_rate": 7.768394259427234e-05, "loss": 0.8146, "step": 792 }, { "epoch": 0.056286043829296424, "grad_norm": 0.5233791470527649, "learning_rate": 7.74601443984194e-05, "loss": 0.8539, "step": 793 }, { "epoch": 0.05635702244698784, "grad_norm": 0.5854870676994324, "learning_rate": 7.72364650742707e-05, "loss": 0.9071, "step": 794 }, { "epoch": 0.05642800106467927, "grad_norm": 0.5238258838653564, "learning_rate": 7.701290580147632e-05, "loss": 1.0756, "step": 795 }, { "epoch": 0.056498979682370686, "grad_norm": 0.5499415397644043, "learning_rate": 7.678946775905324e-05, "loss": 1.1878, "step": 796 }, { "epoch": 0.056569958300062104, "grad_norm": 0.5401821732521057, "learning_rate": 7.656615212537904e-05, "loss": 1.1512, "step": 797 }, { "epoch": 0.05664093691775353, "grad_norm": 0.4304891526699066, "learning_rate": 7.634296007818576e-05, "loss": 1.432, "step": 798 }, { "epoch": 0.05671191553544495, "grad_norm": 0.37207162380218506, "learning_rate": 7.611989279455371e-05, "loss": 0.7997, "step": 799 }, { "epoch": 0.056782894153136365, "grad_norm": 0.4321914315223694, "learning_rate": 7.589695145090506e-05, "loss": 1.0968, "step": 800 }, { "epoch": 0.05685387277082779, "grad_norm": 0.4520570933818817, "learning_rate": 7.567413722299796e-05, "loss": 1.056, "step": 801 }, { "epoch": 0.05692485138851921, "grad_norm": 0.44635599851608276, "learning_rate": 7.54514512859201e-05, "loss": 0.993, "step": 802 }, { "epoch": 0.05699583000621063, "grad_norm": 0.5553821921348572, "learning_rate": 7.522889481408258e-05, "loss": 0.8282, "step": 803 }, { "epoch": 0.05706680862390205, "grad_norm": 0.4571636915206909, "learning_rate": 7.500646898121373e-05, "loss": 1.1057, "step": 804 }, { "epoch": 0.05713778724159347, "grad_norm": 0.45817831158638, "learning_rate": 7.478417496035294e-05, "loss": 0.7529, "step": 805 }, { "epoch": 0.05720876585928489, "grad_norm": 0.5361679196357727, "learning_rate": 7.456201392384436e-05, "loss": 1.1876, "step": 806 }, { "epoch": 0.05727974447697631, "grad_norm": 0.33276188373565674, "learning_rate": 7.433998704333092e-05, "loss": 0.9754, "step": 807 }, { "epoch": 0.05735072309466773, "grad_norm": 0.42418715357780457, "learning_rate": 7.411809548974792e-05, "loss": 0.9489, "step": 808 }, { "epoch": 0.05742170171235915, "grad_norm": 0.44860485196113586, "learning_rate": 7.389634043331707e-05, "loss": 1.1316, "step": 809 }, { "epoch": 0.057492680330050575, "grad_norm": 1.0400519371032715, "learning_rate": 7.36747230435401e-05, "loss": 1.4115, "step": 810 }, { "epoch": 0.05756365894774199, "grad_norm": 0.5096813440322876, "learning_rate": 7.34532444891928e-05, "loss": 1.164, "step": 811 }, { "epoch": 0.05763463756543341, "grad_norm": 0.4916059970855713, "learning_rate": 7.32319059383187e-05, "loss": 0.991, "step": 812 }, { "epoch": 0.057705616183124836, "grad_norm": 0.4421261250972748, "learning_rate": 7.301070855822305e-05, "loss": 1.1033, "step": 813 }, { "epoch": 0.057776594800816254, "grad_norm": 0.796850323677063, "learning_rate": 7.278965351546648e-05, "loss": 1.2357, "step": 814 }, { "epoch": 0.05784757341850767, "grad_norm": 0.5907092690467834, "learning_rate": 7.256874197585914e-05, "loss": 0.7622, "step": 815 }, { "epoch": 0.0579185520361991, "grad_norm": 0.4151279330253601, "learning_rate": 7.234797510445411e-05, "loss": 1.2456, "step": 816 }, { "epoch": 0.057989530653890516, "grad_norm": 0.37903130054473877, "learning_rate": 7.212735406554169e-05, "loss": 0.8014, "step": 817 }, { "epoch": 0.058060509271581934, "grad_norm": 0.4448012411594391, "learning_rate": 7.190688002264308e-05, "loss": 1.2161, "step": 818 }, { "epoch": 0.05813148788927336, "grad_norm": 0.44265174865722656, "learning_rate": 7.168655413850413e-05, "loss": 0.8808, "step": 819 }, { "epoch": 0.05820246650696478, "grad_norm": 0.6118061542510986, "learning_rate": 7.146637757508949e-05, "loss": 0.7636, "step": 820 }, { "epoch": 0.058273445124656195, "grad_norm": 0.5194787979125977, "learning_rate": 7.124635149357612e-05, "loss": 0.7704, "step": 821 }, { "epoch": 0.05834442374234762, "grad_norm": 0.596010148525238, "learning_rate": 7.102647705434756e-05, "loss": 0.7633, "step": 822 }, { "epoch": 0.05841540236003904, "grad_norm": 0.6367552876472473, "learning_rate": 7.080675541698743e-05, "loss": 0.9826, "step": 823 }, { "epoch": 0.05848638097773046, "grad_norm": 0.563396692276001, "learning_rate": 7.058718774027364e-05, "loss": 1.2128, "step": 824 }, { "epoch": 0.05855735959542188, "grad_norm": 0.4853164553642273, "learning_rate": 7.036777518217211e-05, "loss": 0.9957, "step": 825 }, { "epoch": 0.0586283382131133, "grad_norm": 0.7309368848800659, "learning_rate": 7.014851889983057e-05, "loss": 1.255, "step": 826 }, { "epoch": 0.05869931683080472, "grad_norm": 0.49265366792678833, "learning_rate": 6.992942004957271e-05, "loss": 1.1252, "step": 827 }, { "epoch": 0.05877029544849614, "grad_norm": 0.5501289963722229, "learning_rate": 6.971047978689189e-05, "loss": 0.8189, "step": 828 }, { "epoch": 0.05884127406618756, "grad_norm": 0.4956642985343933, "learning_rate": 6.949169926644514e-05, "loss": 1.0077, "step": 829 }, { "epoch": 0.05891225268387898, "grad_norm": 0.6592064499855042, "learning_rate": 6.927307964204694e-05, "loss": 1.1189, "step": 830 }, { "epoch": 0.058983231301570405, "grad_norm": 0.5669671297073364, "learning_rate": 6.905462206666345e-05, "loss": 0.8068, "step": 831 }, { "epoch": 0.05905420991926182, "grad_norm": 0.42329180240631104, "learning_rate": 6.883632769240589e-05, "loss": 0.5867, "step": 832 }, { "epoch": 0.05912518853695324, "grad_norm": 0.643518328666687, "learning_rate": 6.861819767052502e-05, "loss": 0.8993, "step": 833 }, { "epoch": 0.059196167154644666, "grad_norm": 0.5929781198501587, "learning_rate": 6.840023315140475e-05, "loss": 1.1432, "step": 834 }, { "epoch": 0.059267145772336084, "grad_norm": 0.4585597515106201, "learning_rate": 6.818243528455618e-05, "loss": 0.7018, "step": 835 }, { "epoch": 0.0593381243900275, "grad_norm": 0.590122640132904, "learning_rate": 6.79648052186115e-05, "loss": 1.0193, "step": 836 }, { "epoch": 0.05940910300771893, "grad_norm": 0.6057392358779907, "learning_rate": 6.774734410131792e-05, "loss": 1.1889, "step": 837 }, { "epoch": 0.059480081625410346, "grad_norm": 0.43460813164711, "learning_rate": 6.753005307953167e-05, "loss": 0.7651, "step": 838 }, { "epoch": 0.059551060243101764, "grad_norm": 0.46095114946365356, "learning_rate": 6.731293329921189e-05, "loss": 0.8922, "step": 839 }, { "epoch": 0.05962203886079319, "grad_norm": 0.41050395369529724, "learning_rate": 6.709598590541469e-05, "loss": 0.9109, "step": 840 }, { "epoch": 0.05969301747848461, "grad_norm": 0.4851504862308502, "learning_rate": 6.687921204228698e-05, "loss": 0.9195, "step": 841 }, { "epoch": 0.059763996096176025, "grad_norm": 0.5795786380767822, "learning_rate": 6.666261285306047e-05, "loss": 1.0914, "step": 842 }, { "epoch": 0.05983497471386745, "grad_norm": 0.5676115155220032, "learning_rate": 6.644618948004576e-05, "loss": 1.2622, "step": 843 }, { "epoch": 0.05990595333155887, "grad_norm": 0.5190305113792419, "learning_rate": 6.622994306462611e-05, "loss": 1.0094, "step": 844 }, { "epoch": 0.05997693194925029, "grad_norm": 0.5477513670921326, "learning_rate": 6.601387474725162e-05, "loss": 1.3023, "step": 845 }, { "epoch": 0.06004791056694171, "grad_norm": 0.4775758385658264, "learning_rate": 6.579798566743314e-05, "loss": 0.9357, "step": 846 }, { "epoch": 0.06011888918463313, "grad_norm": 0.5332251787185669, "learning_rate": 6.558227696373616e-05, "loss": 1.0024, "step": 847 }, { "epoch": 0.06018986780232455, "grad_norm": 0.44302892684936523, "learning_rate": 6.536674977377496e-05, "loss": 0.7302, "step": 848 }, { "epoch": 0.06026084642001597, "grad_norm": 0.5083339810371399, "learning_rate": 6.515140523420653e-05, "loss": 1.0031, "step": 849 }, { "epoch": 0.06033182503770739, "grad_norm": 0.3714834153652191, "learning_rate": 6.493624448072457e-05, "loss": 1.0358, "step": 850 }, { "epoch": 0.06040280365539881, "grad_norm": 0.4366932809352875, "learning_rate": 6.472126864805356e-05, "loss": 0.8262, "step": 851 }, { "epoch": 0.060473782273090235, "grad_norm": 0.4308808445930481, "learning_rate": 6.450647886994272e-05, "loss": 0.817, "step": 852 }, { "epoch": 0.06054476089078165, "grad_norm": 0.45121482014656067, "learning_rate": 6.429187627916002e-05, "loss": 0.9796, "step": 853 }, { "epoch": 0.06061573950847307, "grad_norm": 0.4070253074169159, "learning_rate": 6.407746200748628e-05, "loss": 0.8875, "step": 854 }, { "epoch": 0.060686718126164496, "grad_norm": 0.4956483840942383, "learning_rate": 6.38632371857091e-05, "loss": 0.7545, "step": 855 }, { "epoch": 0.060757696743855914, "grad_norm": 0.509768545627594, "learning_rate": 6.3649202943617e-05, "loss": 0.7952, "step": 856 }, { "epoch": 0.06082867536154733, "grad_norm": 0.5500235557556152, "learning_rate": 6.343536040999345e-05, "loss": 1.4228, "step": 857 }, { "epoch": 0.06089965397923876, "grad_norm": 0.4672796130180359, "learning_rate": 6.322171071261071e-05, "loss": 0.814, "step": 858 }, { "epoch": 0.060970632596930176, "grad_norm": 0.4815391004085541, "learning_rate": 6.300825497822421e-05, "loss": 0.8316, "step": 859 }, { "epoch": 0.061041611214621594, "grad_norm": 0.8490742444992065, "learning_rate": 6.279499433256642e-05, "loss": 1.4786, "step": 860 }, { "epoch": 0.06111258983231302, "grad_norm": 0.3983782231807709, "learning_rate": 6.258192990034091e-05, "loss": 1.0704, "step": 861 }, { "epoch": 0.06118356845000444, "grad_norm": 0.5656004548072815, "learning_rate": 6.236906280521646e-05, "loss": 0.7812, "step": 862 }, { "epoch": 0.061254547067695855, "grad_norm": 0.5088900923728943, "learning_rate": 6.215639416982118e-05, "loss": 0.8215, "step": 863 }, { "epoch": 0.06132552568538728, "grad_norm": 0.3907179832458496, "learning_rate": 6.19439251157364e-05, "loss": 0.6628, "step": 864 }, { "epoch": 0.0613965043030787, "grad_norm": 0.4127732515335083, "learning_rate": 6.173165676349103e-05, "loss": 0.8488, "step": 865 }, { "epoch": 0.06146748292077012, "grad_norm": 0.7236722111701965, "learning_rate": 6.151959023255545e-05, "loss": 1.1711, "step": 866 }, { "epoch": 0.06153846153846154, "grad_norm": 0.42126694321632385, "learning_rate": 6.130772664133566e-05, "loss": 0.8305, "step": 867 }, { "epoch": 0.06160944015615296, "grad_norm": 0.5233303308486938, "learning_rate": 6.109606710716741e-05, "loss": 1.2384, "step": 868 }, { "epoch": 0.06168041877384438, "grad_norm": 0.38591718673706055, "learning_rate": 6.088461274631023e-05, "loss": 0.6866, "step": 869 }, { "epoch": 0.061751397391535796, "grad_norm": 0.7960970997810364, "learning_rate": 6.067336467394169e-05, "loss": 1.2643, "step": 870 }, { "epoch": 0.06182237600922722, "grad_norm": 0.4649513363838196, "learning_rate": 6.046232400415135e-05, "loss": 0.6887, "step": 871 }, { "epoch": 0.06189335462691864, "grad_norm": 0.37796077132225037, "learning_rate": 6.025149184993498e-05, "loss": 0.8045, "step": 872 }, { "epoch": 0.06196433324461006, "grad_norm": 0.39656832814216614, "learning_rate": 6.0040869323188685e-05, "loss": 0.9173, "step": 873 }, { "epoch": 0.06203531186230148, "grad_norm": 0.6306287050247192, "learning_rate": 5.983045753470308e-05, "loss": 0.8869, "step": 874 }, { "epoch": 0.0621062904799929, "grad_norm": 0.4304412007331848, "learning_rate": 5.9620257594157215e-05, "loss": 0.9553, "step": 875 }, { "epoch": 0.06217726909768432, "grad_norm": 0.42224422097206116, "learning_rate": 5.941027061011303e-05, "loss": 0.6167, "step": 876 }, { "epoch": 0.062248247715375744, "grad_norm": 0.5038176774978638, "learning_rate": 5.920049769000934e-05, "loss": 0.7844, "step": 877 }, { "epoch": 0.06231922633306716, "grad_norm": 0.46274542808532715, "learning_rate": 5.8990939940156e-05, "loss": 0.7315, "step": 878 }, { "epoch": 0.06239020495075858, "grad_norm": 0.4557129144668579, "learning_rate": 5.8781598465728104e-05, "loss": 0.8365, "step": 879 }, { "epoch": 0.062461183568450006, "grad_norm": 0.5313268303871155, "learning_rate": 5.857247437076012e-05, "loss": 0.828, "step": 880 }, { "epoch": 0.06253216218614142, "grad_norm": 0.5893296599388123, "learning_rate": 5.836356875814011e-05, "loss": 1.3876, "step": 881 }, { "epoch": 0.06260314080383285, "grad_norm": 0.4517008364200592, "learning_rate": 5.8154882729603876e-05, "loss": 0.758, "step": 882 }, { "epoch": 0.06267411942152426, "grad_norm": 0.5353963971138, "learning_rate": 5.794641738572925e-05, "loss": 0.6984, "step": 883 }, { "epoch": 0.06274509803921569, "grad_norm": 0.5784291625022888, "learning_rate": 5.773817382593008e-05, "loss": 0.9718, "step": 884 }, { "epoch": 0.06281607665690711, "grad_norm": 0.5251544713973999, "learning_rate": 5.753015314845061e-05, "loss": 1.0439, "step": 885 }, { "epoch": 0.06288705527459852, "grad_norm": 0.42581355571746826, "learning_rate": 5.732235645035964e-05, "loss": 0.6126, "step": 886 }, { "epoch": 0.06295803389228995, "grad_norm": 0.893518328666687, "learning_rate": 5.71147848275448e-05, "loss": 1.0956, "step": 887 }, { "epoch": 0.06302901250998137, "grad_norm": 0.40940263867378235, "learning_rate": 5.690743937470657e-05, "loss": 0.9405, "step": 888 }, { "epoch": 0.06309999112767278, "grad_norm": 0.5360181927680969, "learning_rate": 5.670032118535281e-05, "loss": 0.9099, "step": 889 }, { "epoch": 0.06317096974536421, "grad_norm": 0.4612846076488495, "learning_rate": 5.64934313517927e-05, "loss": 0.7953, "step": 890 }, { "epoch": 0.06324194836305563, "grad_norm": 0.4803790748119354, "learning_rate": 5.628677096513112e-05, "loss": 0.9702, "step": 891 }, { "epoch": 0.06331292698074704, "grad_norm": 0.6938463449478149, "learning_rate": 5.608034111526298e-05, "loss": 0.8876, "step": 892 }, { "epoch": 0.06338390559843847, "grad_norm": 0.5365687608718872, "learning_rate": 5.587414289086723e-05, "loss": 0.7865, "step": 893 }, { "epoch": 0.0634548842161299, "grad_norm": 0.4781073033809662, "learning_rate": 5.566817737940142e-05, "loss": 0.8294, "step": 894 }, { "epoch": 0.0635258628338213, "grad_norm": 0.5229554772377014, "learning_rate": 5.546244566709569e-05, "loss": 1.1194, "step": 895 }, { "epoch": 0.06359684145151273, "grad_norm": 0.42827725410461426, "learning_rate": 5.52569488389472e-05, "loss": 1.0934, "step": 896 }, { "epoch": 0.06366782006920416, "grad_norm": 0.47307196259498596, "learning_rate": 5.505168797871432e-05, "loss": 0.7492, "step": 897 }, { "epoch": 0.06373879868689557, "grad_norm": 0.42117777466773987, "learning_rate": 5.484666416891109e-05, "loss": 0.9033, "step": 898 }, { "epoch": 0.06380977730458699, "grad_norm": 0.55430668592453, "learning_rate": 5.464187849080122e-05, "loss": 0.866, "step": 899 }, { "epoch": 0.06388075592227842, "grad_norm": 0.3976860046386719, "learning_rate": 5.4437332024392694e-05, "loss": 0.8684, "step": 900 }, { "epoch": 0.06395173453996983, "grad_norm": 0.43081173300743103, "learning_rate": 5.423302584843186e-05, "loss": 0.7524, "step": 901 }, { "epoch": 0.06402271315766125, "grad_norm": 0.5336025953292847, "learning_rate": 5.402896104039776e-05, "loss": 1.2586, "step": 902 }, { "epoch": 0.06409369177535268, "grad_norm": 1.221236228942871, "learning_rate": 5.382513867649663e-05, "loss": 1.4954, "step": 903 }, { "epoch": 0.06416467039304409, "grad_norm": 0.5544896721839905, "learning_rate": 5.362155983165594e-05, "loss": 0.8836, "step": 904 }, { "epoch": 0.06423564901073552, "grad_norm": 0.4151901602745056, "learning_rate": 5.3418225579519056e-05, "loss": 1.1386, "step": 905 }, { "epoch": 0.06430662762842694, "grad_norm": 0.5084452033042908, "learning_rate": 5.321513699243924e-05, "loss": 1.0988, "step": 906 }, { "epoch": 0.06437760624611835, "grad_norm": 0.4532809257507324, "learning_rate": 5.301229514147419e-05, "loss": 0.7789, "step": 907 }, { "epoch": 0.06444858486380978, "grad_norm": 0.6207795739173889, "learning_rate": 5.280970109638047e-05, "loss": 1.563, "step": 908 }, { "epoch": 0.0645195634815012, "grad_norm": 0.5591384768486023, "learning_rate": 5.260735592560757e-05, "loss": 0.8817, "step": 909 }, { "epoch": 0.06459054209919261, "grad_norm": 0.40313634276390076, "learning_rate": 5.240526069629265e-05, "loss": 0.9852, "step": 910 }, { "epoch": 0.06466152071688404, "grad_norm": 0.610811710357666, "learning_rate": 5.220341647425456e-05, "loss": 0.9797, "step": 911 }, { "epoch": 0.06473249933457546, "grad_norm": 0.4686618149280548, "learning_rate": 5.2001824323988455e-05, "loss": 0.9359, "step": 912 }, { "epoch": 0.06480347795226687, "grad_norm": 0.5324022173881531, "learning_rate": 5.180048530866004e-05, "loss": 0.9383, "step": 913 }, { "epoch": 0.0648744565699583, "grad_norm": 0.5006153583526611, "learning_rate": 5.159940049010015e-05, "loss": 1.0675, "step": 914 }, { "epoch": 0.06494543518764972, "grad_norm": 0.4832466244697571, "learning_rate": 5.139857092879885e-05, "loss": 0.7792, "step": 915 }, { "epoch": 0.06501641380534114, "grad_norm": 0.3981482684612274, "learning_rate": 5.1197997683900214e-05, "loss": 0.745, "step": 916 }, { "epoch": 0.06508739242303256, "grad_norm": 0.6618260145187378, "learning_rate": 5.099768181319638e-05, "loss": 1.0604, "step": 917 }, { "epoch": 0.06515837104072399, "grad_norm": 0.4373709559440613, "learning_rate": 5.079762437312219e-05, "loss": 1.1679, "step": 918 }, { "epoch": 0.0652293496584154, "grad_norm": 0.60200434923172, "learning_rate": 5.059782641874962e-05, "loss": 1.1689, "step": 919 }, { "epoch": 0.06530032827610682, "grad_norm": 0.4210618734359741, "learning_rate": 5.039828900378204e-05, "loss": 1.1105, "step": 920 }, { "epoch": 0.06537130689379825, "grad_norm": 0.4953047037124634, "learning_rate": 5.0199013180548915e-05, "loss": 1.1284, "step": 921 }, { "epoch": 0.06544228551148966, "grad_norm": 0.42933738231658936, "learning_rate": 5.000000000000002e-05, "loss": 0.6366, "step": 922 }, { "epoch": 0.06551326412918108, "grad_norm": 0.549660861492157, "learning_rate": 4.980125051169997e-05, "loss": 1.1068, "step": 923 }, { "epoch": 0.06558424274687251, "grad_norm": 0.5911871790885925, "learning_rate": 4.960276576382283e-05, "loss": 0.9106, "step": 924 }, { "epoch": 0.06565522136456392, "grad_norm": 0.44298383593559265, "learning_rate": 4.9404546803146324e-05, "loss": 0.672, "step": 925 }, { "epoch": 0.06572619998225535, "grad_norm": 0.47260457277297974, "learning_rate": 4.920659467504659e-05, "loss": 0.7916, "step": 926 }, { "epoch": 0.06579717859994677, "grad_norm": 0.7364523410797119, "learning_rate": 4.900891042349243e-05, "loss": 1.093, "step": 927 }, { "epoch": 0.06586815721763818, "grad_norm": 0.5798365473747253, "learning_rate": 4.8811495091039926e-05, "loss": 1.0899, "step": 928 }, { "epoch": 0.06593913583532961, "grad_norm": 0.5490689873695374, "learning_rate": 4.8614349718826935e-05, "loss": 1.0739, "step": 929 }, { "epoch": 0.06601011445302103, "grad_norm": 0.4105396568775177, "learning_rate": 4.841747534656763e-05, "loss": 0.7494, "step": 930 }, { "epoch": 0.06608109307071244, "grad_norm": 0.45491930842399597, "learning_rate": 4.8220873012546866e-05, "loss": 0.681, "step": 931 }, { "epoch": 0.06615207168840387, "grad_norm": 0.7174997329711914, "learning_rate": 4.802454375361495e-05, "loss": 0.6417, "step": 932 }, { "epoch": 0.0662230503060953, "grad_norm": 0.36011412739753723, "learning_rate": 4.782848860518193e-05, "loss": 0.9337, "step": 933 }, { "epoch": 0.0662940289237867, "grad_norm": 0.4253694713115692, "learning_rate": 4.763270860121222e-05, "loss": 0.7568, "step": 934 }, { "epoch": 0.06636500754147813, "grad_norm": 0.49379852414131165, "learning_rate": 4.743720477421928e-05, "loss": 0.9879, "step": 935 }, { "epoch": 0.06643598615916955, "grad_norm": 0.39140021800994873, "learning_rate": 4.7241978155259925e-05, "loss": 0.7339, "step": 936 }, { "epoch": 0.06650696477686097, "grad_norm": 0.44276922941207886, "learning_rate": 4.704702977392914e-05, "loss": 0.971, "step": 937 }, { "epoch": 0.06657794339455239, "grad_norm": 0.4943040907382965, "learning_rate": 4.685236065835443e-05, "loss": 1.0298, "step": 938 }, { "epoch": 0.06664892201224382, "grad_norm": 0.4996838867664337, "learning_rate": 4.6657971835190505e-05, "loss": 1.3058, "step": 939 }, { "epoch": 0.06671990062993523, "grad_norm": 0.47278833389282227, "learning_rate": 4.646386432961396e-05, "loss": 1.2168, "step": 940 }, { "epoch": 0.06679087924762665, "grad_norm": 0.38809934258461, "learning_rate": 4.6270039165317605e-05, "loss": 0.8219, "step": 941 }, { "epoch": 0.06686185786531808, "grad_norm": 0.5847929120063782, "learning_rate": 4.6076497364505386e-05, "loss": 1.2773, "step": 942 }, { "epoch": 0.06693283648300949, "grad_norm": 0.6435703635215759, "learning_rate": 4.588323994788672e-05, "loss": 0.905, "step": 943 }, { "epoch": 0.06700381510070091, "grad_norm": 0.4198989272117615, "learning_rate": 4.569026793467126e-05, "loss": 0.7188, "step": 944 }, { "epoch": 0.06707479371839234, "grad_norm": 0.41071218252182007, "learning_rate": 4.549758234256346e-05, "loss": 0.8161, "step": 945 }, { "epoch": 0.06714577233608375, "grad_norm": 0.4895361363887787, "learning_rate": 4.530518418775733e-05, "loss": 0.8872, "step": 946 }, { "epoch": 0.06721675095377518, "grad_norm": 0.3790246844291687, "learning_rate": 4.511307448493084e-05, "loss": 0.8414, "step": 947 }, { "epoch": 0.0672877295714666, "grad_norm": 0.42425140738487244, "learning_rate": 4.492125424724086e-05, "loss": 0.8176, "step": 948 }, { "epoch": 0.06735870818915801, "grad_norm": 0.6203161478042603, "learning_rate": 4.4729724486317535e-05, "loss": 0.9816, "step": 949 }, { "epoch": 0.06742968680684944, "grad_norm": 0.4554537236690521, "learning_rate": 4.453848621225912e-05, "loss": 0.8104, "step": 950 }, { "epoch": 0.06750066542454086, "grad_norm": 0.3816584050655365, "learning_rate": 4.434754043362668e-05, "loss": 0.8923, "step": 951 }, { "epoch": 0.06757164404223227, "grad_norm": 0.4490455985069275, "learning_rate": 4.415688815743858e-05, "loss": 0.7421, "step": 952 }, { "epoch": 0.0676426226599237, "grad_norm": 0.45603904128074646, "learning_rate": 4.396653038916545e-05, "loss": 0.8262, "step": 953 }, { "epoch": 0.06771360127761512, "grad_norm": 0.6921128630638123, "learning_rate": 4.3776468132724604e-05, "loss": 1.1418, "step": 954 }, { "epoch": 0.06778457989530653, "grad_norm": 0.4899638891220093, "learning_rate": 4.35867023904749e-05, "loss": 0.8589, "step": 955 }, { "epoch": 0.06785555851299796, "grad_norm": 0.6228287816047668, "learning_rate": 4.3397234163211483e-05, "loss": 1.3087, "step": 956 }, { "epoch": 0.06792653713068939, "grad_norm": 0.8028937578201294, "learning_rate": 4.320806445016038e-05, "loss": 1.2678, "step": 957 }, { "epoch": 0.0679975157483808, "grad_norm": 0.4841277599334717, "learning_rate": 4.301919424897338e-05, "loss": 0.9275, "step": 958 }, { "epoch": 0.06806849436607222, "grad_norm": 0.7058796286582947, "learning_rate": 4.283062455572262e-05, "loss": 1.5377, "step": 959 }, { "epoch": 0.06813947298376365, "grad_norm": 0.5098562836647034, "learning_rate": 4.264235636489542e-05, "loss": 0.9543, "step": 960 }, { "epoch": 0.06821045160145506, "grad_norm": 0.4190066456794739, "learning_rate": 4.2454390669389e-05, "loss": 1.0292, "step": 961 }, { "epoch": 0.06828143021914648, "grad_norm": 0.6736373901367188, "learning_rate": 4.2266728460505375e-05, "loss": 0.8487, "step": 962 }, { "epoch": 0.06835240883683791, "grad_norm": 0.3881334066390991, "learning_rate": 4.207937072794587e-05, "loss": 1.0263, "step": 963 }, { "epoch": 0.06842338745452932, "grad_norm": 0.6050649881362915, "learning_rate": 4.189231845980618e-05, "loss": 1.166, "step": 964 }, { "epoch": 0.06849436607222074, "grad_norm": 0.5665785074234009, "learning_rate": 4.170557264257096e-05, "loss": 1.0805, "step": 965 }, { "epoch": 0.06856534468991217, "grad_norm": 0.4978638291358948, "learning_rate": 4.151913426110864e-05, "loss": 0.9755, "step": 966 }, { "epoch": 0.06863632330760358, "grad_norm": 0.3485337793827057, "learning_rate": 4.1333004298666436e-05, "loss": 0.7782, "step": 967 }, { "epoch": 0.068707301925295, "grad_norm": 0.5523264408111572, "learning_rate": 4.114718373686481e-05, "loss": 0.9849, "step": 968 }, { "epoch": 0.06877828054298643, "grad_norm": 0.4814048111438751, "learning_rate": 4.0961673555692716e-05, "loss": 0.7936, "step": 969 }, { "epoch": 0.06884925916067784, "grad_norm": 0.6478021144866943, "learning_rate": 4.077647473350201e-05, "loss": 1.0021, "step": 970 }, { "epoch": 0.06892023777836927, "grad_norm": 0.4357635974884033, "learning_rate": 4.059158824700258e-05, "loss": 0.7532, "step": 971 }, { "epoch": 0.06899121639606069, "grad_norm": 0.5967605710029602, "learning_rate": 4.040701507125712e-05, "loss": 0.8062, "step": 972 }, { "epoch": 0.0690621950137521, "grad_norm": 0.4152340888977051, "learning_rate": 4.022275617967591e-05, "loss": 0.7736, "step": 973 }, { "epoch": 0.06913317363144353, "grad_norm": 0.5046787858009338, "learning_rate": 4.003881254401183e-05, "loss": 1.0321, "step": 974 }, { "epoch": 0.06920415224913495, "grad_norm": 0.4122855067253113, "learning_rate": 3.9855185134355075e-05, "loss": 0.8448, "step": 975 }, { "epoch": 0.06927513086682636, "grad_norm": 0.4052172601222992, "learning_rate": 3.967187491912813e-05, "loss": 0.6328, "step": 976 }, { "epoch": 0.06934610948451779, "grad_norm": 0.4540557563304901, "learning_rate": 3.948888286508062e-05, "loss": 0.9434, "step": 977 }, { "epoch": 0.06941708810220922, "grad_norm": 0.4870578348636627, "learning_rate": 3.9306209937284346e-05, "loss": 0.722, "step": 978 }, { "epoch": 0.06948806671990063, "grad_norm": 0.5865741968154907, "learning_rate": 3.9123857099127936e-05, "loss": 0.8361, "step": 979 }, { "epoch": 0.06955904533759205, "grad_norm": 0.5220431685447693, "learning_rate": 3.8941825312312054e-05, "loss": 1.1973, "step": 980 }, { "epoch": 0.06963002395528348, "grad_norm": 0.4371016323566437, "learning_rate": 3.87601155368441e-05, "loss": 0.7153, "step": 981 }, { "epoch": 0.06970100257297489, "grad_norm": 0.7337780594825745, "learning_rate": 3.857872873103322e-05, "loss": 1.2137, "step": 982 }, { "epoch": 0.06977198119066631, "grad_norm": 0.560259222984314, "learning_rate": 3.839766585148538e-05, "loss": 0.7988, "step": 983 }, { "epoch": 0.06984295980835774, "grad_norm": 0.5993716716766357, "learning_rate": 3.821692785309807e-05, "loss": 0.7241, "step": 984 }, { "epoch": 0.06991393842604915, "grad_norm": 0.4063752293586731, "learning_rate": 3.803651568905554e-05, "loss": 0.8753, "step": 985 }, { "epoch": 0.06998491704374057, "grad_norm": 0.7601093053817749, "learning_rate": 3.7856430310823545e-05, "loss": 1.2323, "step": 986 }, { "epoch": 0.070055895661432, "grad_norm": 0.4051331579685211, "learning_rate": 3.767667266814442e-05, "loss": 0.7215, "step": 987 }, { "epoch": 0.07012687427912341, "grad_norm": 0.45125964283943176, "learning_rate": 3.749724370903216e-05, "loss": 1.032, "step": 988 }, { "epoch": 0.07019785289681484, "grad_norm": 0.609743058681488, "learning_rate": 3.731814437976723e-05, "loss": 1.0117, "step": 989 }, { "epoch": 0.07026883151450626, "grad_norm": 0.45094138383865356, "learning_rate": 3.713937562489179e-05, "loss": 1.21, "step": 990 }, { "epoch": 0.07033981013219767, "grad_norm": 0.46431252360343933, "learning_rate": 3.69609383872045e-05, "loss": 0.8001, "step": 991 }, { "epoch": 0.0704107887498891, "grad_norm": 0.539316713809967, "learning_rate": 3.678283360775571e-05, "loss": 1.0371, "step": 992 }, { "epoch": 0.07048176736758052, "grad_norm": 0.6256955862045288, "learning_rate": 3.6605062225842344e-05, "loss": 0.9815, "step": 993 }, { "epoch": 0.07055274598527193, "grad_norm": 0.5044265985488892, "learning_rate": 3.642762517900322e-05, "loss": 0.8973, "step": 994 }, { "epoch": 0.07062372460296336, "grad_norm": 0.3563026785850525, "learning_rate": 3.625052340301373e-05, "loss": 0.6246, "step": 995 }, { "epoch": 0.07069470322065478, "grad_norm": 0.4941602349281311, "learning_rate": 3.607375783188125e-05, "loss": 1.0394, "step": 996 }, { "epoch": 0.0707656818383462, "grad_norm": 0.48626214265823364, "learning_rate": 3.5897329397839976e-05, "loss": 1.0681, "step": 997 }, { "epoch": 0.07083666045603762, "grad_norm": 0.7382400035858154, "learning_rate": 3.5721239031346066e-05, "loss": 1.1941, "step": 998 }, { "epoch": 0.07090763907372905, "grad_norm": 0.45334354043006897, "learning_rate": 3.5545487661072906e-05, "loss": 0.7343, "step": 999 }, { "epoch": 0.07097861769142046, "grad_norm": 0.5391009449958801, "learning_rate": 3.53700762139059e-05, "loss": 1.2046, "step": 1000 }, { "epoch": 0.07104959630911188, "grad_norm": 0.4611399471759796, "learning_rate": 3.51950056149379e-05, "loss": 0.7286, "step": 1001 }, { "epoch": 0.0711205749268033, "grad_norm": 0.406324177980423, "learning_rate": 3.5020276787464056e-05, "loss": 0.817, "step": 1002 }, { "epoch": 0.07119155354449472, "grad_norm": 0.4133076071739197, "learning_rate": 3.484589065297711e-05, "loss": 0.7804, "step": 1003 }, { "epoch": 0.07126253216218614, "grad_norm": 0.4725409150123596, "learning_rate": 3.4671848131162544e-05, "loss": 1.0859, "step": 1004 }, { "epoch": 0.07133351077987757, "grad_norm": 0.39698466658592224, "learning_rate": 3.449815013989358e-05, "loss": 0.7455, "step": 1005 }, { "epoch": 0.07140448939756898, "grad_norm": 0.39133593440055847, "learning_rate": 3.4324797595226565e-05, "loss": 0.7592, "step": 1006 }, { "epoch": 0.0714754680152604, "grad_norm": 0.43078693747520447, "learning_rate": 3.415179141139591e-05, "loss": 0.6731, "step": 1007 }, { "epoch": 0.07154644663295183, "grad_norm": 0.46568024158477783, "learning_rate": 3.3979132500809405e-05, "loss": 0.6947, "step": 1008 }, { "epoch": 0.07161742525064324, "grad_norm": 0.6325703263282776, "learning_rate": 3.380682177404335e-05, "loss": 0.9359, "step": 1009 }, { "epoch": 0.07168840386833467, "grad_norm": 0.4919491112232208, "learning_rate": 3.363486013983788e-05, "loss": 0.9069, "step": 1010 }, { "epoch": 0.07175938248602609, "grad_norm": 0.5755070447921753, "learning_rate": 3.346324850509191e-05, "loss": 0.9001, "step": 1011 }, { "epoch": 0.0718303611037175, "grad_norm": 0.3590608239173889, "learning_rate": 3.329198777485869e-05, "loss": 0.8067, "step": 1012 }, { "epoch": 0.07190133972140893, "grad_norm": 0.4256442189216614, "learning_rate": 3.312107885234072e-05, "loss": 0.6523, "step": 1013 }, { "epoch": 0.07197231833910035, "grad_norm": 0.4097779393196106, "learning_rate": 3.2950522638885106e-05, "loss": 1.0435, "step": 1014 }, { "epoch": 0.07204329695679176, "grad_norm": 0.40019646286964417, "learning_rate": 3.2780320033978974e-05, "loss": 0.8757, "step": 1015 }, { "epoch": 0.07211427557448319, "grad_norm": 0.6098476052284241, "learning_rate": 3.261047193524439e-05, "loss": 1.092, "step": 1016 }, { "epoch": 0.07218525419217461, "grad_norm": 0.4918380081653595, "learning_rate": 3.244097923843398e-05, "loss": 0.6447, "step": 1017 }, { "epoch": 0.07225623280986603, "grad_norm": 0.4223822057247162, "learning_rate": 3.227184283742591e-05, "loss": 0.822, "step": 1018 }, { "epoch": 0.07232721142755745, "grad_norm": 0.43643802404403687, "learning_rate": 3.2103063624219345e-05, "loss": 1.3097, "step": 1019 }, { "epoch": 0.07239819004524888, "grad_norm": 0.45003101229667664, "learning_rate": 3.193464248892964e-05, "loss": 1.0872, "step": 1020 }, { "epoch": 0.07246916866294029, "grad_norm": 0.3571663796901703, "learning_rate": 3.176658031978381e-05, "loss": 0.6326, "step": 1021 }, { "epoch": 0.07254014728063171, "grad_norm": 0.6147691607475281, "learning_rate": 3.159887800311569e-05, "loss": 0.9312, "step": 1022 }, { "epoch": 0.07261112589832314, "grad_norm": 0.46768736839294434, "learning_rate": 3.1431536423361294e-05, "loss": 0.8453, "step": 1023 }, { "epoch": 0.07268210451601455, "grad_norm": 0.35765504837036133, "learning_rate": 3.126455646305416e-05, "loss": 0.6965, "step": 1024 }, { "epoch": 0.07275308313370597, "grad_norm": 0.37914925813674927, "learning_rate": 3.109793900282067e-05, "loss": 0.6912, "step": 1025 }, { "epoch": 0.0728240617513974, "grad_norm": 0.5296112895011902, "learning_rate": 3.093168492137557e-05, "loss": 0.924, "step": 1026 }, { "epoch": 0.07289504036908881, "grad_norm": 0.4362507462501526, "learning_rate": 3.076579509551703e-05, "loss": 0.909, "step": 1027 }, { "epoch": 0.07296601898678023, "grad_norm": 0.39314979314804077, "learning_rate": 3.0600270400122335e-05, "loss": 0.8277, "step": 1028 }, { "epoch": 0.07303699760447166, "grad_norm": 0.4281644821166992, "learning_rate": 3.0435111708143037e-05, "loss": 0.9005, "step": 1029 }, { "epoch": 0.07310797622216307, "grad_norm": 0.4734521806240082, "learning_rate": 3.0270319890600462e-05, "loss": 0.8484, "step": 1030 }, { "epoch": 0.0731789548398545, "grad_norm": 0.4367423951625824, "learning_rate": 3.0105895816581153e-05, "loss": 0.7923, "step": 1031 }, { "epoch": 0.07324993345754592, "grad_norm": 0.4903659224510193, "learning_rate": 2.994184035323213e-05, "loss": 0.8954, "step": 1032 }, { "epoch": 0.07332091207523733, "grad_norm": 0.4883213937282562, "learning_rate": 2.9778154365756538e-05, "loss": 1.056, "step": 1033 }, { "epoch": 0.07339189069292876, "grad_norm": 0.41054317355155945, "learning_rate": 2.9614838717408867e-05, "loss": 1.0326, "step": 1034 }, { "epoch": 0.07346286931062018, "grad_norm": 0.4107778072357178, "learning_rate": 2.945189426949053e-05, "loss": 1.0709, "step": 1035 }, { "epoch": 0.07346286931062018, "eval_loss": 1.1355592012405396, "eval_runtime": 314.3398, "eval_samples_per_second": 18.874, "eval_steps_per_second": 9.439, "step": 1035 }, { "epoch": 0.0735338479283116, "grad_norm": 0.5463880896568298, "learning_rate": 2.9289321881345254e-05, "loss": 1.0284, "step": 1036 }, { "epoch": 0.07360482654600302, "grad_norm": 0.5464505553245544, "learning_rate": 2.9127122410354647e-05, "loss": 1.1125, "step": 1037 }, { "epoch": 0.07367580516369444, "grad_norm": 0.5442793965339661, "learning_rate": 2.89652967119336e-05, "loss": 1.2379, "step": 1038 }, { "epoch": 0.07374678378138586, "grad_norm": 0.5177668929100037, "learning_rate": 2.8803845639525728e-05, "loss": 0.9388, "step": 1039 }, { "epoch": 0.07381776239907728, "grad_norm": 0.4273097813129425, "learning_rate": 2.8642770044598966e-05, "loss": 0.8004, "step": 1040 }, { "epoch": 0.0738887410167687, "grad_norm": 0.8218151926994324, "learning_rate": 2.848207077664099e-05, "loss": 1.2082, "step": 1041 }, { "epoch": 0.07395971963446012, "grad_norm": 0.490460067987442, "learning_rate": 2.8321748683154893e-05, "loss": 0.8117, "step": 1042 }, { "epoch": 0.07403069825215154, "grad_norm": 0.690358579158783, "learning_rate": 2.8161804609654484e-05, "loss": 0.9035, "step": 1043 }, { "epoch": 0.07410167686984295, "grad_norm": 0.498638778924942, "learning_rate": 2.800223939966007e-05, "loss": 0.9094, "step": 1044 }, { "epoch": 0.07417265548753438, "grad_norm": 0.7770102620124817, "learning_rate": 2.7843053894693803e-05, "loss": 1.151, "step": 1045 }, { "epoch": 0.0742436341052258, "grad_norm": 0.6199392676353455, "learning_rate": 2.7684248934275325e-05, "loss": 1.1426, "step": 1046 }, { "epoch": 0.07431461272291721, "grad_norm": 0.5609120726585388, "learning_rate": 2.7525825355917446e-05, "loss": 0.9465, "step": 1047 }, { "epoch": 0.07438559134060864, "grad_norm": 0.5529163479804993, "learning_rate": 2.73677839951215e-05, "loss": 0.8544, "step": 1048 }, { "epoch": 0.07445656995830006, "grad_norm": 0.4198525846004486, "learning_rate": 2.721012568537318e-05, "loss": 1.0313, "step": 1049 }, { "epoch": 0.07452754857599148, "grad_norm": 1.1126874685287476, "learning_rate": 2.7052851258137935e-05, "loss": 1.2214, "step": 1050 }, { "epoch": 0.0745985271936829, "grad_norm": 0.5191254615783691, "learning_rate": 2.689596154285672e-05, "loss": 0.7111, "step": 1051 }, { "epoch": 0.07466950581137433, "grad_norm": 0.5139110088348389, "learning_rate": 2.6739457366941543e-05, "loss": 1.0698, "step": 1052 }, { "epoch": 0.07474048442906574, "grad_norm": 0.6746183633804321, "learning_rate": 2.6583339555771192e-05, "loss": 1.1982, "step": 1053 }, { "epoch": 0.07481146304675716, "grad_norm": 0.528917133808136, "learning_rate": 2.6427608932686843e-05, "loss": 1.128, "step": 1054 }, { "epoch": 0.07488244166444859, "grad_norm": 0.49486279487609863, "learning_rate": 2.6272266318987603e-05, "loss": 0.7873, "step": 1055 }, { "epoch": 0.07495342028214, "grad_norm": 0.4257170557975769, "learning_rate": 2.6117312533926362e-05, "loss": 0.761, "step": 1056 }, { "epoch": 0.07502439889983142, "grad_norm": 0.6389133334159851, "learning_rate": 2.5962748394705328e-05, "loss": 0.8278, "step": 1057 }, { "epoch": 0.07509537751752285, "grad_norm": 0.5689634084701538, "learning_rate": 2.5808574716471856e-05, "loss": 0.9675, "step": 1058 }, { "epoch": 0.07516635613521426, "grad_norm": 0.4619472026824951, "learning_rate": 2.565479231231398e-05, "loss": 0.6782, "step": 1059 }, { "epoch": 0.07523733475290569, "grad_norm": 0.45457616448402405, "learning_rate": 2.55014019932563e-05, "loss": 0.8111, "step": 1060 }, { "epoch": 0.07530831337059711, "grad_norm": 0.6634262800216675, "learning_rate": 2.534840456825558e-05, "loss": 0.8143, "step": 1061 }, { "epoch": 0.07537929198828852, "grad_norm": 0.6119337677955627, "learning_rate": 2.519580084419646e-05, "loss": 0.8083, "step": 1062 }, { "epoch": 0.07545027060597995, "grad_norm": 0.3665156960487366, "learning_rate": 2.5043591625887407e-05, "loss": 0.6853, "step": 1063 }, { "epoch": 0.07552124922367137, "grad_norm": 0.7194316983222961, "learning_rate": 2.4891777716056176e-05, "loss": 1.0921, "step": 1064 }, { "epoch": 0.07559222784136278, "grad_norm": 0.3780640661716461, "learning_rate": 2.4740359915345835e-05, "loss": 0.9959, "step": 1065 }, { "epoch": 0.07566320645905421, "grad_norm": 0.5528492331504822, "learning_rate": 2.4589339022310386e-05, "loss": 0.7746, "step": 1066 }, { "epoch": 0.07573418507674563, "grad_norm": 0.532551109790802, "learning_rate": 2.4438715833410595e-05, "loss": 1.1754, "step": 1067 }, { "epoch": 0.07580516369443704, "grad_norm": 0.4763934016227722, "learning_rate": 2.4288491143009795e-05, "loss": 0.9457, "step": 1068 }, { "epoch": 0.07587614231212847, "grad_norm": 0.4264293611049652, "learning_rate": 2.413866574336975e-05, "loss": 0.7942, "step": 1069 }, { "epoch": 0.0759471209298199, "grad_norm": 0.5233449935913086, "learning_rate": 2.3989240424646355e-05, "loss": 0.9646, "step": 1070 }, { "epoch": 0.0760180995475113, "grad_norm": 0.4843657314777374, "learning_rate": 2.3840215974885615e-05, "loss": 0.8398, "step": 1071 }, { "epoch": 0.07608907816520273, "grad_norm": 0.43439170718193054, "learning_rate": 2.3691593180019366e-05, "loss": 0.7895, "step": 1072 }, { "epoch": 0.07616005678289416, "grad_norm": 0.513238787651062, "learning_rate": 2.3543372823861154e-05, "loss": 0.9543, "step": 1073 }, { "epoch": 0.07623103540058557, "grad_norm": 0.5333254337310791, "learning_rate": 2.339555568810221e-05, "loss": 1.0294, "step": 1074 }, { "epoch": 0.07630201401827699, "grad_norm": 0.44629040360450745, "learning_rate": 2.324814255230714e-05, "loss": 0.81, "step": 1075 }, { "epoch": 0.07637299263596842, "grad_norm": 0.5009654760360718, "learning_rate": 2.3101134193910024e-05, "loss": 0.95, "step": 1076 }, { "epoch": 0.07644397125365983, "grad_norm": 0.45047643780708313, "learning_rate": 2.295453138821013e-05, "loss": 0.904, "step": 1077 }, { "epoch": 0.07651494987135125, "grad_norm": 0.7342311143875122, "learning_rate": 2.2808334908367914e-05, "loss": 1.3786, "step": 1078 }, { "epoch": 0.07658592848904268, "grad_norm": 0.42931681871414185, "learning_rate": 2.266254552540097e-05, "loss": 1.0701, "step": 1079 }, { "epoch": 0.07665690710673409, "grad_norm": 0.49413037300109863, "learning_rate": 2.2517164008179882e-05, "loss": 1.0956, "step": 1080 }, { "epoch": 0.07672788572442552, "grad_norm": 0.3613120913505554, "learning_rate": 2.237219112342426e-05, "loss": 0.5434, "step": 1081 }, { "epoch": 0.07679886434211694, "grad_norm": 0.4425318241119385, "learning_rate": 2.222762763569862e-05, "loss": 0.9588, "step": 1082 }, { "epoch": 0.07686984295980835, "grad_norm": 0.5189344882965088, "learning_rate": 2.208347430740837e-05, "loss": 0.8712, "step": 1083 }, { "epoch": 0.07694082157749978, "grad_norm": 0.5996214747428894, "learning_rate": 2.1939731898795802e-05, "loss": 0.99, "step": 1084 }, { "epoch": 0.0770118001951912, "grad_norm": 0.6045960783958435, "learning_rate": 2.1796401167936143e-05, "loss": 1.0449, "step": 1085 }, { "epoch": 0.07708277881288261, "grad_norm": 0.5072668790817261, "learning_rate": 2.165348287073339e-05, "loss": 1.1151, "step": 1086 }, { "epoch": 0.07715375743057404, "grad_norm": 0.7324336767196655, "learning_rate": 2.151097776091654e-05, "loss": 1.2269, "step": 1087 }, { "epoch": 0.07722473604826546, "grad_norm": 0.5092812180519104, "learning_rate": 2.1368886590035443e-05, "loss": 0.8989, "step": 1088 }, { "epoch": 0.07729571466595687, "grad_norm": 0.42177918553352356, "learning_rate": 2.1227210107456852e-05, "loss": 0.9729, "step": 1089 }, { "epoch": 0.0773666932836483, "grad_norm": 0.38119325041770935, "learning_rate": 2.1085949060360654e-05, "loss": 0.9433, "step": 1090 }, { "epoch": 0.07743767190133972, "grad_norm": 0.35730451345443726, "learning_rate": 2.0945104193735644e-05, "loss": 0.5919, "step": 1091 }, { "epoch": 0.07750865051903114, "grad_norm": 0.4657171964645386, "learning_rate": 2.0804676250375867e-05, "loss": 0.8516, "step": 1092 }, { "epoch": 0.07757962913672256, "grad_norm": 0.42287111282348633, "learning_rate": 2.0664665970876496e-05, "loss": 0.7939, "step": 1093 }, { "epoch": 0.07765060775441399, "grad_norm": 0.4931599795818329, "learning_rate": 2.0525074093630036e-05, "loss": 0.7928, "step": 1094 }, { "epoch": 0.0777215863721054, "grad_norm": 0.4923013746738434, "learning_rate": 2.0385901354822434e-05, "loss": 1.3041, "step": 1095 }, { "epoch": 0.07779256498979682, "grad_norm": 0.5996274948120117, "learning_rate": 2.02471484884291e-05, "loss": 0.981, "step": 1096 }, { "epoch": 0.07786354360748825, "grad_norm": 0.429426908493042, "learning_rate": 2.01088162262112e-05, "loss": 0.7224, "step": 1097 }, { "epoch": 0.07793452222517966, "grad_norm": 0.617928147315979, "learning_rate": 1.9970905297711606e-05, "loss": 0.909, "step": 1098 }, { "epoch": 0.07800550084287108, "grad_norm": 0.5632895231246948, "learning_rate": 1.9833416430251174e-05, "loss": 0.9735, "step": 1099 }, { "epoch": 0.07807647946056251, "grad_norm": 0.4692310690879822, "learning_rate": 1.969635034892485e-05, "loss": 1.0158, "step": 1100 }, { "epoch": 0.07814745807825392, "grad_norm": 0.39456284046173096, "learning_rate": 1.9559707776597957e-05, "loss": 0.8835, "step": 1101 }, { "epoch": 0.07821843669594535, "grad_norm": 0.6546867489814758, "learning_rate": 1.9423489433902186e-05, "loss": 1.0172, "step": 1102 }, { "epoch": 0.07828941531363677, "grad_norm": 0.778107225894928, "learning_rate": 1.9287696039232005e-05, "loss": 1.5264, "step": 1103 }, { "epoch": 0.07836039393132818, "grad_norm": 0.3635626435279846, "learning_rate": 1.9152328308740707e-05, "loss": 0.6865, "step": 1104 }, { "epoch": 0.0784313725490196, "grad_norm": 0.6078665256500244, "learning_rate": 1.901738695633669e-05, "loss": 1.0725, "step": 1105 }, { "epoch": 0.07850235116671103, "grad_norm": 0.45997220277786255, "learning_rate": 1.888287269367979e-05, "loss": 0.7352, "step": 1106 }, { "epoch": 0.07857332978440244, "grad_norm": 0.3929154574871063, "learning_rate": 1.8748786230177294e-05, "loss": 0.7464, "step": 1107 }, { "epoch": 0.07864430840209387, "grad_norm": 0.48282280564308167, "learning_rate": 1.861512827298051e-05, "loss": 0.7389, "step": 1108 }, { "epoch": 0.0787152870197853, "grad_norm": 0.5808085203170776, "learning_rate": 1.848189952698074e-05, "loss": 1.1251, "step": 1109 }, { "epoch": 0.0787862656374767, "grad_norm": 0.404216468334198, "learning_rate": 1.834910069480571e-05, "loss": 0.7776, "step": 1110 }, { "epoch": 0.07885724425516813, "grad_norm": 0.3583569824695587, "learning_rate": 1.8216732476815933e-05, "loss": 0.7038, "step": 1111 }, { "epoch": 0.07892822287285955, "grad_norm": 0.7691454291343689, "learning_rate": 1.808479557110081e-05, "loss": 1.1151, "step": 1112 }, { "epoch": 0.07899920149055097, "grad_norm": 0.42444515228271484, "learning_rate": 1.7953290673475197e-05, "loss": 0.8253, "step": 1113 }, { "epoch": 0.07907018010824239, "grad_norm": 0.3662635087966919, "learning_rate": 1.7822218477475494e-05, "loss": 0.8391, "step": 1114 }, { "epoch": 0.07914115872593382, "grad_norm": 0.5567343831062317, "learning_rate": 1.769157967435615e-05, "loss": 0.9544, "step": 1115 }, { "epoch": 0.07921213734362523, "grad_norm": 0.6112503409385681, "learning_rate": 1.756137495308594e-05, "loss": 1.0489, "step": 1116 }, { "epoch": 0.07928311596131665, "grad_norm": 0.4554908275604248, "learning_rate": 1.7431605000344432e-05, "loss": 0.8538, "step": 1117 }, { "epoch": 0.07935409457900808, "grad_norm": 0.4239465594291687, "learning_rate": 1.7302270500518182e-05, "loss": 0.917, "step": 1118 }, { "epoch": 0.07942507319669949, "grad_norm": 0.4315805435180664, "learning_rate": 1.717337213569733e-05, "loss": 0.9517, "step": 1119 }, { "epoch": 0.07949605181439091, "grad_norm": 0.48045405745506287, "learning_rate": 1.704491058567187e-05, "loss": 1.022, "step": 1120 }, { "epoch": 0.07956703043208234, "grad_norm": 0.5694467425346375, "learning_rate": 1.6916886527928066e-05, "loss": 0.9376, "step": 1121 }, { "epoch": 0.07963800904977375, "grad_norm": 0.6039645075798035, "learning_rate": 1.6789300637645e-05, "loss": 1.1573, "step": 1122 }, { "epoch": 0.07970898766746518, "grad_norm": 0.5177348852157593, "learning_rate": 1.666215358769084e-05, "loss": 1.0799, "step": 1123 }, { "epoch": 0.0797799662851566, "grad_norm": 0.3842318058013916, "learning_rate": 1.653544604861945e-05, "loss": 0.8841, "step": 1124 }, { "epoch": 0.07985094490284801, "grad_norm": 0.57900071144104, "learning_rate": 1.6409178688666726e-05, "loss": 1.0456, "step": 1125 }, { "epoch": 0.07992192352053944, "grad_norm": 0.6663162112236023, "learning_rate": 1.6283352173747145e-05, "loss": 1.1332, "step": 1126 }, { "epoch": 0.07999290213823086, "grad_norm": 0.5811548233032227, "learning_rate": 1.615796716745026e-05, "loss": 0.9705, "step": 1127 }, { "epoch": 0.08006388075592227, "grad_norm": 0.512183666229248, "learning_rate": 1.6033024331037138e-05, "loss": 1.1737, "step": 1128 }, { "epoch": 0.0801348593736137, "grad_norm": 0.3928144872188568, "learning_rate": 1.5908524323436957e-05, "loss": 0.8263, "step": 1129 }, { "epoch": 0.08020583799130512, "grad_norm": 0.47604188323020935, "learning_rate": 1.578446780124344e-05, "loss": 0.9866, "step": 1130 }, { "epoch": 0.08027681660899653, "grad_norm": 0.609736979007721, "learning_rate": 1.566085541871145e-05, "loss": 0.9433, "step": 1131 }, { "epoch": 0.08034779522668796, "grad_norm": 0.4574166238307953, "learning_rate": 1.553768782775351e-05, "loss": 0.9907, "step": 1132 }, { "epoch": 0.08041877384437938, "grad_norm": 0.4988664388656616, "learning_rate": 1.5414965677936445e-05, "loss": 0.7608, "step": 1133 }, { "epoch": 0.0804897524620708, "grad_norm": 0.7940763235092163, "learning_rate": 1.5292689616477806e-05, "loss": 1.2404, "step": 1134 }, { "epoch": 0.08056073107976222, "grad_norm": 0.36464962363243103, "learning_rate": 1.5170860288242638e-05, "loss": 0.8057, "step": 1135 }, { "epoch": 0.08063170969745365, "grad_norm": 0.7205368876457214, "learning_rate": 1.5049478335739886e-05, "loss": 0.9051, "step": 1136 }, { "epoch": 0.08070268831514506, "grad_norm": 0.5451712012290955, "learning_rate": 1.4928544399119148e-05, "loss": 1.0423, "step": 1137 }, { "epoch": 0.08077366693283648, "grad_norm": 0.8497772812843323, "learning_rate": 1.4808059116167305e-05, "loss": 1.2004, "step": 1138 }, { "epoch": 0.08084464555052791, "grad_norm": 0.4573586881160736, "learning_rate": 1.4688023122305005e-05, "loss": 0.8203, "step": 1139 }, { "epoch": 0.08091562416821932, "grad_norm": 0.5086919665336609, "learning_rate": 1.4568437050583517e-05, "loss": 1.0065, "step": 1140 }, { "epoch": 0.08098660278591074, "grad_norm": 0.5985661745071411, "learning_rate": 1.4449301531681226e-05, "loss": 1.0, "step": 1141 }, { "epoch": 0.08105758140360217, "grad_norm": 0.5257460474967957, "learning_rate": 1.4330617193900364e-05, "loss": 1.0537, "step": 1142 }, { "epoch": 0.08112856002129358, "grad_norm": 0.5764384865760803, "learning_rate": 1.4212384663163791e-05, "loss": 1.2133, "step": 1143 }, { "epoch": 0.081199538638985, "grad_norm": 0.36415451765060425, "learning_rate": 1.4094604563011472e-05, "loss": 0.7653, "step": 1144 }, { "epoch": 0.08127051725667643, "grad_norm": 0.3863047957420349, "learning_rate": 1.3977277514597464e-05, "loss": 0.7643, "step": 1145 }, { "epoch": 0.08134149587436784, "grad_norm": 0.4379030168056488, "learning_rate": 1.3860404136686411e-05, "loss": 0.6285, "step": 1146 }, { "epoch": 0.08141247449205927, "grad_norm": 0.4873761534690857, "learning_rate": 1.3743985045650364e-05, "loss": 0.688, "step": 1147 }, { "epoch": 0.08148345310975069, "grad_norm": 0.8107748031616211, "learning_rate": 1.3628020855465572e-05, "loss": 1.2016, "step": 1148 }, { "epoch": 0.0815544317274421, "grad_norm": 0.6649096012115479, "learning_rate": 1.3512512177709224e-05, "loss": 1.1479, "step": 1149 }, { "epoch": 0.08162541034513353, "grad_norm": 0.41352081298828125, "learning_rate": 1.339745962155613e-05, "loss": 0.8386, "step": 1150 }, { "epoch": 0.08169638896282495, "grad_norm": 0.4705093502998352, "learning_rate": 1.3282863793775702e-05, "loss": 1.1097, "step": 1151 }, { "epoch": 0.08176736758051636, "grad_norm": 0.4707050919532776, "learning_rate": 1.3168725298728524e-05, "loss": 0.857, "step": 1152 }, { "epoch": 0.08183834619820779, "grad_norm": 0.4865884482860565, "learning_rate": 1.305504473836331e-05, "loss": 1.1221, "step": 1153 }, { "epoch": 0.08190932481589921, "grad_norm": 0.39382314682006836, "learning_rate": 1.294182271221377e-05, "loss": 0.5837, "step": 1154 }, { "epoch": 0.08198030343359063, "grad_norm": 0.5165771245956421, "learning_rate": 1.2829059817395261e-05, "loss": 1.2069, "step": 1155 }, { "epoch": 0.08205128205128205, "grad_norm": 0.4368021786212921, "learning_rate": 1.2716756648601857e-05, "loss": 0.8342, "step": 1156 }, { "epoch": 0.08212226066897348, "grad_norm": 0.5267868638038635, "learning_rate": 1.2604913798103034e-05, "loss": 0.9511, "step": 1157 }, { "epoch": 0.08219323928666489, "grad_norm": 0.43068644404411316, "learning_rate": 1.2493531855740625e-05, "loss": 1.0738, "step": 1158 }, { "epoch": 0.08226421790435631, "grad_norm": 0.43189457058906555, "learning_rate": 1.2382611408925793e-05, "loss": 0.7604, "step": 1159 }, { "epoch": 0.08233519652204774, "grad_norm": 0.5125055313110352, "learning_rate": 1.2272153042635704e-05, "loss": 0.7189, "step": 1160 }, { "epoch": 0.08240617513973915, "grad_norm": 0.47541847825050354, "learning_rate": 1.2162157339410718e-05, "loss": 0.9183, "step": 1161 }, { "epoch": 0.08247715375743057, "grad_norm": 0.5350788235664368, "learning_rate": 1.2052624879351104e-05, "loss": 1.1197, "step": 1162 }, { "epoch": 0.082548132375122, "grad_norm": 0.4992624521255493, "learning_rate": 1.1943556240114062e-05, "loss": 0.9298, "step": 1163 }, { "epoch": 0.08261911099281341, "grad_norm": 0.44900640845298767, "learning_rate": 1.183495199691068e-05, "loss": 0.9439, "step": 1164 }, { "epoch": 0.08269008961050484, "grad_norm": 0.4040171205997467, "learning_rate": 1.1726812722502945e-05, "loss": 0.9444, "step": 1165 }, { "epoch": 0.08276106822819626, "grad_norm": 0.3812856078147888, "learning_rate": 1.1619138987200562e-05, "loss": 0.9681, "step": 1166 }, { "epoch": 0.08283204684588767, "grad_norm": 0.4886763095855713, "learning_rate": 1.1511931358858185e-05, "loss": 0.9581, "step": 1167 }, { "epoch": 0.0829030254635791, "grad_norm": 0.5425056219100952, "learning_rate": 1.1405190402872202e-05, "loss": 0.7195, "step": 1168 }, { "epoch": 0.08297400408127052, "grad_norm": 0.48632103204727173, "learning_rate": 1.129891668217783e-05, "loss": 1.0468, "step": 1169 }, { "epoch": 0.08304498269896193, "grad_norm": 0.4761428236961365, "learning_rate": 1.119311075724625e-05, "loss": 1.111, "step": 1170 }, { "epoch": 0.08311596131665336, "grad_norm": 0.5897856950759888, "learning_rate": 1.1087773186081473e-05, "loss": 1.1972, "step": 1171 }, { "epoch": 0.08318693993434478, "grad_norm": 0.4703165590763092, "learning_rate": 1.0982904524217551e-05, "loss": 0.9719, "step": 1172 }, { "epoch": 0.0832579185520362, "grad_norm": 0.41847726702690125, "learning_rate": 1.0878505324715538e-05, "loss": 0.6459, "step": 1173 }, { "epoch": 0.08332889716972762, "grad_norm": 0.5312420129776001, "learning_rate": 1.0774576138160597e-05, "loss": 0.9006, "step": 1174 }, { "epoch": 0.08339987578741904, "grad_norm": 0.49165722727775574, "learning_rate": 1.067111751265919e-05, "loss": 0.7813, "step": 1175 }, { "epoch": 0.08347085440511046, "grad_norm": 0.4631386399269104, "learning_rate": 1.056812999383604e-05, "loss": 0.8755, "step": 1176 }, { "epoch": 0.08354183302280188, "grad_norm": 0.4755702316761017, "learning_rate": 1.0465614124831379e-05, "loss": 0.8196, "step": 1177 }, { "epoch": 0.0836128116404933, "grad_norm": 0.49447572231292725, "learning_rate": 1.0363570446297999e-05, "loss": 1.1008, "step": 1178 }, { "epoch": 0.08368379025818472, "grad_norm": 0.5133283138275146, "learning_rate": 1.0261999496398434e-05, "loss": 1.117, "step": 1179 }, { "epoch": 0.08375476887587614, "grad_norm": 1.6076154708862305, "learning_rate": 1.0160901810802115e-05, "loss": 1.458, "step": 1180 }, { "epoch": 0.08382574749356757, "grad_norm": 0.39624008536338806, "learning_rate": 1.0060277922682605e-05, "loss": 0.6493, "step": 1181 }, { "epoch": 0.08389672611125898, "grad_norm": 0.565704882144928, "learning_rate": 9.960128362714637e-06, "loss": 0.941, "step": 1182 }, { "epoch": 0.0839677047289504, "grad_norm": 0.39284053444862366, "learning_rate": 9.860453659071533e-06, "loss": 0.5983, "step": 1183 }, { "epoch": 0.08403868334664183, "grad_norm": 0.67037433385849, "learning_rate": 9.761254337422176e-06, "loss": 1.0344, "step": 1184 }, { "epoch": 0.08410966196433324, "grad_norm": 0.5679614543914795, "learning_rate": 9.662530920928403e-06, "loss": 0.9806, "step": 1185 }, { "epoch": 0.08418064058202467, "grad_norm": 0.4572247564792633, "learning_rate": 9.564283930242257e-06, "loss": 0.9108, "step": 1186 }, { "epoch": 0.08425161919971609, "grad_norm": 0.6498078107833862, "learning_rate": 9.466513883503091e-06, "loss": 1.0266, "step": 1187 }, { "epoch": 0.0843225978174075, "grad_norm": 0.4459131956100464, "learning_rate": 9.369221296335006e-06, "loss": 0.7574, "step": 1188 }, { "epoch": 0.08439357643509893, "grad_norm": 0.5533161759376526, "learning_rate": 9.272406681844015e-06, "loss": 1.1075, "step": 1189 }, { "epoch": 0.08446455505279035, "grad_norm": 0.6843585968017578, "learning_rate": 9.176070550615378e-06, "loss": 1.0174, "step": 1190 }, { "epoch": 0.08453553367048176, "grad_norm": 0.6016431450843811, "learning_rate": 9.080213410710913e-06, "loss": 0.8994, "step": 1191 }, { "epoch": 0.08460651228817319, "grad_norm": 0.4497312605381012, "learning_rate": 8.98483576766631e-06, "loss": 0.9091, "step": 1192 }, { "epoch": 0.08467749090586461, "grad_norm": 0.4260040819644928, "learning_rate": 8.889938124488517e-06, "loss": 0.686, "step": 1193 }, { "epoch": 0.08474846952355602, "grad_norm": 0.48108816146850586, "learning_rate": 8.795520981652961e-06, "loss": 0.9346, "step": 1194 }, { "epoch": 0.08481944814124745, "grad_norm": 0.45303186774253845, "learning_rate": 8.701584837101018e-06, "loss": 0.816, "step": 1195 }, { "epoch": 0.08489042675893888, "grad_norm": 0.5208687782287598, "learning_rate": 8.608130186237329e-06, "loss": 0.7334, "step": 1196 }, { "epoch": 0.08496140537663029, "grad_norm": 0.7175068855285645, "learning_rate": 8.515157521927286e-06, "loss": 1.2355, "step": 1197 }, { "epoch": 0.08503238399432171, "grad_norm": 0.5213837027549744, "learning_rate": 8.422667334494249e-06, "loss": 1.0409, "step": 1198 }, { "epoch": 0.08510336261201314, "grad_norm": 0.4475990831851959, "learning_rate": 8.330660111717192e-06, "loss": 1.0196, "step": 1199 }, { "epoch": 0.08517434122970455, "grad_norm": 0.523715615272522, "learning_rate": 8.239136338827903e-06, "loss": 0.8783, "step": 1200 }, { "epoch": 0.08524531984739597, "grad_norm": 1.0819703340530396, "learning_rate": 8.148096498508573e-06, "loss": 1.6188, "step": 1201 }, { "epoch": 0.0853162984650874, "grad_norm": 0.3292709290981293, "learning_rate": 8.05754107088923e-06, "loss": 0.5635, "step": 1202 }, { "epoch": 0.08538727708277881, "grad_norm": 0.4222264289855957, "learning_rate": 7.96747053354513e-06, "loss": 0.8573, "step": 1203 }, { "epoch": 0.08545825570047023, "grad_norm": 0.4970225691795349, "learning_rate": 7.877885361494353e-06, "loss": 0.7876, "step": 1204 }, { "epoch": 0.08552923431816166, "grad_norm": 0.5125081539154053, "learning_rate": 7.788786027195195e-06, "loss": 0.8243, "step": 1205 }, { "epoch": 0.08560021293585307, "grad_norm": 0.5454389452934265, "learning_rate": 7.700173000543742e-06, "loss": 0.8166, "step": 1206 }, { "epoch": 0.0856711915535445, "grad_norm": 0.4455251395702362, "learning_rate": 7.612046748871327e-06, "loss": 0.7249, "step": 1207 }, { "epoch": 0.08574217017123592, "grad_norm": 0.461681604385376, "learning_rate": 7.524407736942174e-06, "loss": 0.8663, "step": 1208 }, { "epoch": 0.08581314878892733, "grad_norm": 0.429550439119339, "learning_rate": 7.4372564269508445e-06, "loss": 0.9401, "step": 1209 }, { "epoch": 0.08588412740661876, "grad_norm": 0.44344857335090637, "learning_rate": 7.350593278519824e-06, "loss": 0.9269, "step": 1210 }, { "epoch": 0.08595510602431018, "grad_norm": 0.45307299494743347, "learning_rate": 7.264418748697144e-06, "loss": 0.854, "step": 1211 }, { "epoch": 0.0860260846420016, "grad_norm": 0.5577502250671387, "learning_rate": 7.178733291953865e-06, "loss": 0.8163, "step": 1212 }, { "epoch": 0.08609706325969302, "grad_norm": 0.3416275084018707, "learning_rate": 7.093537360181868e-06, "loss": 0.8937, "step": 1213 }, { "epoch": 0.08616804187738444, "grad_norm": 0.3705112338066101, "learning_rate": 7.00883140269123e-06, "loss": 0.7082, "step": 1214 }, { "epoch": 0.08623902049507585, "grad_norm": 0.5228879451751709, "learning_rate": 6.924615866208095e-06, "loss": 1.0063, "step": 1215 }, { "epoch": 0.08630999911276728, "grad_norm": 0.4439268708229065, "learning_rate": 6.840891194872112e-06, "loss": 0.9436, "step": 1216 }, { "epoch": 0.0863809777304587, "grad_norm": 0.5648981928825378, "learning_rate": 6.75765783023421e-06, "loss": 0.7849, "step": 1217 }, { "epoch": 0.08645195634815012, "grad_norm": 0.5214237570762634, "learning_rate": 6.674916211254289e-06, "loss": 0.8858, "step": 1218 }, { "epoch": 0.08652293496584154, "grad_norm": 0.5296366810798645, "learning_rate": 6.592666774298784e-06, "loss": 0.7019, "step": 1219 }, { "epoch": 0.08659391358353297, "grad_norm": 0.43214577436447144, "learning_rate": 6.510909953138511e-06, "loss": 0.6289, "step": 1220 }, { "epoch": 0.08666489220122438, "grad_norm": 0.4352208375930786, "learning_rate": 6.429646178946247e-06, "loss": 0.9008, "step": 1221 }, { "epoch": 0.0867358708189158, "grad_norm": 0.36285680532455444, "learning_rate": 6.3488758802945354e-06, "loss": 0.8594, "step": 1222 }, { "epoch": 0.08680684943660723, "grad_norm": 0.728671133518219, "learning_rate": 6.268599483153403e-06, "loss": 1.052, "step": 1223 }, { "epoch": 0.08687782805429864, "grad_norm": 0.5141593813896179, "learning_rate": 6.188817410888148e-06, "loss": 0.9049, "step": 1224 }, { "epoch": 0.08694880667199006, "grad_norm": 0.46414846181869507, "learning_rate": 6.109530084257042e-06, "loss": 1.0772, "step": 1225 }, { "epoch": 0.08701978528968149, "grad_norm": 0.46595925092697144, "learning_rate": 6.030737921409169e-06, "loss": 0.9155, "step": 1226 }, { "epoch": 0.0870907639073729, "grad_norm": 0.4655408263206482, "learning_rate": 5.952441337882153e-06, "loss": 0.9474, "step": 1227 }, { "epoch": 0.08716174252506433, "grad_norm": 0.4467454254627228, "learning_rate": 5.8746407466000464e-06, "loss": 0.7343, "step": 1228 }, { "epoch": 0.08723272114275575, "grad_norm": 0.49474817514419556, "learning_rate": 5.797336557871125e-06, "loss": 1.0227, "step": 1229 }, { "epoch": 0.08730369976044716, "grad_norm": 0.43683508038520813, "learning_rate": 5.720529179385659e-06, "loss": 0.7119, "step": 1230 }, { "epoch": 0.08737467837813859, "grad_norm": 0.47792723774909973, "learning_rate": 5.644219016213903e-06, "loss": 0.8374, "step": 1231 }, { "epoch": 0.08744565699583001, "grad_norm": 0.5540241599082947, "learning_rate": 5.568406470803799e-06, "loss": 0.9396, "step": 1232 }, { "epoch": 0.08751663561352142, "grad_norm": 0.4485909342765808, "learning_rate": 5.493091942978956e-06, "loss": 0.8863, "step": 1233 }, { "epoch": 0.08758761423121285, "grad_norm": 0.6412665843963623, "learning_rate": 5.418275829936537e-06, "loss": 0.9413, "step": 1234 }, { "epoch": 0.08765859284890427, "grad_norm": 0.46158745884895325, "learning_rate": 5.343958526245096e-06, "loss": 0.9195, "step": 1235 }, { "epoch": 0.08772957146659568, "grad_norm": 0.4636899530887604, "learning_rate": 5.270140423842607e-06, "loss": 0.7886, "step": 1236 }, { "epoch": 0.08780055008428711, "grad_norm": 0.6638062000274658, "learning_rate": 5.1968219120342775e-06, "loss": 1.1065, "step": 1237 }, { "epoch": 0.08787152870197854, "grad_norm": 0.4253000319004059, "learning_rate": 5.124003377490582e-06, "loss": 1.1271, "step": 1238 }, { "epoch": 0.08794250731966995, "grad_norm": 0.5168247222900391, "learning_rate": 5.0516852042451605e-06, "loss": 0.7275, "step": 1239 }, { "epoch": 0.08801348593736137, "grad_norm": 0.5668867230415344, "learning_rate": 4.979867773692881e-06, "loss": 0.8994, "step": 1240 }, { "epoch": 0.0880844645550528, "grad_norm": 0.5159637331962585, "learning_rate": 4.908551464587707e-06, "loss": 1.2211, "step": 1241 }, { "epoch": 0.08815544317274421, "grad_norm": 0.9431582689285278, "learning_rate": 4.8377366530408254e-06, "loss": 1.2378, "step": 1242 }, { "epoch": 0.08822642179043563, "grad_norm": 0.423022985458374, "learning_rate": 4.767423712518559e-06, "loss": 0.8821, "step": 1243 }, { "epoch": 0.08829740040812706, "grad_norm": 0.5624905824661255, "learning_rate": 4.697613013840441e-06, "loss": 0.7666, "step": 1244 }, { "epoch": 0.08836837902581847, "grad_norm": 0.7056260704994202, "learning_rate": 4.628304925177318e-06, "loss": 1.6976, "step": 1245 }, { "epoch": 0.0884393576435099, "grad_norm": 0.6922840476036072, "learning_rate": 4.559499812049251e-06, "loss": 1.151, "step": 1246 }, { "epoch": 0.08851033626120132, "grad_norm": 0.5164980888366699, "learning_rate": 4.491198037323796e-06, "loss": 0.9296, "step": 1247 }, { "epoch": 0.08858131487889273, "grad_norm": 0.5316797494888306, "learning_rate": 4.423399961213892e-06, "loss": 0.8836, "step": 1248 }, { "epoch": 0.08865229349658416, "grad_norm": 0.45848792791366577, "learning_rate": 4.3561059412760675e-06, "loss": 1.0931, "step": 1249 }, { "epoch": 0.08872327211427558, "grad_norm": 0.5291265845298767, "learning_rate": 4.2893163324085885e-06, "loss": 0.9476, "step": 1250 }, { "epoch": 0.08879425073196699, "grad_norm": 0.6027399301528931, "learning_rate": 4.223031486849472e-06, "loss": 0.967, "step": 1251 }, { "epoch": 0.08886522934965842, "grad_norm": 0.8059013485908508, "learning_rate": 4.1572517541747294e-06, "loss": 1.1932, "step": 1252 }, { "epoch": 0.08893620796734984, "grad_norm": 0.4496321976184845, "learning_rate": 4.091977481296483e-06, "loss": 0.7402, "step": 1253 }, { "epoch": 0.08900718658504125, "grad_norm": 0.5405556559562683, "learning_rate": 4.027209012461108e-06, "loss": 1.0949, "step": 1254 }, { "epoch": 0.08907816520273268, "grad_norm": 0.39425739645957947, "learning_rate": 3.962946689247471e-06, "loss": 0.7058, "step": 1255 }, { "epoch": 0.0891491438204241, "grad_norm": 0.4992437958717346, "learning_rate": 3.899190850565115e-06, "loss": 0.9825, "step": 1256 }, { "epoch": 0.08922012243811552, "grad_norm": 0.31615936756134033, "learning_rate": 3.835941832652434e-06, "loss": 0.6421, "step": 1257 }, { "epoch": 0.08929110105580694, "grad_norm": 0.5412048101425171, "learning_rate": 3.7731999690749585e-06, "loss": 1.0389, "step": 1258 }, { "epoch": 0.08936207967349837, "grad_norm": 0.4242056906223297, "learning_rate": 3.7109655907235584e-06, "loss": 1.0086, "step": 1259 }, { "epoch": 0.08943305829118978, "grad_norm": 0.4553348124027252, "learning_rate": 3.6492390258126673e-06, "loss": 0.73, "step": 1260 }, { "epoch": 0.0895040369088812, "grad_norm": 0.4538853168487549, "learning_rate": 3.5880205998786388e-06, "loss": 0.8733, "step": 1261 }, { "epoch": 0.08957501552657263, "grad_norm": 0.42450854182243347, "learning_rate": 3.5273106357779585e-06, "loss": 0.8259, "step": 1262 }, { "epoch": 0.08964599414426404, "grad_norm": 0.5689201354980469, "learning_rate": 3.4671094536855687e-06, "loss": 0.9794, "step": 1263 }, { "epoch": 0.08971697276195546, "grad_norm": 0.46215319633483887, "learning_rate": 3.40741737109318e-06, "loss": 1.3214, "step": 1264 }, { "epoch": 0.08978795137964689, "grad_norm": 0.6304516196250916, "learning_rate": 3.3482347028075513e-06, "loss": 1.1341, "step": 1265 }, { "epoch": 0.0898589299973383, "grad_norm": 0.5463753938674927, "learning_rate": 3.2895617609489336e-06, "loss": 0.8568, "step": 1266 }, { "epoch": 0.08992990861502972, "grad_norm": 0.7814898490905762, "learning_rate": 3.231398854949297e-06, "loss": 1.2902, "step": 1267 }, { "epoch": 0.09000088723272115, "grad_norm": 0.395901083946228, "learning_rate": 3.1737462915508277e-06, "loss": 0.9164, "step": 1268 }, { "epoch": 0.09007186585041256, "grad_norm": 0.47108373045921326, "learning_rate": 3.1166043748041772e-06, "loss": 0.9783, "step": 1269 }, { "epoch": 0.09014284446810399, "grad_norm": 0.6641191244125366, "learning_rate": 3.059973406066963e-06, "loss": 1.2843, "step": 1270 }, { "epoch": 0.09021382308579541, "grad_norm": 0.44633758068084717, "learning_rate": 3.0038536840021335e-06, "loss": 0.8454, "step": 1271 }, { "epoch": 0.09028480170348682, "grad_norm": 0.49430206418037415, "learning_rate": 2.948245504576419e-06, "loss": 1.2095, "step": 1272 }, { "epoch": 0.09035578032117825, "grad_norm": 0.4891290068626404, "learning_rate": 2.893149161058717e-06, "loss": 0.7466, "step": 1273 }, { "epoch": 0.09042675893886967, "grad_norm": 0.48959529399871826, "learning_rate": 2.838564944018618e-06, "loss": 0.8947, "step": 1274 }, { "epoch": 0.09049773755656108, "grad_norm": 0.4747793972492218, "learning_rate": 2.784493141324818e-06, "loss": 1.1404, "step": 1275 }, { "epoch": 0.09056871617425251, "grad_norm": 0.41771140694618225, "learning_rate": 2.730934038143607e-06, "loss": 0.9322, "step": 1276 }, { "epoch": 0.09063969479194392, "grad_norm": 0.46313968300819397, "learning_rate": 2.677887916937416e-06, "loss": 0.9321, "step": 1277 }, { "epoch": 0.09071067340963535, "grad_norm": 0.6357395052909851, "learning_rate": 2.6253550574632303e-06, "loss": 1.2126, "step": 1278 }, { "epoch": 0.09078165202732677, "grad_norm": 0.48085835576057434, "learning_rate": 2.573335736771254e-06, "loss": 0.7637, "step": 1279 }, { "epoch": 0.09085263064501818, "grad_norm": 0.7907678484916687, "learning_rate": 2.5218302292032816e-06, "loss": 1.1955, "step": 1280 }, { "epoch": 0.0909236092627096, "grad_norm": 0.6866028308868408, "learning_rate": 2.4708388063913844e-06, "loss": 1.1785, "step": 1281 }, { "epoch": 0.09099458788040103, "grad_norm": 0.41570764780044556, "learning_rate": 2.420361737256438e-06, "loss": 0.94, "step": 1282 }, { "epoch": 0.09106556649809244, "grad_norm": 0.4501184821128845, "learning_rate": 2.3703992880066638e-06, "loss": 1.0563, "step": 1283 }, { "epoch": 0.09113654511578387, "grad_norm": 0.47688695788383484, "learning_rate": 2.3209517221362777e-06, "loss": 0.8763, "step": 1284 }, { "epoch": 0.0912075237334753, "grad_norm": 0.467288076877594, "learning_rate": 2.2720193004240774e-06, "loss": 0.9097, "step": 1285 }, { "epoch": 0.0912785023511667, "grad_norm": 0.4979704022407532, "learning_rate": 2.22360228093208e-06, "loss": 1.069, "step": 1286 }, { "epoch": 0.09134948096885813, "grad_norm": 0.6934332847595215, "learning_rate": 2.1757009190041088e-06, "loss": 0.8044, "step": 1287 }, { "epoch": 0.09142045958654955, "grad_norm": 1.4915975332260132, "learning_rate": 2.128315467264552e-06, "loss": 0.9586, "step": 1288 }, { "epoch": 0.09149143820424097, "grad_norm": 0.3834763765335083, "learning_rate": 2.08144617561693e-06, "loss": 0.7679, "step": 1289 }, { "epoch": 0.09156241682193239, "grad_norm": 0.36206838488578796, "learning_rate": 2.035093291242607e-06, "loss": 0.7367, "step": 1290 }, { "epoch": 0.09163339543962382, "grad_norm": 0.5193326473236084, "learning_rate": 1.9892570585995363e-06, "loss": 0.9015, "step": 1291 }, { "epoch": 0.09170437405731523, "grad_norm": 0.46295395493507385, "learning_rate": 1.943937719420863e-06, "loss": 0.8407, "step": 1292 }, { "epoch": 0.09177535267500665, "grad_norm": 0.4065030813217163, "learning_rate": 1.899135512713801e-06, "loss": 0.9282, "step": 1293 }, { "epoch": 0.09184633129269808, "grad_norm": 0.4650251865386963, "learning_rate": 1.8548506747582129e-06, "loss": 1.0284, "step": 1294 }, { "epoch": 0.09191730991038949, "grad_norm": 0.43935665488243103, "learning_rate": 1.8110834391054764e-06, "loss": 0.8786, "step": 1295 }, { "epoch": 0.09198828852808091, "grad_norm": 0.3632609248161316, "learning_rate": 1.7678340365772206e-06, "loss": 0.8486, "step": 1296 }, { "epoch": 0.09205926714577234, "grad_norm": 0.5031712055206299, "learning_rate": 1.725102695264058e-06, "loss": 0.8989, "step": 1297 }, { "epoch": 0.09213024576346375, "grad_norm": 0.5061722993850708, "learning_rate": 1.6828896405244988e-06, "loss": 0.8698, "step": 1298 }, { "epoch": 0.09220122438115518, "grad_norm": 0.6190900802612305, "learning_rate": 1.6411950949836164e-06, "loss": 1.1324, "step": 1299 }, { "epoch": 0.0922722029988466, "grad_norm": 0.4328751266002655, "learning_rate": 1.6000192785320057e-06, "loss": 0.9834, "step": 1300 }, { "epoch": 0.09234318161653801, "grad_norm": 0.45743778347969055, "learning_rate": 1.5593624083245162e-06, "loss": 0.8712, "step": 1301 }, { "epoch": 0.09241416023422944, "grad_norm": 0.4247590899467468, "learning_rate": 1.5192246987791981e-06, "loss": 0.7241, "step": 1302 }, { "epoch": 0.09248513885192086, "grad_norm": 0.5770044326782227, "learning_rate": 1.4796063615760913e-06, "loss": 0.8319, "step": 1303 }, { "epoch": 0.09255611746961227, "grad_norm": 0.4635300934314728, "learning_rate": 1.4405076056561828e-06, "loss": 1.0233, "step": 1304 }, { "epoch": 0.0926270960873037, "grad_norm": 0.42207738757133484, "learning_rate": 1.40192863722024e-06, "loss": 0.7407, "step": 1305 }, { "epoch": 0.09269807470499512, "grad_norm": 0.5023476481437683, "learning_rate": 1.3638696597277679e-06, "loss": 0.9832, "step": 1306 }, { "epoch": 0.09276905332268653, "grad_norm": 0.5583863258361816, "learning_rate": 1.326330873895909e-06, "loss": 1.1022, "step": 1307 }, { "epoch": 0.09284003194037796, "grad_norm": 0.4288213551044464, "learning_rate": 1.28931247769839e-06, "loss": 0.9897, "step": 1308 }, { "epoch": 0.09291101055806938, "grad_norm": 0.5217162370681763, "learning_rate": 1.2528146663645102e-06, "loss": 0.8312, "step": 1309 }, { "epoch": 0.0929819891757608, "grad_norm": 0.6898660063743591, "learning_rate": 1.216837632378065e-06, "loss": 1.0515, "step": 1310 }, { "epoch": 0.09305296779345222, "grad_norm": 0.48667794466018677, "learning_rate": 1.1813815654763472e-06, "loss": 0.8842, "step": 1311 }, { "epoch": 0.09312394641114365, "grad_norm": 0.4529171884059906, "learning_rate": 1.146446652649169e-06, "loss": 1.3358, "step": 1312 }, { "epoch": 0.09319492502883506, "grad_norm": 0.6250870227813721, "learning_rate": 1.1120330781378307e-06, "loss": 0.8719, "step": 1313 }, { "epoch": 0.09326590364652648, "grad_norm": 0.4203828275203705, "learning_rate": 1.0781410234342094e-06, "loss": 0.7693, "step": 1314 }, { "epoch": 0.09333688226421791, "grad_norm": 0.40615299344062805, "learning_rate": 1.0447706672797264e-06, "loss": 0.7339, "step": 1315 }, { "epoch": 0.09340786088190932, "grad_norm": 0.4049832224845886, "learning_rate": 1.0119221856644712e-06, "loss": 0.7479, "step": 1316 }, { "epoch": 0.09347883949960074, "grad_norm": 0.41221579909324646, "learning_rate": 9.795957518262566e-07, "loss": 0.713, "step": 1317 }, { "epoch": 0.09354981811729217, "grad_norm": 0.42666563391685486, "learning_rate": 9.477915362496758e-07, "loss": 1.06, "step": 1318 }, { "epoch": 0.09362079673498358, "grad_norm": 0.4528525173664093, "learning_rate": 9.16509706665225e-07, "loss": 0.7928, "step": 1319 }, { "epoch": 0.093691775352675, "grad_norm": 0.5852678418159485, "learning_rate": 8.857504280484375e-07, "loss": 0.9635, "step": 1320 }, { "epoch": 0.09376275397036643, "grad_norm": 0.47171440720558167, "learning_rate": 8.555138626189618e-07, "loss": 1.0355, "step": 1321 }, { "epoch": 0.09383373258805784, "grad_norm": 0.5309885740280151, "learning_rate": 8.258001698397744e-07, "loss": 1.0872, "step": 1322 }, { "epoch": 0.09390471120574927, "grad_norm": 0.6979671716690063, "learning_rate": 7.966095064162682e-07, "loss": 1.2462, "step": 1323 }, { "epoch": 0.09397568982344069, "grad_norm": 0.5324621796607971, "learning_rate": 7.679420262954984e-07, "loss": 0.9156, "step": 1324 }, { "epoch": 0.0940466684411321, "grad_norm": 0.42955929040908813, "learning_rate": 7.397978806652939e-07, "loss": 0.636, "step": 1325 }, { "epoch": 0.09411764705882353, "grad_norm": 0.4044051468372345, "learning_rate": 7.121772179535135e-07, "loss": 0.8698, "step": 1326 }, { "epoch": 0.09418862567651495, "grad_norm": 0.5710899829864502, "learning_rate": 6.850801838272691e-07, "loss": 1.2614, "step": 1327 }, { "epoch": 0.09425960429420636, "grad_norm": 0.5085283517837524, "learning_rate": 6.585069211921035e-07, "loss": 0.9378, "step": 1328 }, { "epoch": 0.09433058291189779, "grad_norm": 0.4785238802433014, "learning_rate": 6.324575701912805e-07, "loss": 1.1437, "step": 1329 }, { "epoch": 0.09440156152958921, "grad_norm": 0.6342504024505615, "learning_rate": 6.069322682050516e-07, "loss": 0.9806, "step": 1330 }, { "epoch": 0.09447254014728063, "grad_norm": 0.5013599395751953, "learning_rate": 5.819311498499013e-07, "loss": 1.2397, "step": 1331 }, { "epoch": 0.09454351876497205, "grad_norm": 0.36016663908958435, "learning_rate": 5.57454346977837e-07, "loss": 0.7081, "step": 1332 }, { "epoch": 0.09461449738266348, "grad_norm": 0.3618755340576172, "learning_rate": 5.335019886757442e-07, "loss": 0.7656, "step": 1333 }, { "epoch": 0.09468547600035489, "grad_norm": 0.37141433358192444, "learning_rate": 5.10074201264632e-07, "loss": 0.8844, "step": 1334 }, { "epoch": 0.09475645461804631, "grad_norm": 0.4491944909095764, "learning_rate": 4.871711082990227e-07, "loss": 0.8487, "step": 1335 }, { "epoch": 0.09482743323573774, "grad_norm": 0.4374077320098877, "learning_rate": 4.647928305662852e-07, "loss": 0.8902, "step": 1336 }, { "epoch": 0.09489841185342915, "grad_norm": 0.4476800262928009, "learning_rate": 4.4293948608596925e-07, "loss": 1.0808, "step": 1337 }, { "epoch": 0.09496939047112057, "grad_norm": 0.623502790927887, "learning_rate": 4.216111901092501e-07, "loss": 1.0005, "step": 1338 }, { "epoch": 0.095040369088812, "grad_norm": 0.4662952125072479, "learning_rate": 4.008080551182403e-07, "loss": 0.7606, "step": 1339 }, { "epoch": 0.09511134770650341, "grad_norm": 0.5216524004936218, "learning_rate": 3.805301908254455e-07, "loss": 0.9909, "step": 1340 }, { "epoch": 0.09518232632419484, "grad_norm": 0.4408697485923767, "learning_rate": 3.6077770417318744e-07, "loss": 0.7013, "step": 1341 }, { "epoch": 0.09525330494188626, "grad_norm": 0.4359828233718872, "learning_rate": 3.415506993330153e-07, "loss": 1.0082, "step": 1342 }, { "epoch": 0.09532428355957767, "grad_norm": 0.3965587019920349, "learning_rate": 3.228492777051728e-07, "loss": 1.0, "step": 1343 }, { "epoch": 0.0953952621772691, "grad_norm": 0.41036057472229004, "learning_rate": 3.046735379180543e-07, "loss": 0.8307, "step": 1344 }, { "epoch": 0.09546624079496052, "grad_norm": 0.8139565587043762, "learning_rate": 2.87023575827694e-07, "loss": 1.0802, "step": 1345 }, { "epoch": 0.09553721941265193, "grad_norm": 0.9029821753501892, "learning_rate": 2.6989948451726643e-07, "loss": 2.0635, "step": 1346 }, { "epoch": 0.09560819803034336, "grad_norm": 0.5215340852737427, "learning_rate": 2.5330135429656457e-07, "loss": 0.9462, "step": 1347 }, { "epoch": 0.09567917664803478, "grad_norm": 0.4245148003101349, "learning_rate": 2.372292727015557e-07, "loss": 0.7983, "step": 1348 }, { "epoch": 0.0957501552657262, "grad_norm": 0.4835399389266968, "learning_rate": 2.2168332449390427e-07, "loss": 1.0793, "step": 1349 }, { "epoch": 0.09582113388341762, "grad_norm": 0.5569635629653931, "learning_rate": 2.066635916605386e-07, "loss": 0.9318, "step": 1350 }, { "epoch": 0.09589211250110904, "grad_norm": 0.4609260559082031, "learning_rate": 1.921701534131848e-07, "loss": 1.0345, "step": 1351 }, { "epoch": 0.09596309111880046, "grad_norm": 0.389445424079895, "learning_rate": 1.782030861880113e-07, "loss": 0.7788, "step": 1352 }, { "epoch": 0.09603406973649188, "grad_norm": 0.3875621557235718, "learning_rate": 1.6476246364515169e-07, "loss": 0.7591, "step": 1353 }, { "epoch": 0.0961050483541833, "grad_norm": 0.5420302152633667, "learning_rate": 1.518483566683826e-07, "loss": 1.1174, "step": 1354 }, { "epoch": 0.09617602697187472, "grad_norm": 0.5549187660217285, "learning_rate": 1.3946083336467964e-07, "loss": 0.9157, "step": 1355 }, { "epoch": 0.09624700558956614, "grad_norm": 0.6199358701705933, "learning_rate": 1.2759995906392874e-07, "loss": 0.8482, "step": 1356 }, { "epoch": 0.09631798420725757, "grad_norm": 0.646535336971283, "learning_rate": 1.1626579631853762e-07, "loss": 1.086, "step": 1357 }, { "epoch": 0.09638896282494898, "grad_norm": 0.4827463924884796, "learning_rate": 1.0545840490313596e-07, "loss": 0.8859, "step": 1358 }, { "epoch": 0.0964599414426404, "grad_norm": 0.3394274413585663, "learning_rate": 9.517784181422019e-08, "loss": 0.8065, "step": 1359 }, { "epoch": 0.09653092006033183, "grad_norm": 0.5661752223968506, "learning_rate": 8.542416126989805e-08, "loss": 0.9616, "step": 1360 }, { "epoch": 0.09660189867802324, "grad_norm": 0.3854394853115082, "learning_rate": 7.619741470955566e-08, "loss": 0.7678, "step": 1361 }, { "epoch": 0.09667287729571467, "grad_norm": 0.6302008032798767, "learning_rate": 6.749765079363534e-08, "loss": 1.1516, "step": 1362 }, { "epoch": 0.09674385591340609, "grad_norm": 0.4585036635398865, "learning_rate": 5.9324915403324855e-08, "loss": 0.6375, "step": 1363 }, { "epoch": 0.0968148345310975, "grad_norm": 0.493896484375, "learning_rate": 5.167925164037968e-08, "loss": 0.7289, "step": 1364 }, { "epoch": 0.09688581314878893, "grad_norm": 0.42399123311042786, "learning_rate": 4.456069982684552e-08, "loss": 1.0201, "step": 1365 }, { "epoch": 0.09695679176648035, "grad_norm": 0.6302375793457031, "learning_rate": 3.796929750485845e-08, "loss": 1.1759, "step": 1366 }, { "epoch": 0.09702777038417176, "grad_norm": 0.4702526926994324, "learning_rate": 3.190507943644505e-08, "loss": 0.8664, "step": 1367 }, { "epoch": 0.09709874900186319, "grad_norm": 0.5638194680213928, "learning_rate": 2.6368077603367015e-08, "loss": 1.117, "step": 1368 }, { "epoch": 0.09716972761955461, "grad_norm": 0.5271203517913818, "learning_rate": 2.135832120689907e-08, "loss": 0.8312, "step": 1369 }, { "epoch": 0.09724070623724602, "grad_norm": 0.43867847323417664, "learning_rate": 1.687583666772907e-08, "loss": 0.7566, "step": 1370 }, { "epoch": 0.09731168485493745, "grad_norm": 0.45883628726005554, "learning_rate": 1.2920647625769277e-08, "loss": 0.9586, "step": 1371 }, { "epoch": 0.09738266347262887, "grad_norm": 0.542040228843689, "learning_rate": 9.49277494008971e-09, "loss": 0.9555, "step": 1372 }, { "epoch": 0.09745364209032029, "grad_norm": 0.45585867762565613, "learning_rate": 6.592236688762743e-09, "loss": 0.8365, "step": 1373 }, { "epoch": 0.09752462070801171, "grad_norm": 0.5066502094268799, "learning_rate": 4.219048168763174e-09, "loss": 0.8912, "step": 1374 }, { "epoch": 0.09759559932570314, "grad_norm": 0.38993802666664124, "learning_rate": 2.3732218959349185e-09, "loss": 0.6505, "step": 1375 }, { "epoch": 0.09766657794339455, "grad_norm": 0.6296946406364441, "learning_rate": 1.0547676048688892e-09, "loss": 0.8125, "step": 1376 }, { "epoch": 0.09773755656108597, "grad_norm": 0.6637731194496155, "learning_rate": 2.6369224888078693e-10, "loss": 0.8864, "step": 1377 }, { "epoch": 0.0978085351787774, "grad_norm": 0.5165863037109375, "learning_rate": 0.0, "loss": 1.3353, "step": 1378 } ], "logging_steps": 1, "max_steps": 1378, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 345, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.028988515155968e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }