{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.27237964506324147, "eval_steps": 348, "global_step": 1389, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019609765663300324, "grad_norm": 20.33372688293457, "learning_rate": 2e-05, "loss": 3.0843, "step": 1 }, { "epoch": 0.00019609765663300324, "eval_loss": 1.1017773151397705, "eval_runtime": 79.9135, "eval_samples_per_second": 26.879, "eval_steps_per_second": 13.44, "step": 1 }, { "epoch": 0.0003921953132660065, "grad_norm": 19.32895278930664, "learning_rate": 4e-05, "loss": 3.2221, "step": 2 }, { "epoch": 0.0005882929698990097, "grad_norm": 18.6882266998291, "learning_rate": 6e-05, "loss": 3.8951, "step": 3 }, { "epoch": 0.000784390626532013, "grad_norm": 43.008060455322266, "learning_rate": 8e-05, "loss": 5.167, "step": 4 }, { "epoch": 0.0009804882831650162, "grad_norm": 21.642993927001953, "learning_rate": 0.0001, "loss": 3.1304, "step": 5 }, { "epoch": 0.0011765859397980193, "grad_norm": 29.79266929626465, "learning_rate": 0.00012, "loss": 4.5153, "step": 6 }, { "epoch": 0.0013726835964310226, "grad_norm": 25.503681182861328, "learning_rate": 0.00014, "loss": 3.8083, "step": 7 }, { "epoch": 0.001568781253064026, "grad_norm": 32.35524368286133, "learning_rate": 0.00016, "loss": 4.253, "step": 8 }, { "epoch": 0.0017648789096970292, "grad_norm": 21.053390502929688, "learning_rate": 0.00018, "loss": 3.3757, "step": 9 }, { "epoch": 0.0019609765663300325, "grad_norm": 25.7067928314209, "learning_rate": 0.0002, "loss": 3.2484, "step": 10 }, { "epoch": 0.0021570742229630358, "grad_norm": 22.57227897644043, "learning_rate": 0.00019999974049780868, "loss": 2.8378, "step": 11 }, { "epoch": 0.0023531718795960386, "grad_norm": 19.06597900390625, "learning_rate": 0.00019999896199258152, "loss": 3.231, "step": 12 }, { "epoch": 0.002549269536229042, "grad_norm": 17.590620040893555, "learning_rate": 0.000199997664488359, "loss": 2.2391, "step": 13 }, { "epoch": 0.002745367192862045, "grad_norm": 8.627043724060059, "learning_rate": 0.00019999584799187522, "loss": 1.7095, "step": 14 }, { "epoch": 0.0029414648494950485, "grad_norm": 21.60858917236328, "learning_rate": 0.0001999935125125579, "loss": 3.9299, "step": 15 }, { "epoch": 0.003137562506128052, "grad_norm": 8.075380325317383, "learning_rate": 0.00019999065806252829, "loss": 1.7939, "step": 16 }, { "epoch": 0.003333660162761055, "grad_norm": 11.393594741821289, "learning_rate": 0.00019998728465660105, "loss": 1.601, "step": 17 }, { "epoch": 0.0035297578193940584, "grad_norm": 8.256339073181152, "learning_rate": 0.00019998339231228434, "loss": 3.1556, "step": 18 }, { "epoch": 0.0037258554760270617, "grad_norm": 20.03615951538086, "learning_rate": 0.0001999789810497796, "loss": 2.0883, "step": 19 }, { "epoch": 0.003921953132660065, "grad_norm": 10.166353225708008, "learning_rate": 0.0001999740508919815, "loss": 3.5616, "step": 20 }, { "epoch": 0.004118050789293068, "grad_norm": 15.80553913116455, "learning_rate": 0.0001999686018644777, "loss": 3.0344, "step": 21 }, { "epoch": 0.0043141484459260715, "grad_norm": 7.451974391937256, "learning_rate": 0.00019996263399554897, "loss": 2.1049, "step": 22 }, { "epoch": 0.004510246102559075, "grad_norm": 5.434274673461914, "learning_rate": 0.00019995614731616875, "loss": 2.3178, "step": 23 }, { "epoch": 0.004706343759192077, "grad_norm": 10.594315528869629, "learning_rate": 0.00019994914186000328, "loss": 1.7096, "step": 24 }, { "epoch": 0.0049024414158250805, "grad_norm": 5.348718166351318, "learning_rate": 0.0001999416176634111, "loss": 2.695, "step": 25 }, { "epoch": 0.005098539072458084, "grad_norm": 17.776073455810547, "learning_rate": 0.00019993357476544312, "loss": 1.7411, "step": 26 }, { "epoch": 0.005294636729091087, "grad_norm": 10.051606178283691, "learning_rate": 0.0001999250132078424, "loss": 2.6161, "step": 27 }, { "epoch": 0.00549073438572409, "grad_norm": 26.03020668029785, "learning_rate": 0.00019991593303504376, "loss": 3.3977, "step": 28 }, { "epoch": 0.005686832042357094, "grad_norm": 10.213540077209473, "learning_rate": 0.00019990633429417363, "loss": 1.2442, "step": 29 }, { "epoch": 0.005882929698990097, "grad_norm": 11.69288444519043, "learning_rate": 0.00019989621703505, "loss": 1.4702, "step": 30 }, { "epoch": 0.0060790273556231, "grad_norm": 4.343452453613281, "learning_rate": 0.00019988558131018186, "loss": 1.0779, "step": 31 }, { "epoch": 0.006275125012256104, "grad_norm": 9.106976509094238, "learning_rate": 0.00019987442717476906, "loss": 2.5887, "step": 32 }, { "epoch": 0.006471222668889107, "grad_norm": 17.658370971679688, "learning_rate": 0.00019986275468670205, "loss": 2.2258, "step": 33 }, { "epoch": 0.00666732032552211, "grad_norm": 6.7451090812683105, "learning_rate": 0.00019985056390656162, "loss": 1.7206, "step": 34 }, { "epoch": 0.0068634179821551134, "grad_norm": 28.07065200805664, "learning_rate": 0.00019983785489761837, "loss": 2.7356, "step": 35 }, { "epoch": 0.007059515638788117, "grad_norm": 11.387879371643066, "learning_rate": 0.00019982462772583266, "loss": 1.973, "step": 36 }, { "epoch": 0.00725561329542112, "grad_norm": 9.64372444152832, "learning_rate": 0.00019981088245985408, "loss": 2.7339, "step": 37 }, { "epoch": 0.007451710952054123, "grad_norm": 9.302544593811035, "learning_rate": 0.00019979661917102115, "loss": 1.7498, "step": 38 }, { "epoch": 0.007647808608687127, "grad_norm": 15.064400672912598, "learning_rate": 0.000199781837933361, "loss": 3.0109, "step": 39 }, { "epoch": 0.00784390626532013, "grad_norm": 7.281099319458008, "learning_rate": 0.00019976653882358884, "loss": 1.3118, "step": 40 }, { "epoch": 0.008040003921953132, "grad_norm": 6.4474873542785645, "learning_rate": 0.0001997507219211078, "loss": 1.408, "step": 41 }, { "epoch": 0.008236101578586136, "grad_norm": 13.101079940795898, "learning_rate": 0.00019973438730800822, "loss": 2.3367, "step": 42 }, { "epoch": 0.008432199235219139, "grad_norm": 5.951049327850342, "learning_rate": 0.00019971753506906753, "loss": 0.9101, "step": 43 }, { "epoch": 0.008628296891852143, "grad_norm": 11.212276458740234, "learning_rate": 0.00019970016529174947, "loss": 2.7058, "step": 44 }, { "epoch": 0.008824394548485145, "grad_norm": 8.68136978149414, "learning_rate": 0.0001996822780662041, "loss": 2.0276, "step": 45 }, { "epoch": 0.00902049220511815, "grad_norm": 17.70038414001465, "learning_rate": 0.00019966387348526683, "loss": 2.7989, "step": 46 }, { "epoch": 0.009216589861751152, "grad_norm": 10.247598648071289, "learning_rate": 0.00019964495164445824, "loss": 1.9618, "step": 47 }, { "epoch": 0.009412687518384154, "grad_norm": 10.378255844116211, "learning_rate": 0.0001996255126419835, "loss": 1.8003, "step": 48 }, { "epoch": 0.009608785175017159, "grad_norm": 31.620820999145508, "learning_rate": 0.0001996055565787319, "loss": 2.8785, "step": 49 }, { "epoch": 0.009804882831650161, "grad_norm": 9.976147651672363, "learning_rate": 0.0001995850835582763, "loss": 2.5605, "step": 50 }, { "epoch": 0.010000980488283165, "grad_norm": 11.751899719238281, "learning_rate": 0.00019956409368687258, "loss": 2.7556, "step": 51 }, { "epoch": 0.010197078144916168, "grad_norm": 15.828932762145996, "learning_rate": 0.000199542587073459, "loss": 2.7773, "step": 52 }, { "epoch": 0.010393175801549172, "grad_norm": 10.772979736328125, "learning_rate": 0.00019952056382965597, "loss": 1.9553, "step": 53 }, { "epoch": 0.010589273458182174, "grad_norm": 10.821427345275879, "learning_rate": 0.00019949802406976495, "loss": 1.8528, "step": 54 }, { "epoch": 0.010785371114815178, "grad_norm": 7.228662490844727, "learning_rate": 0.00019947496791076837, "loss": 1.1844, "step": 55 }, { "epoch": 0.01098146877144818, "grad_norm": 7.164773941040039, "learning_rate": 0.00019945139547232872, "loss": 1.0291, "step": 56 }, { "epoch": 0.011177566428081185, "grad_norm": 13.927733421325684, "learning_rate": 0.0001994273068767879, "loss": 1.5417, "step": 57 }, { "epoch": 0.011373664084714187, "grad_norm": 10.366493225097656, "learning_rate": 0.00019940270224916688, "loss": 1.5122, "step": 58 }, { "epoch": 0.011569761741347192, "grad_norm": 11.2214994430542, "learning_rate": 0.00019937758171716468, "loss": 1.6003, "step": 59 }, { "epoch": 0.011765859397980194, "grad_norm": 14.360090255737305, "learning_rate": 0.000199351945411158, "loss": 1.5651, "step": 60 }, { "epoch": 0.011961957054613198, "grad_norm": 17.97150993347168, "learning_rate": 0.00019932579346420038, "loss": 1.6064, "step": 61 }, { "epoch": 0.0121580547112462, "grad_norm": 10.190518379211426, "learning_rate": 0.00019929912601202151, "loss": 1.9151, "step": 62 }, { "epoch": 0.012354152367879203, "grad_norm": 13.573248863220215, "learning_rate": 0.00019927194319302677, "loss": 4.0602, "step": 63 }, { "epoch": 0.012550250024512207, "grad_norm": 16.919841766357422, "learning_rate": 0.00019924424514829606, "loss": 2.8292, "step": 64 }, { "epoch": 0.01274634768114521, "grad_norm": 58.470252990722656, "learning_rate": 0.00019921603202158354, "loss": 1.9637, "step": 65 }, { "epoch": 0.012942445337778214, "grad_norm": 18.334800720214844, "learning_rate": 0.00019918730395931649, "loss": 2.5609, "step": 66 }, { "epoch": 0.013138542994411216, "grad_norm": 12.280759811401367, "learning_rate": 0.00019915806111059486, "loss": 1.2495, "step": 67 }, { "epoch": 0.01333464065104422, "grad_norm": 8.015874862670898, "learning_rate": 0.0001991283036271903, "loss": 1.505, "step": 68 }, { "epoch": 0.013530738307677223, "grad_norm": 7.713284969329834, "learning_rate": 0.0001990980316635455, "loss": 2.3898, "step": 69 }, { "epoch": 0.013726835964310227, "grad_norm": 18.01800537109375, "learning_rate": 0.00019906724537677316, "loss": 3.0263, "step": 70 }, { "epoch": 0.01392293362094323, "grad_norm": 21.270421981811523, "learning_rate": 0.00019903594492665558, "loss": 3.2547, "step": 71 }, { "epoch": 0.014119031277576233, "grad_norm": 21.60205841064453, "learning_rate": 0.0001990041304756434, "loss": 2.577, "step": 72 }, { "epoch": 0.014315128934209236, "grad_norm": 10.01419734954834, "learning_rate": 0.00019897180218885507, "loss": 1.9092, "step": 73 }, { "epoch": 0.01451122659084224, "grad_norm": 14.10943603515625, "learning_rate": 0.00019893896023407578, "loss": 2.2377, "step": 74 }, { "epoch": 0.014707324247475242, "grad_norm": 11.310667037963867, "learning_rate": 0.0001989056047817567, "loss": 1.6645, "step": 75 }, { "epoch": 0.014903421904108247, "grad_norm": 6.586666107177734, "learning_rate": 0.0001988717360050141, "loss": 2.2651, "step": 76 }, { "epoch": 0.015099519560741249, "grad_norm": 4.402716159820557, "learning_rate": 0.00019883735407962846, "loss": 1.3483, "step": 77 }, { "epoch": 0.015295617217374253, "grad_norm": 9.384387016296387, "learning_rate": 0.00019880245918404342, "loss": 2.6391, "step": 78 }, { "epoch": 0.015491714874007256, "grad_norm": 6.753894329071045, "learning_rate": 0.000198767051499365, "loss": 2.9391, "step": 79 }, { "epoch": 0.01568781253064026, "grad_norm": 6.399787902832031, "learning_rate": 0.00019873113120936074, "loss": 3.7452, "step": 80 }, { "epoch": 0.01588391018727326, "grad_norm": 8.880107879638672, "learning_rate": 0.00019869469850045842, "loss": 1.2771, "step": 81 }, { "epoch": 0.016080007843906265, "grad_norm": 12.630661964416504, "learning_rate": 0.00019865775356174545, "loss": 2.2072, "step": 82 }, { "epoch": 0.01627610550053927, "grad_norm": 7.974503993988037, "learning_rate": 0.00019862029658496762, "loss": 1.9795, "step": 83 }, { "epoch": 0.016472203157172273, "grad_norm": 50.43594741821289, "learning_rate": 0.00019858232776452837, "loss": 1.5331, "step": 84 }, { "epoch": 0.016668300813805274, "grad_norm": 7.273484230041504, "learning_rate": 0.00019854384729748746, "loss": 2.4005, "step": 85 }, { "epoch": 0.016864398470438278, "grad_norm": 5.826492786407471, "learning_rate": 0.00019850485538356027, "loss": 2.1915, "step": 86 }, { "epoch": 0.017060496127071282, "grad_norm": 9.881019592285156, "learning_rate": 0.0001984653522251165, "loss": 2.3309, "step": 87 }, { "epoch": 0.017256593783704286, "grad_norm": 9.147713661193848, "learning_rate": 0.00019842533802717923, "loss": 1.1404, "step": 88 }, { "epoch": 0.017452691440337287, "grad_norm": 13.98263931274414, "learning_rate": 0.00019838481299742398, "loss": 1.2166, "step": 89 }, { "epoch": 0.01764878909697029, "grad_norm": 8.206791877746582, "learning_rate": 0.0001983437773461774, "loss": 2.6039, "step": 90 }, { "epoch": 0.017844886753603295, "grad_norm": 10.445443153381348, "learning_rate": 0.00019830223128641637, "loss": 2.3554, "step": 91 }, { "epoch": 0.0180409844102363, "grad_norm": 11.756292343139648, "learning_rate": 0.00019826017503376666, "loss": 1.7371, "step": 92 }, { "epoch": 0.0182370820668693, "grad_norm": 7.509032249450684, "learning_rate": 0.00019821760880650214, "loss": 1.389, "step": 93 }, { "epoch": 0.018433179723502304, "grad_norm": 8.619280815124512, "learning_rate": 0.00019817453282554333, "loss": 1.6818, "step": 94 }, { "epoch": 0.01862927738013531, "grad_norm": 9.11640739440918, "learning_rate": 0.00019813094731445654, "loss": 1.631, "step": 95 }, { "epoch": 0.01882537503676831, "grad_norm": 14.109521865844727, "learning_rate": 0.00019808685249945245, "loss": 2.0497, "step": 96 }, { "epoch": 0.019021472693401313, "grad_norm": 10.804281234741211, "learning_rate": 0.00019804224860938506, "loss": 2.2364, "step": 97 }, { "epoch": 0.019217570350034317, "grad_norm": 7.363731384277344, "learning_rate": 0.0001979971358757505, "loss": 1.0967, "step": 98 }, { "epoch": 0.01941366800666732, "grad_norm": 15.269912719726562, "learning_rate": 0.0001979515145326859, "loss": 2.8752, "step": 99 }, { "epoch": 0.019609765663300322, "grad_norm": 5.457535266876221, "learning_rate": 0.000197905384816968, "loss": 1.7098, "step": 100 }, { "epoch": 0.019805863319933326, "grad_norm": 4.689967632293701, "learning_rate": 0.00019785874696801202, "loss": 2.2133, "step": 101 }, { "epoch": 0.02000196097656633, "grad_norm": 10.993409156799316, "learning_rate": 0.00019781160122787046, "loss": 2.314, "step": 102 }, { "epoch": 0.020198058633199335, "grad_norm": 8.199251174926758, "learning_rate": 0.00019776394784123177, "loss": 2.5164, "step": 103 }, { "epoch": 0.020394156289832335, "grad_norm": 15.144885063171387, "learning_rate": 0.00019771578705541916, "loss": 2.0058, "step": 104 }, { "epoch": 0.02059025394646534, "grad_norm": 5.252450466156006, "learning_rate": 0.00019766711912038915, "loss": 1.7012, "step": 105 }, { "epoch": 0.020786351603098344, "grad_norm": 8.265049934387207, "learning_rate": 0.0001976179442887305, "loss": 1.8646, "step": 106 }, { "epoch": 0.020982449259731348, "grad_norm": 8.365408897399902, "learning_rate": 0.00019756826281566272, "loss": 1.9615, "step": 107 }, { "epoch": 0.02117854691636435, "grad_norm": 7.514213562011719, "learning_rate": 0.00019751807495903484, "loss": 1.4897, "step": 108 }, { "epoch": 0.021374644572997353, "grad_norm": 15.234655380249023, "learning_rate": 0.00019746738097932407, "loss": 2.0467, "step": 109 }, { "epoch": 0.021570742229630357, "grad_norm": 6.856448650360107, "learning_rate": 0.0001974161811396343, "loss": 1.4492, "step": 110 }, { "epoch": 0.021766839886263357, "grad_norm": 7.893224716186523, "learning_rate": 0.00019736447570569503, "loss": 1.919, "step": 111 }, { "epoch": 0.02196293754289636, "grad_norm": 8.966511726379395, "learning_rate": 0.0001973122649458597, "loss": 2.4484, "step": 112 }, { "epoch": 0.022159035199529366, "grad_norm": 7.631579875946045, "learning_rate": 0.00019725954913110442, "loss": 1.4992, "step": 113 }, { "epoch": 0.02235513285616237, "grad_norm": 7.418518543243408, "learning_rate": 0.0001972063285350266, "loss": 0.8401, "step": 114 }, { "epoch": 0.02255123051279537, "grad_norm": 7.739930629730225, "learning_rate": 0.00019715260343384347, "loss": 2.0713, "step": 115 }, { "epoch": 0.022747328169428375, "grad_norm": 6.441893100738525, "learning_rate": 0.00019709837410639063, "loss": 1.4438, "step": 116 }, { "epoch": 0.02294342582606138, "grad_norm": 6.008083820343018, "learning_rate": 0.0001970436408341207, "loss": 1.3503, "step": 117 }, { "epoch": 0.023139523482694383, "grad_norm": 7.100820541381836, "learning_rate": 0.00019698840390110176, "loss": 1.4726, "step": 118 }, { "epoch": 0.023335621139327384, "grad_norm": 10.213756561279297, "learning_rate": 0.0001969326635940159, "loss": 0.8107, "step": 119 }, { "epoch": 0.023531718795960388, "grad_norm": 5.251387119293213, "learning_rate": 0.00019687642020215775, "loss": 1.5542, "step": 120 }, { "epoch": 0.023727816452593392, "grad_norm": 6.100740432739258, "learning_rate": 0.00019681967401743297, "loss": 1.2512, "step": 121 }, { "epoch": 0.023923914109226396, "grad_norm": 7.356696128845215, "learning_rate": 0.00019676242533435678, "loss": 2.4725, "step": 122 }, { "epoch": 0.024120011765859397, "grad_norm": 11.542431831359863, "learning_rate": 0.00019670467445005233, "loss": 3.0307, "step": 123 }, { "epoch": 0.0243161094224924, "grad_norm": 12.166086196899414, "learning_rate": 0.00019664642166424928, "loss": 1.2784, "step": 124 }, { "epoch": 0.024512207079125405, "grad_norm": 5.222433090209961, "learning_rate": 0.00019658766727928206, "loss": 1.1759, "step": 125 }, { "epoch": 0.024708304735758406, "grad_norm": 4.77174711227417, "learning_rate": 0.00019652841160008858, "loss": 1.1041, "step": 126 }, { "epoch": 0.02490440239239141, "grad_norm": 4.879274368286133, "learning_rate": 0.0001964686549342084, "loss": 2.6326, "step": 127 }, { "epoch": 0.025100500049024414, "grad_norm": 14.171689987182617, "learning_rate": 0.00019640839759178116, "loss": 3.4144, "step": 128 }, { "epoch": 0.02529659770565742, "grad_norm": 7.598373889923096, "learning_rate": 0.00019634763988554522, "loss": 2.0596, "step": 129 }, { "epoch": 0.02549269536229042, "grad_norm": 6.88770866394043, "learning_rate": 0.00019628638213083565, "loss": 1.4691, "step": 130 }, { "epoch": 0.025688793018923423, "grad_norm": 7.128096580505371, "learning_rate": 0.00019622462464558295, "loss": 1.3307, "step": 131 }, { "epoch": 0.025884890675556427, "grad_norm": 6.430881500244141, "learning_rate": 0.00019616236775031113, "loss": 0.9491, "step": 132 }, { "epoch": 0.02608098833218943, "grad_norm": 9.912070274353027, "learning_rate": 0.00019609961176813624, "loss": 2.5006, "step": 133 }, { "epoch": 0.026277085988822432, "grad_norm": 8.550467491149902, "learning_rate": 0.0001960363570247645, "loss": 2.4952, "step": 134 }, { "epoch": 0.026473183645455436, "grad_norm": 4.201476573944092, "learning_rate": 0.0001959726038484909, "loss": 0.9033, "step": 135 }, { "epoch": 0.02666928130208844, "grad_norm": 5.774847984313965, "learning_rate": 0.00019590835257019714, "loss": 2.1291, "step": 136 }, { "epoch": 0.026865378958721445, "grad_norm": 8.179195404052734, "learning_rate": 0.00019584360352335023, "loss": 2.7527, "step": 137 }, { "epoch": 0.027061476615354445, "grad_norm": 15.658841133117676, "learning_rate": 0.0001957783570440005, "loss": 1.8304, "step": 138 }, { "epoch": 0.02725757427198745, "grad_norm": 5.7399163246154785, "learning_rate": 0.0001957126134707801, "loss": 1.7071, "step": 139 }, { "epoch": 0.027453671928620454, "grad_norm": 5.0817389488220215, "learning_rate": 0.00019564637314490108, "loss": 1.8933, "step": 140 }, { "epoch": 0.027649769585253458, "grad_norm": 5.634946346282959, "learning_rate": 0.0001955796364101535, "loss": 1.7343, "step": 141 }, { "epoch": 0.02784586724188646, "grad_norm": 6.406938552856445, "learning_rate": 0.00019551240361290407, "loss": 2.3013, "step": 142 }, { "epoch": 0.028041964898519463, "grad_norm": 8.239458084106445, "learning_rate": 0.00019544467510209388, "loss": 1.2177, "step": 143 }, { "epoch": 0.028238062555152467, "grad_norm": 11.887965202331543, "learning_rate": 0.0001953764512292369, "loss": 2.4312, "step": 144 }, { "epoch": 0.028434160211785468, "grad_norm": 7.482359409332275, "learning_rate": 0.00019530773234841803, "loss": 1.1083, "step": 145 }, { "epoch": 0.028630257868418472, "grad_norm": 8.86729621887207, "learning_rate": 0.00019523851881629126, "loss": 1.6451, "step": 146 }, { "epoch": 0.028826355525051476, "grad_norm": 7.395509719848633, "learning_rate": 0.0001951688109920778, "loss": 1.31, "step": 147 }, { "epoch": 0.02902245318168448, "grad_norm": 4.955163955688477, "learning_rate": 0.00019509860923756442, "loss": 2.5206, "step": 148 }, { "epoch": 0.02921855083831748, "grad_norm": 5.034746170043945, "learning_rate": 0.00019502791391710125, "loss": 0.9336, "step": 149 }, { "epoch": 0.029414648494950485, "grad_norm": 12.375234603881836, "learning_rate": 0.00019495672539760007, "loss": 2.1276, "step": 150 }, { "epoch": 0.02961074615158349, "grad_norm": 5.832932949066162, "learning_rate": 0.00019488504404853248, "loss": 1.3252, "step": 151 }, { "epoch": 0.029806843808216493, "grad_norm": 5.934417724609375, "learning_rate": 0.00019481287024192775, "loss": 1.5907, "step": 152 }, { "epoch": 0.030002941464849494, "grad_norm": 9.238896369934082, "learning_rate": 0.00019474020435237117, "loss": 1.1184, "step": 153 }, { "epoch": 0.030199039121482498, "grad_norm": 9.787931442260742, "learning_rate": 0.00019466704675700185, "loss": 1.4931, "step": 154 }, { "epoch": 0.030395136778115502, "grad_norm": 7.260796070098877, "learning_rate": 0.00019459339783551094, "loss": 0.8924, "step": 155 }, { "epoch": 0.030591234434748506, "grad_norm": 8.712836265563965, "learning_rate": 0.00019451925797013954, "loss": 1.586, "step": 156 }, { "epoch": 0.030787332091381507, "grad_norm": 11.15104866027832, "learning_rate": 0.00019444462754567682, "loss": 1.5007, "step": 157 }, { "epoch": 0.03098342974801451, "grad_norm": 7.158255100250244, "learning_rate": 0.00019436950694945798, "loss": 2.4118, "step": 158 }, { "epoch": 0.031179527404647515, "grad_norm": 11.58385944366455, "learning_rate": 0.00019429389657136213, "loss": 2.1638, "step": 159 }, { "epoch": 0.03137562506128052, "grad_norm": 7.469117641448975, "learning_rate": 0.00019421779680381054, "loss": 3.0682, "step": 160 }, { "epoch": 0.031571722717913524, "grad_norm": 10.78966999053955, "learning_rate": 0.00019414120804176426, "loss": 1.1822, "step": 161 }, { "epoch": 0.03176782037454652, "grad_norm": 9.68694019317627, "learning_rate": 0.00019406413068272238, "loss": 2.5351, "step": 162 }, { "epoch": 0.031963918031179525, "grad_norm": 11.67428970336914, "learning_rate": 0.00019398656512671972, "loss": 1.9244, "step": 163 }, { "epoch": 0.03216001568781253, "grad_norm": 12.72513198852539, "learning_rate": 0.00019390851177632497, "loss": 3.2138, "step": 164 }, { "epoch": 0.03235611334444553, "grad_norm": 8.345921516418457, "learning_rate": 0.00019382997103663838, "loss": 2.6435, "step": 165 }, { "epoch": 0.03255221100107854, "grad_norm": 7.740304470062256, "learning_rate": 0.0001937509433152899, "loss": 0.8189, "step": 166 }, { "epoch": 0.03274830865771154, "grad_norm": 9.329862594604492, "learning_rate": 0.0001936714290224368, "loss": 1.4106, "step": 167 }, { "epoch": 0.032944406314344546, "grad_norm": 7.179844379425049, "learning_rate": 0.00019359142857076176, "loss": 1.8125, "step": 168 }, { "epoch": 0.03314050397097755, "grad_norm": 7.835447311401367, "learning_rate": 0.00019351094237547066, "loss": 1.6617, "step": 169 }, { "epoch": 0.03333660162761055, "grad_norm": 6.018518924713135, "learning_rate": 0.0001934299708542904, "loss": 2.4333, "step": 170 }, { "epoch": 0.03353269928424355, "grad_norm": 8.176468849182129, "learning_rate": 0.00019334851442746664, "loss": 2.5915, "step": 171 }, { "epoch": 0.033728796940876556, "grad_norm": 8.241739273071289, "learning_rate": 0.00019326657351776186, "loss": 1.666, "step": 172 }, { "epoch": 0.03392489459750956, "grad_norm": 8.064835548400879, "learning_rate": 0.000193184148550453, "loss": 1.477, "step": 173 }, { "epoch": 0.034120992254142564, "grad_norm": 5.790217399597168, "learning_rate": 0.00019310123995332917, "loss": 0.7703, "step": 174 }, { "epoch": 0.03431708991077557, "grad_norm": 9.38430118560791, "learning_rate": 0.00019301784815668974, "loss": 1.5785, "step": 175 }, { "epoch": 0.03451318756740857, "grad_norm": 8.252826690673828, "learning_rate": 0.00019293397359334167, "loss": 2.1462, "step": 176 }, { "epoch": 0.03470928522404157, "grad_norm": 12.65652847290039, "learning_rate": 0.00019284961669859766, "loss": 1.3009, "step": 177 }, { "epoch": 0.034905382880674574, "grad_norm": 6.8490753173828125, "learning_rate": 0.00019276477791027374, "loss": 2.4905, "step": 178 }, { "epoch": 0.03510148053730758, "grad_norm": 4.2581048011779785, "learning_rate": 0.0001926794576686869, "loss": 0.9042, "step": 179 }, { "epoch": 0.03529757819394058, "grad_norm": 6.415445327758789, "learning_rate": 0.0001925936564166529, "loss": 2.238, "step": 180 }, { "epoch": 0.035493675850573586, "grad_norm": 13.620756149291992, "learning_rate": 0.00019250737459948405, "loss": 1.5966, "step": 181 }, { "epoch": 0.03568977350720659, "grad_norm": 10.609662055969238, "learning_rate": 0.00019242061266498675, "loss": 1.081, "step": 182 }, { "epoch": 0.035885871163839594, "grad_norm": 8.404073715209961, "learning_rate": 0.00019233337106345925, "loss": 1.849, "step": 183 }, { "epoch": 0.0360819688204726, "grad_norm": 5.560455322265625, "learning_rate": 0.00019224565024768926, "loss": 1.4533, "step": 184 }, { "epoch": 0.036278066477105596, "grad_norm": 7.896220684051514, "learning_rate": 0.00019215745067295169, "loss": 2.482, "step": 185 }, { "epoch": 0.0364741641337386, "grad_norm": 9.554024696350098, "learning_rate": 0.00019206877279700612, "loss": 1.9367, "step": 186 }, { "epoch": 0.036670261790371604, "grad_norm": 3.333113193511963, "learning_rate": 0.00019197961708009473, "loss": 1.1477, "step": 187 }, { "epoch": 0.03686635944700461, "grad_norm": 9.468240737915039, "learning_rate": 0.00019188998398493953, "loss": 1.0849, "step": 188 }, { "epoch": 0.03706245710363761, "grad_norm": 10.807921409606934, "learning_rate": 0.00019179987397674022, "loss": 2.0192, "step": 189 }, { "epoch": 0.03725855476027062, "grad_norm": 7.14724588394165, "learning_rate": 0.0001917092875231717, "loss": 2.1502, "step": 190 }, { "epoch": 0.03745465241690362, "grad_norm": 12.262707710266113, "learning_rate": 0.00019161822509438162, "loss": 2.423, "step": 191 }, { "epoch": 0.03765075007353662, "grad_norm": 35.0489387512207, "learning_rate": 0.000191526687162988, "loss": 2.5959, "step": 192 }, { "epoch": 0.03784684773016962, "grad_norm": 6.615735054016113, "learning_rate": 0.0001914346742040767, "loss": 1.7733, "step": 193 }, { "epoch": 0.038042945386802626, "grad_norm": 4.537426471710205, "learning_rate": 0.00019134218669519896, "loss": 1.0028, "step": 194 }, { "epoch": 0.03823904304343563, "grad_norm": 5.247801303863525, "learning_rate": 0.00019124922511636912, "loss": 0.8412, "step": 195 }, { "epoch": 0.038435140700068635, "grad_norm": 6.2183918952941895, "learning_rate": 0.00019115578995006173, "loss": 1.7212, "step": 196 }, { "epoch": 0.03863123835670164, "grad_norm": 9.330825805664062, "learning_rate": 0.00019106188168120948, "loss": 1.5341, "step": 197 }, { "epoch": 0.03882733601333464, "grad_norm": 9.86260986328125, "learning_rate": 0.00019096750079720037, "loss": 2.765, "step": 198 }, { "epoch": 0.03902343366996765, "grad_norm": 10.341052055358887, "learning_rate": 0.00019087264778787534, "loss": 1.9024, "step": 199 }, { "epoch": 0.039219531326600644, "grad_norm": 9.549159049987793, "learning_rate": 0.00019077732314552566, "loss": 1.2644, "step": 200 }, { "epoch": 0.03941562898323365, "grad_norm": 5.25094747543335, "learning_rate": 0.00019068152736489036, "loss": 1.334, "step": 201 }, { "epoch": 0.03961172663986665, "grad_norm": 7.197662830352783, "learning_rate": 0.00019058526094315378, "loss": 1.9093, "step": 202 }, { "epoch": 0.03980782429649966, "grad_norm": 8.476766586303711, "learning_rate": 0.0001904885243799429, "loss": 1.477, "step": 203 }, { "epoch": 0.04000392195313266, "grad_norm": 8.232537269592285, "learning_rate": 0.00019039131817732462, "loss": 1.4013, "step": 204 }, { "epoch": 0.040200019609765665, "grad_norm": 15.687997817993164, "learning_rate": 0.0001902936428398035, "loss": 1.6772, "step": 205 }, { "epoch": 0.04039611726639867, "grad_norm": 7.573246479034424, "learning_rate": 0.00019019549887431877, "loss": 1.5007, "step": 206 }, { "epoch": 0.040592214923031666, "grad_norm": 11.531679153442383, "learning_rate": 0.0001900968867902419, "loss": 2.6798, "step": 207 }, { "epoch": 0.04078831257966467, "grad_norm": 6.225399494171143, "learning_rate": 0.00018999780709937398, "loss": 1.3078, "step": 208 }, { "epoch": 0.040984410236297675, "grad_norm": 10.358306884765625, "learning_rate": 0.0001898982603159429, "loss": 1.7353, "step": 209 }, { "epoch": 0.04118050789293068, "grad_norm": 8.146821975708008, "learning_rate": 0.00018979824695660087, "loss": 1.415, "step": 210 }, { "epoch": 0.04137660554956368, "grad_norm": 4.390834808349609, "learning_rate": 0.00018969776754042156, "loss": 1.7612, "step": 211 }, { "epoch": 0.04157270320619669, "grad_norm": 7.958174228668213, "learning_rate": 0.0001895968225888976, "loss": 2.6614, "step": 212 }, { "epoch": 0.04176880086282969, "grad_norm": 9.981225967407227, "learning_rate": 0.00018949541262593762, "loss": 2.0158, "step": 213 }, { "epoch": 0.041964898519462696, "grad_norm": 4.456605911254883, "learning_rate": 0.00018939353817786387, "loss": 1.0621, "step": 214 }, { "epoch": 0.04216099617609569, "grad_norm": 7.546274662017822, "learning_rate": 0.00018929119977340917, "loss": 1.7333, "step": 215 }, { "epoch": 0.0423570938327287, "grad_norm": 11.629569053649902, "learning_rate": 0.0001891883979437143, "loss": 1.4268, "step": 216 }, { "epoch": 0.0425531914893617, "grad_norm": 17.710948944091797, "learning_rate": 0.00018908513322232528, "loss": 2.8701, "step": 217 }, { "epoch": 0.042749289145994705, "grad_norm": 6.267049789428711, "learning_rate": 0.00018898140614519054, "loss": 1.6313, "step": 218 }, { "epoch": 0.04294538680262771, "grad_norm": 4.971591949462891, "learning_rate": 0.00018887721725065814, "loss": 2.0962, "step": 219 }, { "epoch": 0.043141484459260714, "grad_norm": 5.603585243225098, "learning_rate": 0.00018877256707947306, "loss": 0.6683, "step": 220 }, { "epoch": 0.04333758211589372, "grad_norm": 6.029137134552002, "learning_rate": 0.00018866745617477423, "loss": 1.5375, "step": 221 }, { "epoch": 0.043533679772526715, "grad_norm": 7.4105143547058105, "learning_rate": 0.00018856188508209183, "loss": 1.9524, "step": 222 }, { "epoch": 0.04372977742915972, "grad_norm": 8.321500778198242, "learning_rate": 0.00018845585434934452, "loss": 2.1109, "step": 223 }, { "epoch": 0.04392587508579272, "grad_norm": 9.238992691040039, "learning_rate": 0.00018834936452683638, "loss": 1.4247, "step": 224 }, { "epoch": 0.04412197274242573, "grad_norm": 5.125700950622559, "learning_rate": 0.00018824241616725434, "loss": 1.1266, "step": 225 }, { "epoch": 0.04431807039905873, "grad_norm": 7.538069725036621, "learning_rate": 0.000188135009825665, "loss": 2.1554, "step": 226 }, { "epoch": 0.044514168055691736, "grad_norm": 8.309137344360352, "learning_rate": 0.00018802714605951199, "loss": 1.1435, "step": 227 }, { "epoch": 0.04471026571232474, "grad_norm": 22.02942657470703, "learning_rate": 0.00018791882542861302, "loss": 1.8154, "step": 228 }, { "epoch": 0.044906363368957744, "grad_norm": 7.017299652099609, "learning_rate": 0.0001878100484951569, "loss": 1.4998, "step": 229 }, { "epoch": 0.04510246102559074, "grad_norm": 18.39406394958496, "learning_rate": 0.00018770081582370068, "loss": 2.1662, "step": 230 }, { "epoch": 0.045298558682223745, "grad_norm": 9.11802864074707, "learning_rate": 0.0001875911279811667, "loss": 0.7446, "step": 231 }, { "epoch": 0.04549465633885675, "grad_norm": 7.193735122680664, "learning_rate": 0.00018748098553683968, "loss": 1.9472, "step": 232 }, { "epoch": 0.045690753995489754, "grad_norm": 23.407245635986328, "learning_rate": 0.0001873703890623637, "loss": 2.1782, "step": 233 }, { "epoch": 0.04588685165212276, "grad_norm": 6.547053813934326, "learning_rate": 0.00018725933913173938, "loss": 1.9687, "step": 234 }, { "epoch": 0.04608294930875576, "grad_norm": 10.576699256896973, "learning_rate": 0.00018714783632132068, "loss": 1.8832, "step": 235 }, { "epoch": 0.046279046965388766, "grad_norm": 5.852027416229248, "learning_rate": 0.00018703588120981207, "loss": 1.8932, "step": 236 }, { "epoch": 0.04647514462202176, "grad_norm": 7.023755073547363, "learning_rate": 0.00018692347437826548, "loss": 3.7953, "step": 237 }, { "epoch": 0.04667124227865477, "grad_norm": 13.61612606048584, "learning_rate": 0.00018681061641007737, "loss": 1.9077, "step": 238 }, { "epoch": 0.04686733993528777, "grad_norm": 5.3344526290893555, "learning_rate": 0.0001866973078909854, "loss": 1.4342, "step": 239 }, { "epoch": 0.047063437591920776, "grad_norm": 38.80408477783203, "learning_rate": 0.00018658354940906586, "loss": 2.3665, "step": 240 }, { "epoch": 0.04725953524855378, "grad_norm": 9.670344352722168, "learning_rate": 0.00018646934155473022, "loss": 0.9006, "step": 241 }, { "epoch": 0.047455632905186784, "grad_norm": 5.1102495193481445, "learning_rate": 0.00018635468492072228, "loss": 1.2289, "step": 242 }, { "epoch": 0.04765173056181979, "grad_norm": 9.1209077835083, "learning_rate": 0.00018623958010211493, "loss": 1.6009, "step": 243 }, { "epoch": 0.04784782821845279, "grad_norm": 16.793027877807617, "learning_rate": 0.0001861240276963073, "loss": 0.94, "step": 244 }, { "epoch": 0.04804392587508579, "grad_norm": 6.90054988861084, "learning_rate": 0.00018600802830302134, "loss": 1.559, "step": 245 }, { "epoch": 0.048240023531718794, "grad_norm": 13.111268043518066, "learning_rate": 0.0001858915825242991, "loss": 2.1186, "step": 246 }, { "epoch": 0.0484361211883518, "grad_norm": 6.356579780578613, "learning_rate": 0.00018577469096449925, "loss": 1.6653, "step": 247 }, { "epoch": 0.0486322188449848, "grad_norm": 9.505541801452637, "learning_rate": 0.00018565735423029404, "loss": 0.9774, "step": 248 }, { "epoch": 0.048828316501617806, "grad_norm": 8.927581787109375, "learning_rate": 0.00018553957293066632, "loss": 2.6455, "step": 249 }, { "epoch": 0.04902441415825081, "grad_norm": 7.568793773651123, "learning_rate": 0.00018542134767690616, "loss": 1.1464, "step": 250 }, { "epoch": 0.049220511814883815, "grad_norm": 7.632232189178467, "learning_rate": 0.00018530267908260784, "loss": 1.2671, "step": 251 }, { "epoch": 0.04941660947151681, "grad_norm": 4.4279561042785645, "learning_rate": 0.00018518356776366657, "loss": 2.0384, "step": 252 }, { "epoch": 0.049612707128149816, "grad_norm": 10.818602561950684, "learning_rate": 0.00018506401433827528, "loss": 1.0559, "step": 253 }, { "epoch": 0.04980880478478282, "grad_norm": 5.57148551940918, "learning_rate": 0.00018494401942692153, "loss": 0.9603, "step": 254 }, { "epoch": 0.050004902441415824, "grad_norm": 11.1985502243042, "learning_rate": 0.00018482358365238413, "loss": 2.4928, "step": 255 }, { "epoch": 0.05020100009804883, "grad_norm": 4.890799522399902, "learning_rate": 0.00018470270763973004, "loss": 1.4034, "step": 256 }, { "epoch": 0.05039709775468183, "grad_norm": 6.2078680992126465, "learning_rate": 0.00018458139201631108, "loss": 1.782, "step": 257 }, { "epoch": 0.05059319541131484, "grad_norm": 24.89278221130371, "learning_rate": 0.00018445963741176065, "loss": 3.7879, "step": 258 }, { "epoch": 0.05078929306794784, "grad_norm": 5.363570213317871, "learning_rate": 0.00018433744445799045, "loss": 1.4292, "step": 259 }, { "epoch": 0.05098539072458084, "grad_norm": 7.669764041900635, "learning_rate": 0.0001842148137891873, "loss": 2.0483, "step": 260 }, { "epoch": 0.05118148838121384, "grad_norm": 5.229150295257568, "learning_rate": 0.00018409174604180976, "loss": 3.2863, "step": 261 }, { "epoch": 0.05137758603784685, "grad_norm": 5.850373268127441, "learning_rate": 0.0001839682418545848, "loss": 1.8197, "step": 262 }, { "epoch": 0.05157368369447985, "grad_norm": 7.138283729553223, "learning_rate": 0.00018384430186850454, "loss": 2.7101, "step": 263 }, { "epoch": 0.051769781351112855, "grad_norm": 10.918169975280762, "learning_rate": 0.000183719926726823, "loss": 1.8243, "step": 264 }, { "epoch": 0.05196587900774586, "grad_norm": 9.205517768859863, "learning_rate": 0.00018359511707505258, "loss": 1.4992, "step": 265 }, { "epoch": 0.05216197666437886, "grad_norm": 8.567139625549316, "learning_rate": 0.00018346987356096086, "loss": 1.051, "step": 266 }, { "epoch": 0.05235807432101187, "grad_norm": 10.313075065612793, "learning_rate": 0.00018334419683456717, "loss": 2.6062, "step": 267 }, { "epoch": 0.052554171977644865, "grad_norm": 7.515801906585693, "learning_rate": 0.0001832180875481392, "loss": 1.266, "step": 268 }, { "epoch": 0.05275026963427787, "grad_norm": 5.345809459686279, "learning_rate": 0.00018309154635618965, "loss": 1.2526, "step": 269 }, { "epoch": 0.05294636729091087, "grad_norm": 13.568882942199707, "learning_rate": 0.00018296457391547296, "loss": 2.5183, "step": 270 }, { "epoch": 0.05314246494754388, "grad_norm": 10.022235870361328, "learning_rate": 0.00018283717088498155, "loss": 2.2774, "step": 271 }, { "epoch": 0.05333856260417688, "grad_norm": 6.537176132202148, "learning_rate": 0.0001827093379259428, "loss": 1.4989, "step": 272 }, { "epoch": 0.053534660260809885, "grad_norm": 17.213987350463867, "learning_rate": 0.00018258107570181533, "loss": 2.4885, "step": 273 }, { "epoch": 0.05373075791744289, "grad_norm": 6.48647403717041, "learning_rate": 0.00018245238487828573, "loss": 1.2309, "step": 274 }, { "epoch": 0.05392685557407589, "grad_norm": 5.479822158813477, "learning_rate": 0.000182323266123265, "loss": 1.8959, "step": 275 }, { "epoch": 0.05412295323070889, "grad_norm": 7.716124534606934, "learning_rate": 0.00018219372010688515, "loss": 1.8321, "step": 276 }, { "epoch": 0.054319050887341895, "grad_norm": 9.968965530395508, "learning_rate": 0.00018206374750149567, "loss": 4.1652, "step": 277 }, { "epoch": 0.0545151485439749, "grad_norm": 6.009235382080078, "learning_rate": 0.00018193334898166007, "loss": 0.8178, "step": 278 }, { "epoch": 0.0547112462006079, "grad_norm": 8.031886100769043, "learning_rate": 0.00018180252522415242, "loss": 1.783, "step": 279 }, { "epoch": 0.05490734385724091, "grad_norm": 5.5589680671691895, "learning_rate": 0.00018167127690795368, "loss": 1.3049, "step": 280 }, { "epoch": 0.05510344151387391, "grad_norm": 5.04995059967041, "learning_rate": 0.0001815396047142485, "loss": 0.8962, "step": 281 }, { "epoch": 0.055299539170506916, "grad_norm": 5.3526692390441895, "learning_rate": 0.0001814075093264212, "loss": 1.201, "step": 282 }, { "epoch": 0.05549563682713991, "grad_norm": 11.980429649353027, "learning_rate": 0.00018127499143005268, "loss": 0.6955, "step": 283 }, { "epoch": 0.05569173448377292, "grad_norm": 38.28229904174805, "learning_rate": 0.00018114205171291663, "loss": 1.7335, "step": 284 }, { "epoch": 0.05588783214040592, "grad_norm": 6.15138053894043, "learning_rate": 0.000181008690864976, "loss": 1.2766, "step": 285 }, { "epoch": 0.056083929797038926, "grad_norm": 7.846836566925049, "learning_rate": 0.00018087490957837944, "loss": 1.155, "step": 286 }, { "epoch": 0.05628002745367193, "grad_norm": 7.675628185272217, "learning_rate": 0.00018074070854745772, "loss": 1.6129, "step": 287 }, { "epoch": 0.056476125110304934, "grad_norm": 12.245649337768555, "learning_rate": 0.00018060608846872005, "loss": 1.7585, "step": 288 }, { "epoch": 0.05667222276693794, "grad_norm": 10.520101547241211, "learning_rate": 0.00018047105004085053, "loss": 1.9265, "step": 289 }, { "epoch": 0.056868320423570935, "grad_norm": 7.400151252746582, "learning_rate": 0.00018033559396470454, "loss": 1.4189, "step": 290 }, { "epoch": 0.05706441808020394, "grad_norm": 12.058060646057129, "learning_rate": 0.00018019972094330503, "loss": 2.3312, "step": 291 }, { "epoch": 0.057260515736836944, "grad_norm": 5.313794136047363, "learning_rate": 0.00018006343168183893, "loss": 2.0051, "step": 292 }, { "epoch": 0.05745661339346995, "grad_norm": 11.182997703552246, "learning_rate": 0.0001799267268876535, "loss": 1.4779, "step": 293 }, { "epoch": 0.05765271105010295, "grad_norm": 16.24866485595703, "learning_rate": 0.0001797896072702526, "loss": 2.4689, "step": 294 }, { "epoch": 0.057848808706735956, "grad_norm": 7.471411228179932, "learning_rate": 0.00017965207354129307, "loss": 3.0599, "step": 295 }, { "epoch": 0.05804490636336896, "grad_norm": 7.715878486633301, "learning_rate": 0.00017951412641458098, "loss": 0.8256, "step": 296 }, { "epoch": 0.058241004020001964, "grad_norm": 22.084482192993164, "learning_rate": 0.000179375766606068, "loss": 2.457, "step": 297 }, { "epoch": 0.05843710167663496, "grad_norm": 8.041847229003906, "learning_rate": 0.00017923699483384753, "loss": 1.5642, "step": 298 }, { "epoch": 0.058633199333267966, "grad_norm": 12.814888000488281, "learning_rate": 0.00017909781181815117, "loss": 1.5129, "step": 299 }, { "epoch": 0.05882929698990097, "grad_norm": 9.216371536254883, "learning_rate": 0.0001789582182813449, "loss": 2.0632, "step": 300 }, { "epoch": 0.059025394646533974, "grad_norm": 12.80371379852295, "learning_rate": 0.00017881821494792528, "loss": 2.8705, "step": 301 }, { "epoch": 0.05922149230316698, "grad_norm": 7.234943389892578, "learning_rate": 0.00017867780254451576, "loss": 2.6664, "step": 302 }, { "epoch": 0.05941758995979998, "grad_norm": 11.168726921081543, "learning_rate": 0.00017853698179986282, "loss": 1.347, "step": 303 }, { "epoch": 0.059613687616432987, "grad_norm": 19.369266510009766, "learning_rate": 0.00017839575344483238, "loss": 2.68, "step": 304 }, { "epoch": 0.059809785273065984, "grad_norm": 7.1730570793151855, "learning_rate": 0.0001782541182124057, "loss": 2.3908, "step": 305 }, { "epoch": 0.06000588292969899, "grad_norm": 7.243929862976074, "learning_rate": 0.0001781120768376759, "loss": 1.0056, "step": 306 }, { "epoch": 0.06020198058633199, "grad_norm": 7.748988628387451, "learning_rate": 0.00017796963005784394, "loss": 2.1776, "step": 307 }, { "epoch": 0.060398078242964996, "grad_norm": 13.446945190429688, "learning_rate": 0.0001778267786122148, "loss": 2.3275, "step": 308 }, { "epoch": 0.060594175899598, "grad_norm": 10.720627784729004, "learning_rate": 0.0001776835232421938, "loss": 1.046, "step": 309 }, { "epoch": 0.060790273556231005, "grad_norm": 11.274985313415527, "learning_rate": 0.00017753986469128257, "loss": 2.4269, "step": 310 }, { "epoch": 0.06098637121286401, "grad_norm": 8.671335220336914, "learning_rate": 0.00017739580370507532, "loss": 2.1488, "step": 311 }, { "epoch": 0.06118246886949701, "grad_norm": 8.375978469848633, "learning_rate": 0.0001772513410312548, "loss": 1.8458, "step": 312 }, { "epoch": 0.06137856652613001, "grad_norm": 11.178112983703613, "learning_rate": 0.00017710647741958868, "loss": 2.7169, "step": 313 }, { "epoch": 0.061574664182763014, "grad_norm": 8.29799747467041, "learning_rate": 0.00017696121362192544, "loss": 1.455, "step": 314 }, { "epoch": 0.06177076183939602, "grad_norm": 6.712766647338867, "learning_rate": 0.00017681555039219054, "loss": 1.2604, "step": 315 }, { "epoch": 0.06196685949602902, "grad_norm": 7.891608238220215, "learning_rate": 0.00017666948848638257, "loss": 2.1795, "step": 316 }, { "epoch": 0.06216295715266203, "grad_norm": 5.039219379425049, "learning_rate": 0.00017652302866256916, "loss": 0.9069, "step": 317 }, { "epoch": 0.06235905480929503, "grad_norm": 9.421103477478027, "learning_rate": 0.00017637617168088325, "loss": 2.4256, "step": 318 }, { "epoch": 0.06255515246592804, "grad_norm": 4.435902118682861, "learning_rate": 0.000176228918303519, "loss": 1.9269, "step": 319 }, { "epoch": 0.06275125012256104, "grad_norm": 10.938987731933594, "learning_rate": 0.00017608126929472795, "loss": 1.4649, "step": 320 }, { "epoch": 0.06294734777919404, "grad_norm": 6.332970142364502, "learning_rate": 0.00017593322542081485, "loss": 2.0089, "step": 321 }, { "epoch": 0.06314344543582705, "grad_norm": 6.731532573699951, "learning_rate": 0.00017578478745013392, "loss": 2.4046, "step": 322 }, { "epoch": 0.06333954309246005, "grad_norm": 8.772012710571289, "learning_rate": 0.00017563595615308474, "loss": 1.4935, "step": 323 }, { "epoch": 0.06353564074909304, "grad_norm": 5.693745136260986, "learning_rate": 0.00017548673230210823, "loss": 1.848, "step": 324 }, { "epoch": 0.06373173840572605, "grad_norm": 15.056157112121582, "learning_rate": 0.0001753371166716828, "loss": 1.4598, "step": 325 }, { "epoch": 0.06392783606235905, "grad_norm": 9.370506286621094, "learning_rate": 0.00017518711003832002, "loss": 1.4809, "step": 326 }, { "epoch": 0.06412393371899205, "grad_norm": 19.398839950561523, "learning_rate": 0.000175036713180561, "loss": 1.0093, "step": 327 }, { "epoch": 0.06432003137562506, "grad_norm": 4.393742084503174, "learning_rate": 0.00017488592687897193, "loss": 0.817, "step": 328 }, { "epoch": 0.06451612903225806, "grad_norm": 6.7713799476623535, "learning_rate": 0.00017473475191614037, "loss": 2.1701, "step": 329 }, { "epoch": 0.06471222668889107, "grad_norm": 5.920267581939697, "learning_rate": 0.00017458318907667098, "loss": 3.3491, "step": 330 }, { "epoch": 0.06490832434552407, "grad_norm": 15.095996856689453, "learning_rate": 0.0001744312391471816, "loss": 1.7637, "step": 331 }, { "epoch": 0.06510442200215708, "grad_norm": 9.470211029052734, "learning_rate": 0.00017427890291629893, "loss": 2.7744, "step": 332 }, { "epoch": 0.06530051965879008, "grad_norm": 9.082067489624023, "learning_rate": 0.00017412618117465477, "loss": 3.1791, "step": 333 }, { "epoch": 0.06549661731542308, "grad_norm": 5.174635410308838, "learning_rate": 0.0001739730747148816, "loss": 1.2189, "step": 334 }, { "epoch": 0.06569271497205609, "grad_norm": 5.053405284881592, "learning_rate": 0.00017381958433160865, "loss": 1.7119, "step": 335 }, { "epoch": 0.06588881262868909, "grad_norm": 5.771046161651611, "learning_rate": 0.0001736657108214578, "loss": 1.4188, "step": 336 }, { "epoch": 0.0660849102853221, "grad_norm": 8.400517463684082, "learning_rate": 0.00017351145498303925, "loss": 2.3167, "step": 337 }, { "epoch": 0.0662810079419551, "grad_norm": 4.6646728515625, "learning_rate": 0.0001733568176169476, "loss": 1.2102, "step": 338 }, { "epoch": 0.06647710559858809, "grad_norm": 8.288646697998047, "learning_rate": 0.0001732017995257575, "loss": 2.4803, "step": 339 }, { "epoch": 0.0666732032552211, "grad_norm": 10.970074653625488, "learning_rate": 0.00017304640151401967, "loss": 2.5839, "step": 340 }, { "epoch": 0.0668693009118541, "grad_norm": 6.0125732421875, "learning_rate": 0.00017289062438825665, "loss": 1.5807, "step": 341 }, { "epoch": 0.0670653985684871, "grad_norm": 5.844028472900391, "learning_rate": 0.0001727344689569585, "loss": 3.34, "step": 342 }, { "epoch": 0.06726149622512011, "grad_norm": 7.1026387214660645, "learning_rate": 0.00017257793603057871, "loss": 1.4347, "step": 343 }, { "epoch": 0.06745759388175311, "grad_norm": 9.198262214660645, "learning_rate": 0.00017242102642153016, "loss": 1.834, "step": 344 }, { "epoch": 0.06765369153838612, "grad_norm": 5.76854133605957, "learning_rate": 0.00017226374094418044, "loss": 0.9294, "step": 345 }, { "epoch": 0.06784978919501912, "grad_norm": 10.319186210632324, "learning_rate": 0.0001721060804148482, "loss": 2.0088, "step": 346 }, { "epoch": 0.06804588685165212, "grad_norm": 22.298240661621094, "learning_rate": 0.00017194804565179842, "loss": 2.6901, "step": 347 }, { "epoch": 0.06824198450828513, "grad_norm": 11.38401985168457, "learning_rate": 0.00017178963747523847, "loss": 2.6342, "step": 348 }, { "epoch": 0.06824198450828513, "eval_loss": 0.4400941729545593, "eval_runtime": 78.7276, "eval_samples_per_second": 27.284, "eval_steps_per_second": 13.642, "step": 348 }, { "epoch": 0.06843808216491813, "grad_norm": 7.237430095672607, "learning_rate": 0.00017163085670731371, "loss": 1.6659, "step": 349 }, { "epoch": 0.06863417982155114, "grad_norm": 10.189397811889648, "learning_rate": 0.00017147170417210333, "loss": 1.5962, "step": 350 }, { "epoch": 0.06883027747818414, "grad_norm": 11.650528907775879, "learning_rate": 0.00017131218069561593, "loss": 2.8224, "step": 351 }, { "epoch": 0.06902637513481714, "grad_norm": 5.0007147789001465, "learning_rate": 0.00017115228710578534, "loss": 1.8055, "step": 352 }, { "epoch": 0.06922247279145015, "grad_norm": 4.950777530670166, "learning_rate": 0.0001709920242324663, "loss": 1.1707, "step": 353 }, { "epoch": 0.06941857044808314, "grad_norm": 11.658537864685059, "learning_rate": 0.0001708313929074302, "loss": 1.6848, "step": 354 }, { "epoch": 0.06961466810471614, "grad_norm": 6.6815009117126465, "learning_rate": 0.00017067039396436058, "loss": 1.6768, "step": 355 }, { "epoch": 0.06981076576134915, "grad_norm": 7.522084712982178, "learning_rate": 0.00017050902823884903, "loss": 1.2074, "step": 356 }, { "epoch": 0.07000686341798215, "grad_norm": 9.470911026000977, "learning_rate": 0.00017034729656839078, "loss": 0.8594, "step": 357 }, { "epoch": 0.07020296107461516, "grad_norm": 4.798830032348633, "learning_rate": 0.00017018519979238023, "loss": 1.6142, "step": 358 }, { "epoch": 0.07039905873124816, "grad_norm": 4.626184940338135, "learning_rate": 0.0001700227387521068, "loss": 1.6397, "step": 359 }, { "epoch": 0.07059515638788116, "grad_norm": 6.68535041809082, "learning_rate": 0.00016985991429075036, "loss": 0.623, "step": 360 }, { "epoch": 0.07079125404451417, "grad_norm": 3.028799533843994, "learning_rate": 0.00016969672725337706, "loss": 0.7686, "step": 361 }, { "epoch": 0.07098735170114717, "grad_norm": 14.592923164367676, "learning_rate": 0.00016953317848693474, "loss": 1.5171, "step": 362 }, { "epoch": 0.07118344935778018, "grad_norm": 9.847641944885254, "learning_rate": 0.00016936926884024864, "loss": 2.5365, "step": 363 }, { "epoch": 0.07137954701441318, "grad_norm": 5.795401573181152, "learning_rate": 0.00016920499916401707, "loss": 1.2201, "step": 364 }, { "epoch": 0.07157564467104618, "grad_norm": 10.178254127502441, "learning_rate": 0.0001690403703108068, "loss": 2.3089, "step": 365 }, { "epoch": 0.07177174232767919, "grad_norm": 5.105298042297363, "learning_rate": 0.00016887538313504883, "loss": 1.3503, "step": 366 }, { "epoch": 0.07196783998431219, "grad_norm": 5.165500164031982, "learning_rate": 0.00016871003849303382, "loss": 1.9088, "step": 367 }, { "epoch": 0.0721639376409452, "grad_norm": 6.532622337341309, "learning_rate": 0.0001685443372429077, "loss": 1.2754, "step": 368 }, { "epoch": 0.07236003529757819, "grad_norm": 6.758758068084717, "learning_rate": 0.0001683782802446672, "loss": 1.6721, "step": 369 }, { "epoch": 0.07255613295421119, "grad_norm": 4.5956902503967285, "learning_rate": 0.0001682118683601555, "loss": 1.7976, "step": 370 }, { "epoch": 0.0727522306108442, "grad_norm": 8.587203025817871, "learning_rate": 0.00016804510245305745, "loss": 1.7933, "step": 371 }, { "epoch": 0.0729483282674772, "grad_norm": 8.018026351928711, "learning_rate": 0.00016787798338889552, "loss": 2.4964, "step": 372 }, { "epoch": 0.0731444259241102, "grad_norm": 8.505922317504883, "learning_rate": 0.00016771051203502493, "loss": 2.9766, "step": 373 }, { "epoch": 0.07334052358074321, "grad_norm": 7.051482200622559, "learning_rate": 0.00016754268926062938, "loss": 1.9882, "step": 374 }, { "epoch": 0.07353662123737621, "grad_norm": 4.620705604553223, "learning_rate": 0.00016737451593671636, "loss": 2.3198, "step": 375 }, { "epoch": 0.07373271889400922, "grad_norm": 8.614477157592773, "learning_rate": 0.00016720599293611286, "loss": 2.0611, "step": 376 }, { "epoch": 0.07392881655064222, "grad_norm": 5.840315818786621, "learning_rate": 0.0001670371211334606, "loss": 1.5961, "step": 377 }, { "epoch": 0.07412491420727522, "grad_norm": 6.045500755310059, "learning_rate": 0.00016686790140521164, "loss": 1.5132, "step": 378 }, { "epoch": 0.07432101186390823, "grad_norm": 11.521757125854492, "learning_rate": 0.0001666983346296238, "loss": 2.0003, "step": 379 }, { "epoch": 0.07451710952054123, "grad_norm": 7.7568230628967285, "learning_rate": 0.0001665284216867561, "loss": 2.4289, "step": 380 }, { "epoch": 0.07471320717717424, "grad_norm": 9.928805351257324, "learning_rate": 0.0001663581634584641, "loss": 2.2129, "step": 381 }, { "epoch": 0.07490930483380724, "grad_norm": 6.011744022369385, "learning_rate": 0.00016618756082839554, "loss": 1.8396, "step": 382 }, { "epoch": 0.07510540249044025, "grad_norm": 7.308403968811035, "learning_rate": 0.0001660166146819855, "loss": 0.9335, "step": 383 }, { "epoch": 0.07530150014707324, "grad_norm": 10.000845909118652, "learning_rate": 0.0001658453259064519, "loss": 1.7884, "step": 384 }, { "epoch": 0.07549759780370624, "grad_norm": 16.585180282592773, "learning_rate": 0.00016567369539079114, "loss": 3.0675, "step": 385 }, { "epoch": 0.07569369546033924, "grad_norm": 10.055068969726562, "learning_rate": 0.000165501724025773, "loss": 2.0398, "step": 386 }, { "epoch": 0.07588979311697225, "grad_norm": 9.56424331665039, "learning_rate": 0.0001653294127039365, "loss": 3.1253, "step": 387 }, { "epoch": 0.07608589077360525, "grad_norm": 6.628775119781494, "learning_rate": 0.0001651567623195849, "loss": 0.869, "step": 388 }, { "epoch": 0.07628198843023826, "grad_norm": 8.333356857299805, "learning_rate": 0.00016498377376878126, "loss": 2.1941, "step": 389 }, { "epoch": 0.07647808608687126, "grad_norm": 6.76808500289917, "learning_rate": 0.0001648104479493437, "loss": 1.3802, "step": 390 }, { "epoch": 0.07667418374350427, "grad_norm": 7.221822738647461, "learning_rate": 0.0001646367857608409, "loss": 1.0955, "step": 391 }, { "epoch": 0.07687028140013727, "grad_norm": 31.469024658203125, "learning_rate": 0.00016446278810458716, "loss": 2.4294, "step": 392 }, { "epoch": 0.07706637905677027, "grad_norm": 5.735899448394775, "learning_rate": 0.000164288455883638, "loss": 2.1574, "step": 393 }, { "epoch": 0.07726247671340328, "grad_norm": 6.462769031524658, "learning_rate": 0.00016411379000278524, "loss": 1.8524, "step": 394 }, { "epoch": 0.07745857437003628, "grad_norm": 12.341320037841797, "learning_rate": 0.00016393879136855248, "loss": 2.2439, "step": 395 }, { "epoch": 0.07765467202666929, "grad_norm": 6.433187007904053, "learning_rate": 0.0001637634608891903, "loss": 1.2662, "step": 396 }, { "epoch": 0.07785076968330229, "grad_norm": 5.184267997741699, "learning_rate": 0.00016358779947467158, "loss": 1.0905, "step": 397 }, { "epoch": 0.0780468673399353, "grad_norm": 8.464557647705078, "learning_rate": 0.00016341180803668674, "loss": 2.0666, "step": 398 }, { "epoch": 0.07824296499656828, "grad_norm": 9.498639106750488, "learning_rate": 0.00016323548748863907, "loss": 2.2394, "step": 399 }, { "epoch": 0.07843906265320129, "grad_norm": 7.747230052947998, "learning_rate": 0.00016305883874563994, "loss": 1.4313, "step": 400 }, { "epoch": 0.07863516030983429, "grad_norm": 8.759115219116211, "learning_rate": 0.0001628818627245041, "loss": 1.6428, "step": 401 }, { "epoch": 0.0788312579664673, "grad_norm": 5.250945568084717, "learning_rate": 0.00016270456034374474, "loss": 0.7777, "step": 402 }, { "epoch": 0.0790273556231003, "grad_norm": 9.001906394958496, "learning_rate": 0.00016252693252356916, "loss": 1.678, "step": 403 }, { "epoch": 0.0792234532797333, "grad_norm": 7.177880764007568, "learning_rate": 0.00016234898018587337, "loss": 1.4463, "step": 404 }, { "epoch": 0.07941955093636631, "grad_norm": 10.035067558288574, "learning_rate": 0.00016217070425423788, "loss": 2.3035, "step": 405 }, { "epoch": 0.07961564859299931, "grad_norm": 4.974843978881836, "learning_rate": 0.0001619921056539226, "loss": 1.2633, "step": 406 }, { "epoch": 0.07981174624963232, "grad_norm": 7.599184036254883, "learning_rate": 0.00016181318531186206, "loss": 0.8719, "step": 407 }, { "epoch": 0.08000784390626532, "grad_norm": 11.066450119018555, "learning_rate": 0.0001616339441566607, "loss": 2.2938, "step": 408 }, { "epoch": 0.08020394156289833, "grad_norm": 6.441056251525879, "learning_rate": 0.000161454383118588, "loss": 0.9166, "step": 409 }, { "epoch": 0.08040003921953133, "grad_norm": 9.670071601867676, "learning_rate": 0.00016127450312957353, "loss": 2.2514, "step": 410 }, { "epoch": 0.08059613687616433, "grad_norm": 7.870420455932617, "learning_rate": 0.00016109430512320237, "loss": 1.417, "step": 411 }, { "epoch": 0.08079223453279734, "grad_norm": 8.569132804870605, "learning_rate": 0.00016091379003471007, "loss": 1.6958, "step": 412 }, { "epoch": 0.08098833218943034, "grad_norm": 4.8630900382995605, "learning_rate": 0.00016073295880097784, "loss": 1.8919, "step": 413 }, { "epoch": 0.08118442984606333, "grad_norm": 10.894848823547363, "learning_rate": 0.0001605518123605277, "loss": 1.7899, "step": 414 }, { "epoch": 0.08138052750269634, "grad_norm": 6.055443286895752, "learning_rate": 0.00016037035165351768, "loss": 1.9432, "step": 415 }, { "epoch": 0.08157662515932934, "grad_norm": 7.106583118438721, "learning_rate": 0.0001601885776217367, "loss": 1.736, "step": 416 }, { "epoch": 0.08177272281596235, "grad_norm": 7.7677764892578125, "learning_rate": 0.00016000649120860003, "loss": 2.4425, "step": 417 }, { "epoch": 0.08196882047259535, "grad_norm": 5.940224647521973, "learning_rate": 0.00015982409335914407, "loss": 1.0954, "step": 418 }, { "epoch": 0.08216491812922835, "grad_norm": 6.025548458099365, "learning_rate": 0.00015964138502002175, "loss": 0.93, "step": 419 }, { "epoch": 0.08236101578586136, "grad_norm": 7.528570652008057, "learning_rate": 0.00015945836713949726, "loss": 1.8381, "step": 420 }, { "epoch": 0.08255711344249436, "grad_norm": 15.305673599243164, "learning_rate": 0.00015927504066744148, "loss": 1.7413, "step": 421 }, { "epoch": 0.08275321109912737, "grad_norm": 4.707043647766113, "learning_rate": 0.0001590914065553268, "loss": 1.4186, "step": 422 }, { "epoch": 0.08294930875576037, "grad_norm": 5.808046817779541, "learning_rate": 0.00015890746575622231, "loss": 1.095, "step": 423 }, { "epoch": 0.08314540641239337, "grad_norm": 6.294839859008789, "learning_rate": 0.00015872321922478884, "loss": 1.918, "step": 424 }, { "epoch": 0.08334150406902638, "grad_norm": 4.934250354766846, "learning_rate": 0.00015853866791727396, "loss": 1.2578, "step": 425 }, { "epoch": 0.08353760172565938, "grad_norm": 8.470808982849121, "learning_rate": 0.00015835381279150705, "loss": 2.0231, "step": 426 }, { "epoch": 0.08373369938229239, "grad_norm": 18.526151657104492, "learning_rate": 0.00015816865480689426, "loss": 4.1691, "step": 427 }, { "epoch": 0.08392979703892539, "grad_norm": 7.958248615264893, "learning_rate": 0.0001579831949244137, "loss": 1.066, "step": 428 }, { "epoch": 0.08412589469555838, "grad_norm": 6.250977516174316, "learning_rate": 0.00015779743410661033, "loss": 0.8756, "step": 429 }, { "epoch": 0.08432199235219139, "grad_norm": 4.739164352416992, "learning_rate": 0.00015761137331759084, "loss": 1.3986, "step": 430 }, { "epoch": 0.08451809000882439, "grad_norm": 10.723671913146973, "learning_rate": 0.00015742501352301893, "loss": 2.3962, "step": 431 }, { "epoch": 0.0847141876654574, "grad_norm": 5.097165584564209, "learning_rate": 0.00015723835569011007, "loss": 3.1665, "step": 432 }, { "epoch": 0.0849102853220904, "grad_norm": 7.0065388679504395, "learning_rate": 0.00015705140078762665, "loss": 1.4634, "step": 433 }, { "epoch": 0.0851063829787234, "grad_norm": 6.814738750457764, "learning_rate": 0.00015686414978587277, "loss": 0.9439, "step": 434 }, { "epoch": 0.0853024806353564, "grad_norm": 8.381726264953613, "learning_rate": 0.0001566766036566893, "loss": 1.6046, "step": 435 }, { "epoch": 0.08549857829198941, "grad_norm": 8.249631881713867, "learning_rate": 0.00015648876337344896, "loss": 1.6886, "step": 436 }, { "epoch": 0.08569467594862241, "grad_norm": 9.020310401916504, "learning_rate": 0.00015630062991105098, "loss": 2.5881, "step": 437 }, { "epoch": 0.08589077360525542, "grad_norm": 3.999058723449707, "learning_rate": 0.0001561122042459163, "loss": 1.8363, "step": 438 }, { "epoch": 0.08608687126188842, "grad_norm": 7.487119674682617, "learning_rate": 0.00015592348735598237, "loss": 1.4798, "step": 439 }, { "epoch": 0.08628296891852143, "grad_norm": 4.982146739959717, "learning_rate": 0.00015573448022069815, "loss": 2.7132, "step": 440 }, { "epoch": 0.08647906657515443, "grad_norm": 3.5383474826812744, "learning_rate": 0.00015554518382101892, "loss": 0.6142, "step": 441 }, { "epoch": 0.08667516423178744, "grad_norm": 3.881410598754883, "learning_rate": 0.00015535559913940126, "loss": 0.8708, "step": 442 }, { "epoch": 0.08687126188842044, "grad_norm": 5.159698963165283, "learning_rate": 0.00015516572715979806, "loss": 2.32, "step": 443 }, { "epoch": 0.08706735954505343, "grad_norm": 9.29107666015625, "learning_rate": 0.00015497556886765316, "loss": 2.5744, "step": 444 }, { "epoch": 0.08726345720168643, "grad_norm": 8.70083999633789, "learning_rate": 0.00015478512524989645, "loss": 0.8412, "step": 445 }, { "epoch": 0.08745955485831944, "grad_norm": 6.164897918701172, "learning_rate": 0.00015459439729493865, "loss": 1.4857, "step": 446 }, { "epoch": 0.08765565251495244, "grad_norm": 4.213920593261719, "learning_rate": 0.00015440338599266622, "loss": 1.2077, "step": 447 }, { "epoch": 0.08785175017158545, "grad_norm": 14.897852897644043, "learning_rate": 0.00015421209233443617, "loss": 1.815, "step": 448 }, { "epoch": 0.08804784782821845, "grad_norm": 6.773970127105713, "learning_rate": 0.00015402051731307093, "loss": 2.9485, "step": 449 }, { "epoch": 0.08824394548485145, "grad_norm": 5.569252014160156, "learning_rate": 0.0001538286619228533, "loss": 1.1505, "step": 450 }, { "epoch": 0.08844004314148446, "grad_norm": 5.61276388168335, "learning_rate": 0.0001536365271595212, "loss": 1.2065, "step": 451 }, { "epoch": 0.08863614079811746, "grad_norm": 3.9499528408050537, "learning_rate": 0.00015344411402026245, "loss": 1.3547, "step": 452 }, { "epoch": 0.08883223845475047, "grad_norm": 11.833647727966309, "learning_rate": 0.00015325142350370967, "loss": 2.8476, "step": 453 }, { "epoch": 0.08902833611138347, "grad_norm": 6.281819820404053, "learning_rate": 0.00015305845660993503, "loss": 1.0563, "step": 454 }, { "epoch": 0.08922443376801648, "grad_norm": 9.338071823120117, "learning_rate": 0.00015286521434044526, "loss": 1.9847, "step": 455 }, { "epoch": 0.08942053142464948, "grad_norm": 12.811955451965332, "learning_rate": 0.0001526716976981761, "loss": 1.2632, "step": 456 }, { "epoch": 0.08961662908128248, "grad_norm": 5.077617168426514, "learning_rate": 0.0001524779076874875, "loss": 0.9728, "step": 457 }, { "epoch": 0.08981272673791549, "grad_norm": 5.802744388580322, "learning_rate": 0.0001522838453141581, "loss": 0.9826, "step": 458 }, { "epoch": 0.09000882439454848, "grad_norm": 6.643836498260498, "learning_rate": 0.00015208951158538004, "loss": 1.3868, "step": 459 }, { "epoch": 0.09020492205118148, "grad_norm": 8.100834846496582, "learning_rate": 0.000151894907509754, "loss": 1.5572, "step": 460 }, { "epoch": 0.09040101970781449, "grad_norm": 8.695015907287598, "learning_rate": 0.00015170003409728356, "loss": 1.7788, "step": 461 }, { "epoch": 0.09059711736444749, "grad_norm": 11.81883430480957, "learning_rate": 0.00015150489235937035, "loss": 1.7457, "step": 462 }, { "epoch": 0.0907932150210805, "grad_norm": 8.165855407714844, "learning_rate": 0.00015130948330880847, "loss": 1.3886, "step": 463 }, { "epoch": 0.0909893126777135, "grad_norm": 22.797456741333008, "learning_rate": 0.00015111380795977954, "loss": 1.8671, "step": 464 }, { "epoch": 0.0911854103343465, "grad_norm": 5.275067329406738, "learning_rate": 0.00015091786732784716, "loss": 0.7186, "step": 465 }, { "epoch": 0.09138150799097951, "grad_norm": 8.38714599609375, "learning_rate": 0.00015072166242995175, "loss": 1.7764, "step": 466 }, { "epoch": 0.09157760564761251, "grad_norm": 8.916413307189941, "learning_rate": 0.0001505251942844054, "loss": 1.5589, "step": 467 }, { "epoch": 0.09177370330424552, "grad_norm": 5.007913112640381, "learning_rate": 0.00015032846391088635, "loss": 0.6688, "step": 468 }, { "epoch": 0.09196980096087852, "grad_norm": 10.685876846313477, "learning_rate": 0.0001501314723304339, "loss": 2.2118, "step": 469 }, { "epoch": 0.09216589861751152, "grad_norm": 7.626431465148926, "learning_rate": 0.00014993422056544295, "loss": 1.1849, "step": 470 }, { "epoch": 0.09236199627414453, "grad_norm": 5.862109184265137, "learning_rate": 0.00014973670963965883, "loss": 1.4594, "step": 471 }, { "epoch": 0.09255809393077753, "grad_norm": 5.194091320037842, "learning_rate": 0.00014953894057817188, "loss": 1.2243, "step": 472 }, { "epoch": 0.09275419158741054, "grad_norm": 7.806075572967529, "learning_rate": 0.0001493409144074122, "loss": 1.2365, "step": 473 }, { "epoch": 0.09295028924404353, "grad_norm": 5.004404067993164, "learning_rate": 0.00014914263215514431, "loss": 1.6081, "step": 474 }, { "epoch": 0.09314638690067653, "grad_norm": 5.245405197143555, "learning_rate": 0.00014894409485046177, "loss": 2.2626, "step": 475 }, { "epoch": 0.09334248455730954, "grad_norm": 6.017452716827393, "learning_rate": 0.00014874530352378194, "loss": 3.0452, "step": 476 }, { "epoch": 0.09353858221394254, "grad_norm": 8.504718780517578, "learning_rate": 0.00014854625920684042, "loss": 1.4529, "step": 477 }, { "epoch": 0.09373467987057554, "grad_norm": 7.850302696228027, "learning_rate": 0.00014834696293268603, "loss": 1.5511, "step": 478 }, { "epoch": 0.09393077752720855, "grad_norm": 6.846382141113281, "learning_rate": 0.00014814741573567514, "loss": 2.0178, "step": 479 }, { "epoch": 0.09412687518384155, "grad_norm": 8.834015846252441, "learning_rate": 0.00014794761865146648, "loss": 1.6438, "step": 480 }, { "epoch": 0.09432297284047456, "grad_norm": 7.127365589141846, "learning_rate": 0.00014774757271701557, "loss": 0.5588, "step": 481 }, { "epoch": 0.09451907049710756, "grad_norm": 13.859881401062012, "learning_rate": 0.00014754727897056967, "loss": 2.9212, "step": 482 }, { "epoch": 0.09471516815374056, "grad_norm": 5.914462089538574, "learning_rate": 0.0001473467384516621, "loss": 1.334, "step": 483 }, { "epoch": 0.09491126581037357, "grad_norm": 3.947435140609741, "learning_rate": 0.0001471459522011069, "loss": 1.5284, "step": 484 }, { "epoch": 0.09510736346700657, "grad_norm": 7.171452522277832, "learning_rate": 0.00014694492126099353, "loss": 1.7676, "step": 485 }, { "epoch": 0.09530346112363958, "grad_norm": 7.364241600036621, "learning_rate": 0.0001467436466746814, "loss": 1.7551, "step": 486 }, { "epoch": 0.09549955878027258, "grad_norm": 12.173360824584961, "learning_rate": 0.0001465421294867944, "loss": 2.5703, "step": 487 }, { "epoch": 0.09569565643690559, "grad_norm": 14.59663200378418, "learning_rate": 0.00014634037074321557, "loss": 1.7594, "step": 488 }, { "epoch": 0.09589175409353858, "grad_norm": 5.692611217498779, "learning_rate": 0.00014613837149108163, "loss": 2.5973, "step": 489 }, { "epoch": 0.09608785175017158, "grad_norm": 6.9588470458984375, "learning_rate": 0.00014593613277877758, "loss": 2.6119, "step": 490 }, { "epoch": 0.09628394940680458, "grad_norm": 14.370820999145508, "learning_rate": 0.0001457336556559312, "loss": 1.9495, "step": 491 }, { "epoch": 0.09648004706343759, "grad_norm": 8.259851455688477, "learning_rate": 0.0001455309411734076, "loss": 2.2663, "step": 492 }, { "epoch": 0.09667614472007059, "grad_norm": 11.025699615478516, "learning_rate": 0.00014532799038330385, "loss": 2.2184, "step": 493 }, { "epoch": 0.0968722423767036, "grad_norm": 6.014841079711914, "learning_rate": 0.00014512480433894343, "loss": 2.3436, "step": 494 }, { "epoch": 0.0970683400333366, "grad_norm": 11.881712913513184, "learning_rate": 0.00014492138409487085, "loss": 0.7869, "step": 495 }, { "epoch": 0.0972644376899696, "grad_norm": 4.720561504364014, "learning_rate": 0.00014471773070684599, "loss": 2.417, "step": 496 }, { "epoch": 0.09746053534660261, "grad_norm": 11.112417221069336, "learning_rate": 0.00014451384523183903, "loss": 2.155, "step": 497 }, { "epoch": 0.09765663300323561, "grad_norm": 5.968791961669922, "learning_rate": 0.0001443097287280244, "loss": 0.8096, "step": 498 }, { "epoch": 0.09785273065986862, "grad_norm": 7.543048858642578, "learning_rate": 0.0001441053822547757, "loss": 1.7723, "step": 499 }, { "epoch": 0.09804882831650162, "grad_norm": 7.892668724060059, "learning_rate": 0.00014390080687266013, "loss": 1.4027, "step": 500 }, { "epoch": 0.09824492597313463, "grad_norm": 7.690077304840088, "learning_rate": 0.00014369600364343285, "loss": 1.901, "step": 501 }, { "epoch": 0.09844102362976763, "grad_norm": 4.714064598083496, "learning_rate": 0.00014349097363003163, "loss": 1.6198, "step": 502 }, { "epoch": 0.09863712128640063, "grad_norm": 4.764143466949463, "learning_rate": 0.0001432857178965712, "loss": 1.3231, "step": 503 }, { "epoch": 0.09883321894303362, "grad_norm": 9.042250633239746, "learning_rate": 0.00014308023750833783, "loss": 2.1337, "step": 504 }, { "epoch": 0.09902931659966663, "grad_norm": 7.619399547576904, "learning_rate": 0.00014287453353178372, "loss": 1.1525, "step": 505 }, { "epoch": 0.09922541425629963, "grad_norm": 10.186138153076172, "learning_rate": 0.00014266860703452156, "loss": 2.5917, "step": 506 }, { "epoch": 0.09942151191293264, "grad_norm": 6.784362316131592, "learning_rate": 0.00014246245908531882, "loss": 2.1194, "step": 507 }, { "epoch": 0.09961760956956564, "grad_norm": 46.69243621826172, "learning_rate": 0.0001422560907540925, "loss": 2.0196, "step": 508 }, { "epoch": 0.09981370722619864, "grad_norm": 5.98226261138916, "learning_rate": 0.00014204950311190318, "loss": 2.131, "step": 509 }, { "epoch": 0.10000980488283165, "grad_norm": 19.974212646484375, "learning_rate": 0.00014184269723094988, "loss": 1.9812, "step": 510 }, { "epoch": 0.10020590253946465, "grad_norm": 6.612239360809326, "learning_rate": 0.00014163567418456406, "loss": 1.2688, "step": 511 }, { "epoch": 0.10040200019609766, "grad_norm": 5.024367332458496, "learning_rate": 0.0001414284350472045, "loss": 1.0559, "step": 512 }, { "epoch": 0.10059809785273066, "grad_norm": 6.517631530761719, "learning_rate": 0.00014122098089445142, "loss": 1.2611, "step": 513 }, { "epoch": 0.10079419550936367, "grad_norm": 7.715580463409424, "learning_rate": 0.0001410133128030009, "loss": 1.0419, "step": 514 }, { "epoch": 0.10099029316599667, "grad_norm": 5.8916120529174805, "learning_rate": 0.00014080543185065943, "loss": 0.938, "step": 515 }, { "epoch": 0.10118639082262967, "grad_norm": 5.587378978729248, "learning_rate": 0.0001405973391163383, "loss": 1.4988, "step": 516 }, { "epoch": 0.10138248847926268, "grad_norm": 7.234095096588135, "learning_rate": 0.0001403890356800479, "loss": 2.0476, "step": 517 }, { "epoch": 0.10157858613589568, "grad_norm": 5.781263828277588, "learning_rate": 0.00014018052262289223, "loss": 1.9736, "step": 518 }, { "epoch": 0.10177468379252867, "grad_norm": 7.789620399475098, "learning_rate": 0.0001399718010270632, "loss": 1.2162, "step": 519 }, { "epoch": 0.10197078144916168, "grad_norm": 2.692659378051758, "learning_rate": 0.00013976287197583494, "loss": 0.391, "step": 520 }, { "epoch": 0.10216687910579468, "grad_norm": 7.2949676513671875, "learning_rate": 0.0001395537365535585, "loss": 1.2454, "step": 521 }, { "epoch": 0.10236297676242768, "grad_norm": 14.400397300720215, "learning_rate": 0.00013934439584565583, "loss": 1.844, "step": 522 }, { "epoch": 0.10255907441906069, "grad_norm": 5.972201347351074, "learning_rate": 0.0001391348509386144, "loss": 1.2853, "step": 523 }, { "epoch": 0.1027551720756937, "grad_norm": 8.810315132141113, "learning_rate": 0.00013892510291998146, "loss": 2.7965, "step": 524 }, { "epoch": 0.1029512697323267, "grad_norm": 8.922815322875977, "learning_rate": 0.00013871515287835839, "loss": 1.2606, "step": 525 }, { "epoch": 0.1031473673889597, "grad_norm": 7.5427021980285645, "learning_rate": 0.00013850500190339514, "loss": 1.0504, "step": 526 }, { "epoch": 0.1033434650455927, "grad_norm": 6.646986484527588, "learning_rate": 0.00013829465108578445, "loss": 1.1522, "step": 527 }, { "epoch": 0.10353956270222571, "grad_norm": 8.387070655822754, "learning_rate": 0.0001380841015172563, "loss": 1.4298, "step": 528 }, { "epoch": 0.10373566035885871, "grad_norm": 21.383251190185547, "learning_rate": 0.0001378733542905722, "loss": 1.9162, "step": 529 }, { "epoch": 0.10393175801549172, "grad_norm": 5.776891231536865, "learning_rate": 0.00013766241049951948, "loss": 1.2837, "step": 530 }, { "epoch": 0.10412785567212472, "grad_norm": 4.5066094398498535, "learning_rate": 0.00013745127123890565, "loss": 1.5433, "step": 531 }, { "epoch": 0.10432395332875773, "grad_norm": 6.012543201446533, "learning_rate": 0.00013723993760455272, "loss": 1.3534, "step": 532 }, { "epoch": 0.10452005098539073, "grad_norm": 8.182510375976562, "learning_rate": 0.0001370284106932915, "loss": 1.5422, "step": 533 }, { "epoch": 0.10471614864202373, "grad_norm": 12.530463218688965, "learning_rate": 0.00013681669160295597, "loss": 1.651, "step": 534 }, { "epoch": 0.10491224629865673, "grad_norm": 8.226496696472168, "learning_rate": 0.00013660478143237746, "loss": 1.8992, "step": 535 }, { "epoch": 0.10510834395528973, "grad_norm": 4.896340847015381, "learning_rate": 0.00013639268128137907, "loss": 2.2539, "step": 536 }, { "epoch": 0.10530444161192273, "grad_norm": 4.573820114135742, "learning_rate": 0.00013618039225076986, "loss": 2.4075, "step": 537 }, { "epoch": 0.10550053926855574, "grad_norm": 4.920849323272705, "learning_rate": 0.0001359679154423392, "loss": 2.2481, "step": 538 }, { "epoch": 0.10569663692518874, "grad_norm": 6.206282138824463, "learning_rate": 0.00013575525195885107, "loss": 2.5488, "step": 539 }, { "epoch": 0.10589273458182175, "grad_norm": 9.469644546508789, "learning_rate": 0.0001355424029040382, "loss": 2.0894, "step": 540 }, { "epoch": 0.10608883223845475, "grad_norm": 5.231925010681152, "learning_rate": 0.00013532936938259656, "loss": 1.2463, "step": 541 }, { "epoch": 0.10628492989508775, "grad_norm": 6.779435157775879, "learning_rate": 0.0001351161525001795, "loss": 1.8802, "step": 542 }, { "epoch": 0.10648102755172076, "grad_norm": 7.8035197257995605, "learning_rate": 0.00013490275336339188, "loss": 1.1796, "step": 543 }, { "epoch": 0.10667712520835376, "grad_norm": 5.93556022644043, "learning_rate": 0.00013468917307978467, "loss": 1.1082, "step": 544 }, { "epoch": 0.10687322286498677, "grad_norm": 8.9798002243042, "learning_rate": 0.00013447541275784887, "loss": 1.6289, "step": 545 }, { "epoch": 0.10706932052161977, "grad_norm": 4.270269393920898, "learning_rate": 0.00013426147350700996, "loss": 3.2414, "step": 546 }, { "epoch": 0.10726541817825277, "grad_norm": 7.755331516265869, "learning_rate": 0.00013404735643762192, "loss": 3.5263, "step": 547 }, { "epoch": 0.10746151583488578, "grad_norm": 3.364109754562378, "learning_rate": 0.0001338330626609618, "loss": 1.1847, "step": 548 }, { "epoch": 0.10765761349151878, "grad_norm": 7.73459529876709, "learning_rate": 0.0001336185932892237, "loss": 2.0363, "step": 549 }, { "epoch": 0.10785371114815177, "grad_norm": 7.035179138183594, "learning_rate": 0.000133403949435513, "loss": 1.3317, "step": 550 }, { "epoch": 0.10804980880478478, "grad_norm": 5.30495023727417, "learning_rate": 0.00013318913221384076, "loss": 1.3776, "step": 551 }, { "epoch": 0.10824590646141778, "grad_norm": 6.3127593994140625, "learning_rate": 0.00013297414273911784, "loss": 0.9931, "step": 552 }, { "epoch": 0.10844200411805079, "grad_norm": 3.763789653778076, "learning_rate": 0.00013275898212714889, "loss": 0.9685, "step": 553 }, { "epoch": 0.10863810177468379, "grad_norm": 9.935264587402344, "learning_rate": 0.00013254365149462699, "loss": 1.1951, "step": 554 }, { "epoch": 0.1088341994313168, "grad_norm": 6.151608943939209, "learning_rate": 0.00013232815195912754, "loss": 1.9978, "step": 555 }, { "epoch": 0.1090302970879498, "grad_norm": 7.497893333435059, "learning_rate": 0.00013211248463910262, "loss": 2.1158, "step": 556 }, { "epoch": 0.1092263947445828, "grad_norm": 4.703649520874023, "learning_rate": 0.00013189665065387507, "loss": 2.9424, "step": 557 }, { "epoch": 0.1094224924012158, "grad_norm": 6.602818489074707, "learning_rate": 0.00013168065112363264, "loss": 1.9994, "step": 558 }, { "epoch": 0.10961859005784881, "grad_norm": 5.672735691070557, "learning_rate": 0.00013146448716942245, "loss": 1.6569, "step": 559 }, { "epoch": 0.10981468771448182, "grad_norm": 6.252768039703369, "learning_rate": 0.0001312481599131449, "loss": 1.0263, "step": 560 }, { "epoch": 0.11001078537111482, "grad_norm": 5.062610626220703, "learning_rate": 0.00013103167047754784, "loss": 1.8557, "step": 561 }, { "epoch": 0.11020688302774782, "grad_norm": 17.327529907226562, "learning_rate": 0.000130815019986221, "loss": 2.2288, "step": 562 }, { "epoch": 0.11040298068438083, "grad_norm": 9.787447929382324, "learning_rate": 0.00013059820956358998, "loss": 2.8692, "step": 563 }, { "epoch": 0.11059907834101383, "grad_norm": 17.163454055786133, "learning_rate": 0.00013038124033491025, "loss": 2.3984, "step": 564 }, { "epoch": 0.11079517599764682, "grad_norm": 7.947646617889404, "learning_rate": 0.00013016411342626168, "loss": 1.2144, "step": 565 }, { "epoch": 0.11099127365427983, "grad_norm": 8.151859283447266, "learning_rate": 0.00012994682996454247, "loss": 1.7198, "step": 566 }, { "epoch": 0.11118737131091283, "grad_norm": 8.358308792114258, "learning_rate": 0.00012972939107746325, "loss": 1.1448, "step": 567 }, { "epoch": 0.11138346896754583, "grad_norm": 5.084352970123291, "learning_rate": 0.0001295117978935414, "loss": 2.3229, "step": 568 }, { "epoch": 0.11157956662417884, "grad_norm": 4.846659183502197, "learning_rate": 0.0001292940515420951, "loss": 1.3941, "step": 569 }, { "epoch": 0.11177566428081184, "grad_norm": 5.432335376739502, "learning_rate": 0.0001290761531532374, "loss": 1.9803, "step": 570 }, { "epoch": 0.11197176193744485, "grad_norm": 5.953497886657715, "learning_rate": 0.00012885810385787055, "loss": 1.0619, "step": 571 }, { "epoch": 0.11216785959407785, "grad_norm": 4.770197868347168, "learning_rate": 0.00012863990478767994, "loss": 2.2999, "step": 572 }, { "epoch": 0.11236395725071086, "grad_norm": 5.719841957092285, "learning_rate": 0.00012842155707512825, "loss": 2.0115, "step": 573 }, { "epoch": 0.11256005490734386, "grad_norm": 6.13926362991333, "learning_rate": 0.00012820306185344976, "loss": 1.7, "step": 574 }, { "epoch": 0.11275615256397686, "grad_norm": 14.388799667358398, "learning_rate": 0.0001279844202566442, "loss": 1.4477, "step": 575 }, { "epoch": 0.11295225022060987, "grad_norm": 5.510779857635498, "learning_rate": 0.00012776563341947104, "loss": 1.145, "step": 576 }, { "epoch": 0.11314834787724287, "grad_norm": 7.694248199462891, "learning_rate": 0.00012754670247744354, "loss": 2.0622, "step": 577 }, { "epoch": 0.11334444553387588, "grad_norm": 10.04172420501709, "learning_rate": 0.0001273276285668229, "loss": 2.1074, "step": 578 }, { "epoch": 0.11354054319050888, "grad_norm": 4.613293170928955, "learning_rate": 0.00012710841282461238, "loss": 1.3278, "step": 579 }, { "epoch": 0.11373664084714187, "grad_norm": 7.878372669219971, "learning_rate": 0.0001268890563885512, "loss": 3.3353, "step": 580 }, { "epoch": 0.11393273850377487, "grad_norm": 4.252342224121094, "learning_rate": 0.0001266695603971089, "loss": 1.2478, "step": 581 }, { "epoch": 0.11412883616040788, "grad_norm": 13.550718307495117, "learning_rate": 0.0001264499259894793, "loss": 2.3346, "step": 582 }, { "epoch": 0.11432493381704088, "grad_norm": 5.674976348876953, "learning_rate": 0.0001262301543055746, "loss": 1.9865, "step": 583 }, { "epoch": 0.11452103147367389, "grad_norm": 14.854180335998535, "learning_rate": 0.0001260102464860195, "loss": 2.1993, "step": 584 }, { "epoch": 0.11471712913030689, "grad_norm": 5.227717399597168, "learning_rate": 0.0001257902036721452, "loss": 1.3025, "step": 585 }, { "epoch": 0.1149132267869399, "grad_norm": 9.518735885620117, "learning_rate": 0.00012557002700598353, "loss": 1.9782, "step": 586 }, { "epoch": 0.1151093244435729, "grad_norm": 15.842093467712402, "learning_rate": 0.00012534971763026104, "loss": 2.2285, "step": 587 }, { "epoch": 0.1153054221002059, "grad_norm": 7.949627876281738, "learning_rate": 0.00012512927668839304, "loss": 1.0101, "step": 588 }, { "epoch": 0.11550151975683891, "grad_norm": 10.285954475402832, "learning_rate": 0.00012490870532447774, "loss": 2.2186, "step": 589 }, { "epoch": 0.11569761741347191, "grad_norm": 13.825008392333984, "learning_rate": 0.00012468800468329013, "loss": 1.9117, "step": 590 }, { "epoch": 0.11589371507010492, "grad_norm": 4.062519073486328, "learning_rate": 0.00012446717591027624, "loss": 1.3969, "step": 591 }, { "epoch": 0.11608981272673792, "grad_norm": 13.4169282913208, "learning_rate": 0.00012424622015154703, "loss": 1.6026, "step": 592 }, { "epoch": 0.11628591038337092, "grad_norm": 5.054627895355225, "learning_rate": 0.0001240251385538726, "loss": 3.3848, "step": 593 }, { "epoch": 0.11648200804000393, "grad_norm": 5.519535541534424, "learning_rate": 0.00012380393226467615, "loss": 2.0605, "step": 594 }, { "epoch": 0.11667810569663692, "grad_norm": 8.747673034667969, "learning_rate": 0.000123582602432028, "loss": 1.945, "step": 595 }, { "epoch": 0.11687420335326992, "grad_norm": 7.695986747741699, "learning_rate": 0.0001233611502046397, "loss": 1.2818, "step": 596 }, { "epoch": 0.11707030100990293, "grad_norm": 6.775882720947266, "learning_rate": 0.000123139576731858, "loss": 1.5774, "step": 597 }, { "epoch": 0.11726639866653593, "grad_norm": 6.141412258148193, "learning_rate": 0.00012291788316365888, "loss": 2.0431, "step": 598 }, { "epoch": 0.11746249632316894, "grad_norm": 6.908384323120117, "learning_rate": 0.00012269607065064177, "loss": 2.298, "step": 599 }, { "epoch": 0.11765859397980194, "grad_norm": 6.526587963104248, "learning_rate": 0.0001224741403440233, "loss": 0.7667, "step": 600 }, { "epoch": 0.11785469163643494, "grad_norm": 6.505580902099609, "learning_rate": 0.00012225209339563145, "loss": 2.2598, "step": 601 }, { "epoch": 0.11805078929306795, "grad_norm": 5.94541072845459, "learning_rate": 0.00012202993095789966, "loss": 2.5347, "step": 602 }, { "epoch": 0.11824688694970095, "grad_norm": 4.825118064880371, "learning_rate": 0.00012180765418386068, "loss": 1.3592, "step": 603 }, { "epoch": 0.11844298460633396, "grad_norm": 10.260912895202637, "learning_rate": 0.00012158526422714076, "loss": 1.9816, "step": 604 }, { "epoch": 0.11863908226296696, "grad_norm": 4.47437047958374, "learning_rate": 0.00012136276224195348, "loss": 1.3715, "step": 605 }, { "epoch": 0.11883517991959996, "grad_norm": 5.319314002990723, "learning_rate": 0.00012114014938309393, "loss": 1.7164, "step": 606 }, { "epoch": 0.11903127757623297, "grad_norm": 9.467094421386719, "learning_rate": 0.00012091742680593254, "loss": 1.711, "step": 607 }, { "epoch": 0.11922737523286597, "grad_norm": 15.36571979522705, "learning_rate": 0.0001206945956664093, "loss": 2.1494, "step": 608 }, { "epoch": 0.11942347288949898, "grad_norm": 5.132857322692871, "learning_rate": 0.00012047165712102759, "loss": 1.0723, "step": 609 }, { "epoch": 0.11961957054613197, "grad_norm": 7.701923847198486, "learning_rate": 0.00012024861232684823, "loss": 0.6081, "step": 610 }, { "epoch": 0.11981566820276497, "grad_norm": 8.608162879943848, "learning_rate": 0.00012002546244148345, "loss": 2.7694, "step": 611 }, { "epoch": 0.12001176585939798, "grad_norm": 6.971923351287842, "learning_rate": 0.00011980220862309097, "loss": 2.4804, "step": 612 }, { "epoch": 0.12020786351603098, "grad_norm": 6.421287536621094, "learning_rate": 0.00011957885203036785, "loss": 1.1601, "step": 613 }, { "epoch": 0.12040396117266398, "grad_norm": 5.901442527770996, "learning_rate": 0.00011935539382254459, "loss": 1.1592, "step": 614 }, { "epoch": 0.12060005882929699, "grad_norm": 8.312341690063477, "learning_rate": 0.00011913183515937916, "loss": 1.4096, "step": 615 }, { "epoch": 0.12079615648592999, "grad_norm": 9.224618911743164, "learning_rate": 0.00011890817720115075, "loss": 1.8728, "step": 616 }, { "epoch": 0.120992254142563, "grad_norm": 6.559041500091553, "learning_rate": 0.00011868442110865399, "loss": 1.4327, "step": 617 }, { "epoch": 0.121188351799196, "grad_norm": 6.139137268066406, "learning_rate": 0.0001184605680431928, "loss": 2.3143, "step": 618 }, { "epoch": 0.121384449455829, "grad_norm": 7.806881427764893, "learning_rate": 0.0001182366191665744, "loss": 2.0893, "step": 619 }, { "epoch": 0.12158054711246201, "grad_norm": 5.146418571472168, "learning_rate": 0.00011801257564110329, "loss": 0.9106, "step": 620 }, { "epoch": 0.12177664476909501, "grad_norm": 10.43996810913086, "learning_rate": 0.00011778843862957514, "loss": 1.5653, "step": 621 }, { "epoch": 0.12197274242572802, "grad_norm": 9.192870140075684, "learning_rate": 0.0001175642092952709, "loss": 1.097, "step": 622 }, { "epoch": 0.12216884008236102, "grad_norm": 12.34306526184082, "learning_rate": 0.00011733988880195068, "loss": 0.8703, "step": 623 }, { "epoch": 0.12236493773899403, "grad_norm": 6.297984600067139, "learning_rate": 0.00011711547831384761, "loss": 0.8209, "step": 624 }, { "epoch": 0.12256103539562702, "grad_norm": 6.844716548919678, "learning_rate": 0.00011689097899566198, "loss": 1.2469, "step": 625 }, { "epoch": 0.12275713305226002, "grad_norm": 4.610165119171143, "learning_rate": 0.00011666639201255506, "loss": 1.6045, "step": 626 }, { "epoch": 0.12295323070889302, "grad_norm": 8.704841613769531, "learning_rate": 0.00011644171853014319, "loss": 1.5989, "step": 627 }, { "epoch": 0.12314932836552603, "grad_norm": 14.035907745361328, "learning_rate": 0.00011621695971449154, "loss": 2.3428, "step": 628 }, { "epoch": 0.12334542602215903, "grad_norm": 8.880375862121582, "learning_rate": 0.00011599211673210826, "loss": 1.6628, "step": 629 }, { "epoch": 0.12354152367879204, "grad_norm": 5.904176235198975, "learning_rate": 0.00011576719074993827, "loss": 1.8985, "step": 630 }, { "epoch": 0.12373762133542504, "grad_norm": 5.038250923156738, "learning_rate": 0.00011554218293535725, "loss": 1.8648, "step": 631 }, { "epoch": 0.12393371899205805, "grad_norm": 4.206717014312744, "learning_rate": 0.00011531709445616563, "loss": 1.7799, "step": 632 }, { "epoch": 0.12412981664869105, "grad_norm": 9.580304145812988, "learning_rate": 0.00011509192648058249, "loss": 1.484, "step": 633 }, { "epoch": 0.12432591430532405, "grad_norm": 5.086032867431641, "learning_rate": 0.00011486668017723949, "loss": 1.2848, "step": 634 }, { "epoch": 0.12452201196195706, "grad_norm": 3.946556568145752, "learning_rate": 0.00011464135671517482, "loss": 0.6885, "step": 635 }, { "epoch": 0.12471810961859006, "grad_norm": 5.43235969543457, "learning_rate": 0.0001144159572638271, "loss": 1.1324, "step": 636 }, { "epoch": 0.12491420727522307, "grad_norm": 14.442760467529297, "learning_rate": 0.00011419048299302939, "loss": 2.5022, "step": 637 }, { "epoch": 0.12511030493185607, "grad_norm": 5.514584541320801, "learning_rate": 0.00011396493507300303, "loss": 0.714, "step": 638 }, { "epoch": 0.12530640258848907, "grad_norm": 9.007026672363281, "learning_rate": 0.00011373931467435159, "loss": 1.894, "step": 639 }, { "epoch": 0.12550250024512208, "grad_norm": 4.5296196937561035, "learning_rate": 0.00011351362296805485, "loss": 1.8764, "step": 640 }, { "epoch": 0.12569859790175508, "grad_norm": 6.608786106109619, "learning_rate": 0.00011328786112546269, "loss": 1.8515, "step": 641 }, { "epoch": 0.1258946955583881, "grad_norm": 5.0649261474609375, "learning_rate": 0.00011306203031828886, "loss": 1.1955, "step": 642 }, { "epoch": 0.1260907932150211, "grad_norm": 7.887217998504639, "learning_rate": 0.00011283613171860525, "loss": 1.9566, "step": 643 }, { "epoch": 0.1262868908716541, "grad_norm": 3.997429609298706, "learning_rate": 0.00011261016649883545, "loss": 0.4142, "step": 644 }, { "epoch": 0.1264829885282871, "grad_norm": 6.192634582519531, "learning_rate": 0.0001123841358317489, "loss": 0.7429, "step": 645 }, { "epoch": 0.1266790861849201, "grad_norm": 10.252041816711426, "learning_rate": 0.00011215804089045459, "loss": 1.5821, "step": 646 }, { "epoch": 0.12687518384155308, "grad_norm": 15.608290672302246, "learning_rate": 0.00011193188284839517, "loss": 2.9049, "step": 647 }, { "epoch": 0.12707128149818608, "grad_norm": 7.4965314865112305, "learning_rate": 0.00011170566287934088, "loss": 1.7288, "step": 648 }, { "epoch": 0.1272673791548191, "grad_norm": 13.268543243408203, "learning_rate": 0.00011147938215738323, "loss": 2.6662, "step": 649 }, { "epoch": 0.1274634768114521, "grad_norm": 6.431972503662109, "learning_rate": 0.00011125304185692907, "loss": 2.416, "step": 650 }, { "epoch": 0.1276595744680851, "grad_norm": 5.82267951965332, "learning_rate": 0.0001110266431526945, "loss": 1.0295, "step": 651 }, { "epoch": 0.1278556721247181, "grad_norm": 8.612664222717285, "learning_rate": 0.00011080018721969871, "loss": 2.5151, "step": 652 }, { "epoch": 0.1280517697813511, "grad_norm": 10.103599548339844, "learning_rate": 0.00011057367523325792, "loss": 2.4646, "step": 653 }, { "epoch": 0.1282478674379841, "grad_norm": 6.955473899841309, "learning_rate": 0.00011034710836897921, "loss": 1.3256, "step": 654 }, { "epoch": 0.1284439650946171, "grad_norm": 6.604331016540527, "learning_rate": 0.00011012048780275463, "loss": 1.1124, "step": 655 }, { "epoch": 0.12864006275125012, "grad_norm": 7.2269511222839355, "learning_rate": 0.00010989381471075482, "loss": 2.1239, "step": 656 }, { "epoch": 0.12883616040788312, "grad_norm": 7.591269016265869, "learning_rate": 0.00010966709026942303, "loss": 1.9995, "step": 657 }, { "epoch": 0.12903225806451613, "grad_norm": 5.342310428619385, "learning_rate": 0.00010944031565546906, "loss": 0.8718, "step": 658 }, { "epoch": 0.12922835572114913, "grad_norm": 9.045146942138672, "learning_rate": 0.0001092134920458631, "loss": 0.9157, "step": 659 }, { "epoch": 0.12942445337778213, "grad_norm": 4.300910472869873, "learning_rate": 0.00010898662061782965, "loss": 1.0317, "step": 660 }, { "epoch": 0.12962055103441514, "grad_norm": 11.28966999053955, "learning_rate": 0.0001087597025488413, "loss": 0.7102, "step": 661 }, { "epoch": 0.12981664869104814, "grad_norm": 7.143383502960205, "learning_rate": 0.00010853273901661285, "loss": 1.4524, "step": 662 }, { "epoch": 0.13001274634768115, "grad_norm": 8.20251750946045, "learning_rate": 0.00010830573119909493, "loss": 1.825, "step": 663 }, { "epoch": 0.13020884400431415, "grad_norm": 8.748811721801758, "learning_rate": 0.00010807868027446808, "loss": 0.6214, "step": 664 }, { "epoch": 0.13040494166094715, "grad_norm": 6.7635040283203125, "learning_rate": 0.00010785158742113655, "loss": 1.158, "step": 665 }, { "epoch": 0.13060103931758016, "grad_norm": 5.804599761962891, "learning_rate": 0.00010762445381772217, "loss": 1.2651, "step": 666 }, { "epoch": 0.13079713697421316, "grad_norm": 6.046361923217773, "learning_rate": 0.00010739728064305834, "loss": 1.5022, "step": 667 }, { "epoch": 0.13099323463084617, "grad_norm": 10.987192153930664, "learning_rate": 0.00010717006907618377, "loss": 1.5359, "step": 668 }, { "epoch": 0.13118933228747917, "grad_norm": 8.694954872131348, "learning_rate": 0.00010694282029633647, "loss": 1.682, "step": 669 }, { "epoch": 0.13138542994411218, "grad_norm": 15.458917617797852, "learning_rate": 0.00010671553548294753, "loss": 1.3832, "step": 670 }, { "epoch": 0.13158152760074518, "grad_norm": 3.7156457901000977, "learning_rate": 0.00010648821581563513, "loss": 2.4507, "step": 671 }, { "epoch": 0.13177762525737818, "grad_norm": 9.7117338180542, "learning_rate": 0.00010626086247419826, "loss": 2.3141, "step": 672 }, { "epoch": 0.1319737229140112, "grad_norm": 7.740649700164795, "learning_rate": 0.00010603347663861079, "loss": 2.1845, "step": 673 }, { "epoch": 0.1321698205706442, "grad_norm": 7.079351425170898, "learning_rate": 0.00010580605948901514, "loss": 1.5912, "step": 674 }, { "epoch": 0.1323659182272772, "grad_norm": 3.896484613418579, "learning_rate": 0.00010557861220571625, "loss": 0.8481, "step": 675 }, { "epoch": 0.1325620158839102, "grad_norm": 12.182394981384277, "learning_rate": 0.00010535113596917556, "loss": 2.1278, "step": 676 }, { "epoch": 0.13275811354054318, "grad_norm": 7.2080230712890625, "learning_rate": 0.00010512363196000465, "loss": 1.8256, "step": 677 }, { "epoch": 0.13295421119717618, "grad_norm": 7.081282615661621, "learning_rate": 0.00010489610135895933, "loss": 1.5351, "step": 678 }, { "epoch": 0.13315030885380919, "grad_norm": 4.274497985839844, "learning_rate": 0.00010466854534693335, "loss": 1.3264, "step": 679 }, { "epoch": 0.1333464065104422, "grad_norm": 6.364016532897949, "learning_rate": 0.00010444096510495243, "loss": 1.751, "step": 680 }, { "epoch": 0.1335425041670752, "grad_norm": 6.791689872741699, "learning_rate": 0.00010421336181416797, "loss": 0.7944, "step": 681 }, { "epoch": 0.1337386018237082, "grad_norm": 9.867424964904785, "learning_rate": 0.00010398573665585105, "loss": 1.6282, "step": 682 }, { "epoch": 0.1339346994803412, "grad_norm": 12.289506912231445, "learning_rate": 0.0001037580908113862, "loss": 2.1015, "step": 683 }, { "epoch": 0.1341307971369742, "grad_norm": 5.21271276473999, "learning_rate": 0.00010353042546226537, "loss": 1.0861, "step": 684 }, { "epoch": 0.1343268947936072, "grad_norm": 6.344359874725342, "learning_rate": 0.00010330274179008161, "loss": 1.295, "step": 685 }, { "epoch": 0.13452299245024021, "grad_norm": 4.4853196144104, "learning_rate": 0.00010307504097652323, "loss": 1.5746, "step": 686 }, { "epoch": 0.13471909010687322, "grad_norm": 4.868480205535889, "learning_rate": 0.0001028473242033674, "loss": 0.7534, "step": 687 }, { "epoch": 0.13491518776350622, "grad_norm": 33.27044677734375, "learning_rate": 0.00010261959265247419, "loss": 3.1026, "step": 688 }, { "epoch": 0.13511128542013923, "grad_norm": 5.2553253173828125, "learning_rate": 0.0001023918475057803, "loss": 1.7087, "step": 689 }, { "epoch": 0.13530738307677223, "grad_norm": 9.47342586517334, "learning_rate": 0.00010216408994529303, "loss": 1.2846, "step": 690 }, { "epoch": 0.13550348073340523, "grad_norm": 5.372720241546631, "learning_rate": 0.00010193632115308411, "loss": 2.6163, "step": 691 }, { "epoch": 0.13569957839003824, "grad_norm": 9.89535903930664, "learning_rate": 0.00010170854231128352, "loss": 2.3616, "step": 692 }, { "epoch": 0.13589567604667124, "grad_norm": 8.19672966003418, "learning_rate": 0.00010148075460207347, "loss": 1.8404, "step": 693 }, { "epoch": 0.13609177370330425, "grad_norm": 5.280313491821289, "learning_rate": 0.0001012529592076821, "loss": 2.2078, "step": 694 }, { "epoch": 0.13628787135993725, "grad_norm": 7.142581462860107, "learning_rate": 0.00010102515731037758, "loss": 0.9421, "step": 695 }, { "epoch": 0.13648396901657026, "grad_norm": 8.142248153686523, "learning_rate": 0.00010079735009246167, "loss": 0.83, "step": 696 }, { "epoch": 0.13648396901657026, "eval_loss": 0.40602046251296997, "eval_runtime": 78.837, "eval_samples_per_second": 27.246, "eval_steps_per_second": 13.623, "step": 696 }, { "epoch": 0.13668006667320326, "grad_norm": 3.3614909648895264, "learning_rate": 0.00010056953873626384, "loss": 0.7098, "step": 697 }, { "epoch": 0.13687616432983626, "grad_norm": 6.330589294433594, "learning_rate": 0.00010034172442413501, "loss": 1.97, "step": 698 }, { "epoch": 0.13707226198646927, "grad_norm": 6.881412506103516, "learning_rate": 0.00010011390833844143, "loss": 2.2338, "step": 699 }, { "epoch": 0.13726835964310227, "grad_norm": 6.3669114112854, "learning_rate": 9.988609166155859e-05, "loss": 1.2176, "step": 700 }, { "epoch": 0.13746445729973528, "grad_norm": 4.456204414367676, "learning_rate": 9.965827557586498e-05, "loss": 2.5429, "step": 701 }, { "epoch": 0.13766055495636828, "grad_norm": 11.206175804138184, "learning_rate": 9.943046126373618e-05, "loss": 0.9134, "step": 702 }, { "epoch": 0.13785665261300128, "grad_norm": 3.9151828289031982, "learning_rate": 9.920264990753837e-05, "loss": 0.8662, "step": 703 }, { "epoch": 0.1380527502696343, "grad_norm": 9.173954963684082, "learning_rate": 9.897484268962243e-05, "loss": 1.7328, "step": 704 }, { "epoch": 0.1382488479262673, "grad_norm": 9.788105010986328, "learning_rate": 9.874704079231791e-05, "loss": 1.5085, "step": 705 }, { "epoch": 0.1384449455829003, "grad_norm": 4.705010414123535, "learning_rate": 9.851924539792656e-05, "loss": 1.7789, "step": 706 }, { "epoch": 0.1386410432395333, "grad_norm": 9.228858947753906, "learning_rate": 9.82914576887165e-05, "loss": 1.3564, "step": 707 }, { "epoch": 0.13883714089616628, "grad_norm": 13.22546100616455, "learning_rate": 9.806367884691594e-05, "loss": 1.752, "step": 708 }, { "epoch": 0.13903323855279928, "grad_norm": 8.163661003112793, "learning_rate": 9.783591005470698e-05, "loss": 1.3811, "step": 709 }, { "epoch": 0.1392293362094323, "grad_norm": 8.644879341125488, "learning_rate": 9.760815249421973e-05, "loss": 1.8551, "step": 710 }, { "epoch": 0.1394254338660653, "grad_norm": 4.906625270843506, "learning_rate": 9.738040734752582e-05, "loss": 1.3423, "step": 711 }, { "epoch": 0.1396215315226983, "grad_norm": 7.070640563964844, "learning_rate": 9.715267579663262e-05, "loss": 1.1465, "step": 712 }, { "epoch": 0.1398176291793313, "grad_norm": 4.564503192901611, "learning_rate": 9.692495902347678e-05, "loss": 1.167, "step": 713 }, { "epoch": 0.1400137268359643, "grad_norm": 6.235091686248779, "learning_rate": 9.669725820991841e-05, "loss": 1.0448, "step": 714 }, { "epoch": 0.1402098244925973, "grad_norm": 6.636250019073486, "learning_rate": 9.646957453773469e-05, "loss": 1.1572, "step": 715 }, { "epoch": 0.1404059221492303, "grad_norm": 4.873953342437744, "learning_rate": 9.62419091886138e-05, "loss": 1.6995, "step": 716 }, { "epoch": 0.14060201980586332, "grad_norm": 7.3861589431762695, "learning_rate": 9.601426334414898e-05, "loss": 1.6366, "step": 717 }, { "epoch": 0.14079811746249632, "grad_norm": 4.689702033996582, "learning_rate": 9.578663818583203e-05, "loss": 0.6136, "step": 718 }, { "epoch": 0.14099421511912932, "grad_norm": 6.917242527008057, "learning_rate": 9.555903489504761e-05, "loss": 1.985, "step": 719 }, { "epoch": 0.14119031277576233, "grad_norm": 11.42885971069336, "learning_rate": 9.533145465306667e-05, "loss": 1.2503, "step": 720 }, { "epoch": 0.14138641043239533, "grad_norm": 6.561046600341797, "learning_rate": 9.510389864104069e-05, "loss": 1.207, "step": 721 }, { "epoch": 0.14158250808902834, "grad_norm": 5.668408393859863, "learning_rate": 9.487636803999538e-05, "loss": 1.5323, "step": 722 }, { "epoch": 0.14177860574566134, "grad_norm": 5.94567346572876, "learning_rate": 9.464886403082445e-05, "loss": 1.6155, "step": 723 }, { "epoch": 0.14197470340229434, "grad_norm": 5.67940092086792, "learning_rate": 9.442138779428376e-05, "loss": 1.1803, "step": 724 }, { "epoch": 0.14217080105892735, "grad_norm": 5.852919101715088, "learning_rate": 9.419394051098489e-05, "loss": 2.2628, "step": 725 }, { "epoch": 0.14236689871556035, "grad_norm": 5.615955352783203, "learning_rate": 9.396652336138922e-05, "loss": 2.6917, "step": 726 }, { "epoch": 0.14256299637219336, "grad_norm": 14.781683921813965, "learning_rate": 9.373913752580175e-05, "loss": 1.8918, "step": 727 }, { "epoch": 0.14275909402882636, "grad_norm": 6.98385763168335, "learning_rate": 9.351178418436488e-05, "loss": 0.8915, "step": 728 }, { "epoch": 0.14295519168545937, "grad_norm": 11.97396469116211, "learning_rate": 9.328446451705249e-05, "loss": 1.3453, "step": 729 }, { "epoch": 0.14315128934209237, "grad_norm": 7.939299583435059, "learning_rate": 9.305717970366358e-05, "loss": 1.7643, "step": 730 }, { "epoch": 0.14334738699872537, "grad_norm": 6.75924015045166, "learning_rate": 9.282993092381625e-05, "loss": 0.9198, "step": 731 }, { "epoch": 0.14354348465535838, "grad_norm": 7.962716579437256, "learning_rate": 9.260271935694168e-05, "loss": 1.5201, "step": 732 }, { "epoch": 0.14373958231199138, "grad_norm": 6.077849864959717, "learning_rate": 9.237554618227785e-05, "loss": 1.1027, "step": 733 }, { "epoch": 0.14393567996862439, "grad_norm": 5.715548992156982, "learning_rate": 9.214841257886349e-05, "loss": 0.9475, "step": 734 }, { "epoch": 0.1441317776252574, "grad_norm": 8.250515937805176, "learning_rate": 9.192131972553191e-05, "loss": 1.5568, "step": 735 }, { "epoch": 0.1443278752818904, "grad_norm": 4.075748443603516, "learning_rate": 9.16942688009051e-05, "loss": 1.1728, "step": 736 }, { "epoch": 0.1445239729385234, "grad_norm": 6.287231922149658, "learning_rate": 9.146726098338719e-05, "loss": 0.9493, "step": 737 }, { "epoch": 0.14472007059515637, "grad_norm": 7.561791896820068, "learning_rate": 9.12402974511587e-05, "loss": 1.2801, "step": 738 }, { "epoch": 0.14491616825178938, "grad_norm": 8.35126781463623, "learning_rate": 9.101337938217038e-05, "loss": 1.4908, "step": 739 }, { "epoch": 0.14511226590842238, "grad_norm": 7.66392707824707, "learning_rate": 9.078650795413692e-05, "loss": 2.3328, "step": 740 }, { "epoch": 0.1453083635650554, "grad_norm": 8.209086418151855, "learning_rate": 9.055968434453097e-05, "loss": 1.1856, "step": 741 }, { "epoch": 0.1455044612216884, "grad_norm": 6.804962635040283, "learning_rate": 9.0332909730577e-05, "loss": 1.8494, "step": 742 }, { "epoch": 0.1457005588783214, "grad_norm": 8.357268333435059, "learning_rate": 9.01061852892452e-05, "loss": 2.0554, "step": 743 }, { "epoch": 0.1458966565349544, "grad_norm": 9.071808815002441, "learning_rate": 8.98795121972454e-05, "loss": 1.455, "step": 744 }, { "epoch": 0.1460927541915874, "grad_norm": 6.041928768157959, "learning_rate": 8.965289163102078e-05, "loss": 1.386, "step": 745 }, { "epoch": 0.1462888518482204, "grad_norm": 9.787776947021484, "learning_rate": 8.942632476674211e-05, "loss": 1.4516, "step": 746 }, { "epoch": 0.1464849495048534, "grad_norm": 11.039609909057617, "learning_rate": 8.919981278030133e-05, "loss": 2.5638, "step": 747 }, { "epoch": 0.14668104716148642, "grad_norm": 7.651856899261475, "learning_rate": 8.89733568473055e-05, "loss": 2.2337, "step": 748 }, { "epoch": 0.14687714481811942, "grad_norm": 5.052631855010986, "learning_rate": 8.874695814307094e-05, "loss": 2.1706, "step": 749 }, { "epoch": 0.14707324247475242, "grad_norm": 4.033751964569092, "learning_rate": 8.852061784261678e-05, "loss": 0.7492, "step": 750 }, { "epoch": 0.14726934013138543, "grad_norm": 7.171450138092041, "learning_rate": 8.829433712065914e-05, "loss": 1.1281, "step": 751 }, { "epoch": 0.14746543778801843, "grad_norm": 4.533056735992432, "learning_rate": 8.806811715160485e-05, "loss": 1.1391, "step": 752 }, { "epoch": 0.14766153544465144, "grad_norm": 6.083869934082031, "learning_rate": 8.784195910954545e-05, "loss": 1.0986, "step": 753 }, { "epoch": 0.14785763310128444, "grad_norm": 8.265074729919434, "learning_rate": 8.761586416825117e-05, "loss": 2.8251, "step": 754 }, { "epoch": 0.14805373075791745, "grad_norm": 4.621420860290527, "learning_rate": 8.738983350116454e-05, "loss": 1.3493, "step": 755 }, { "epoch": 0.14824982841455045, "grad_norm": 9.803274154663086, "learning_rate": 8.716386828139477e-05, "loss": 1.6575, "step": 756 }, { "epoch": 0.14844592607118345, "grad_norm": 8.033260345458984, "learning_rate": 8.693796968171113e-05, "loss": 1.4908, "step": 757 }, { "epoch": 0.14864202372781646, "grad_norm": 10.312782287597656, "learning_rate": 8.671213887453735e-05, "loss": 1.0033, "step": 758 }, { "epoch": 0.14883812138444946, "grad_norm": 4.6735615730285645, "learning_rate": 8.648637703194516e-05, "loss": 2.4385, "step": 759 }, { "epoch": 0.14903421904108247, "grad_norm": 7.040375709533691, "learning_rate": 8.62606853256484e-05, "loss": 1.4345, "step": 760 }, { "epoch": 0.14923031669771547, "grad_norm": 5.844077110290527, "learning_rate": 8.603506492699699e-05, "loss": 1.9578, "step": 761 }, { "epoch": 0.14942641435434847, "grad_norm": 7.8477606773376465, "learning_rate": 8.58095170069706e-05, "loss": 1.915, "step": 762 }, { "epoch": 0.14962251201098148, "grad_norm": 6.696018218994141, "learning_rate": 8.55840427361729e-05, "loss": 2.6032, "step": 763 }, { "epoch": 0.14981860966761448, "grad_norm": 6.304353713989258, "learning_rate": 8.535864328482523e-05, "loss": 1.0302, "step": 764 }, { "epoch": 0.1500147073242475, "grad_norm": 4.32615852355957, "learning_rate": 8.513331982276053e-05, "loss": 1.253, "step": 765 }, { "epoch": 0.1502108049808805, "grad_norm": 5.056833744049072, "learning_rate": 8.490807351941753e-05, "loss": 0.9107, "step": 766 }, { "epoch": 0.1504069026375135, "grad_norm": 11.254875183105469, "learning_rate": 8.468290554383436e-05, "loss": 2.1077, "step": 767 }, { "epoch": 0.15060300029414647, "grad_norm": 5.245373249053955, "learning_rate": 8.445781706464277e-05, "loss": 0.7334, "step": 768 }, { "epoch": 0.15079909795077948, "grad_norm": 6.346343994140625, "learning_rate": 8.423280925006178e-05, "loss": 2.2026, "step": 769 }, { "epoch": 0.15099519560741248, "grad_norm": 9.745604515075684, "learning_rate": 8.400788326789175e-05, "loss": 0.8139, "step": 770 }, { "epoch": 0.15119129326404548, "grad_norm": 4.820820331573486, "learning_rate": 8.378304028550848e-05, "loss": 1.2438, "step": 771 }, { "epoch": 0.1513873909206785, "grad_norm": 7.597200393676758, "learning_rate": 8.355828146985684e-05, "loss": 2.3756, "step": 772 }, { "epoch": 0.1515834885773115, "grad_norm": 4.522254943847656, "learning_rate": 8.333360798744496e-05, "loss": 1.9562, "step": 773 }, { "epoch": 0.1517795862339445, "grad_norm": 11.164361000061035, "learning_rate": 8.310902100433809e-05, "loss": 2.1888, "step": 774 }, { "epoch": 0.1519756838905775, "grad_norm": 8.457398414611816, "learning_rate": 8.288452168615242e-05, "loss": 0.7737, "step": 775 }, { "epoch": 0.1521717815472105, "grad_norm": 10.31162166595459, "learning_rate": 8.266011119804936e-05, "loss": 1.1301, "step": 776 }, { "epoch": 0.1523678792038435, "grad_norm": 6.120646953582764, "learning_rate": 8.243579070472909e-05, "loss": 0.6059, "step": 777 }, { "epoch": 0.1525639768604765, "grad_norm": 7.931275844573975, "learning_rate": 8.221156137042489e-05, "loss": 2.2832, "step": 778 }, { "epoch": 0.15276007451710952, "grad_norm": 8.665715217590332, "learning_rate": 8.198742435889674e-05, "loss": 1.5639, "step": 779 }, { "epoch": 0.15295617217374252, "grad_norm": 6.210934638977051, "learning_rate": 8.176338083342561e-05, "loss": 0.5992, "step": 780 }, { "epoch": 0.15315226983037553, "grad_norm": 4.350475311279297, "learning_rate": 8.153943195680723e-05, "loss": 1.9462, "step": 781 }, { "epoch": 0.15334836748700853, "grad_norm": 10.62283992767334, "learning_rate": 8.131557889134602e-05, "loss": 2.5713, "step": 782 }, { "epoch": 0.15354446514364153, "grad_norm": 19.049753189086914, "learning_rate": 8.109182279884928e-05, "loss": 2.0163, "step": 783 }, { "epoch": 0.15374056280027454, "grad_norm": 6.098108768463135, "learning_rate": 8.086816484062085e-05, "loss": 0.9044, "step": 784 }, { "epoch": 0.15393666045690754, "grad_norm": 4.7711567878723145, "learning_rate": 8.064460617745542e-05, "loss": 1.8375, "step": 785 }, { "epoch": 0.15413275811354055, "grad_norm": 6.954524993896484, "learning_rate": 8.042114796963219e-05, "loss": 1.4588, "step": 786 }, { "epoch": 0.15432885577017355, "grad_norm": 4.3450422286987305, "learning_rate": 8.019779137690906e-05, "loss": 1.9173, "step": 787 }, { "epoch": 0.15452495342680655, "grad_norm": 6.4524688720703125, "learning_rate": 7.997453755851658e-05, "loss": 0.9777, "step": 788 }, { "epoch": 0.15472105108343956, "grad_norm": 4.7262139320373535, "learning_rate": 7.975138767315178e-05, "loss": 2.0938, "step": 789 }, { "epoch": 0.15491714874007256, "grad_norm": 6.220945835113525, "learning_rate": 7.952834287897242e-05, "loss": 1.1835, "step": 790 }, { "epoch": 0.15511324639670557, "grad_norm": 12.306231498718262, "learning_rate": 7.930540433359071e-05, "loss": 1.2498, "step": 791 }, { "epoch": 0.15530934405333857, "grad_norm": 5.665555953979492, "learning_rate": 7.908257319406747e-05, "loss": 1.2846, "step": 792 }, { "epoch": 0.15550544170997158, "grad_norm": 5.636224269866943, "learning_rate": 7.88598506169061e-05, "loss": 1.1359, "step": 793 }, { "epoch": 0.15570153936660458, "grad_norm": 7.562717914581299, "learning_rate": 7.863723775804651e-05, "loss": 1.3149, "step": 794 }, { "epoch": 0.15589763702323758, "grad_norm": 6.797637462615967, "learning_rate": 7.841473577285925e-05, "loss": 1.2934, "step": 795 }, { "epoch": 0.1560937346798706, "grad_norm": 9.185527801513672, "learning_rate": 7.819234581613934e-05, "loss": 2.0876, "step": 796 }, { "epoch": 0.1562898323365036, "grad_norm": 5.824263095855713, "learning_rate": 7.797006904210035e-05, "loss": 1.3984, "step": 797 }, { "epoch": 0.15648592999313657, "grad_norm": 8.152481079101562, "learning_rate": 7.774790660436858e-05, "loss": 1.9248, "step": 798 }, { "epoch": 0.15668202764976957, "grad_norm": 5.90303897857666, "learning_rate": 7.752585965597673e-05, "loss": 1.3629, "step": 799 }, { "epoch": 0.15687812530640258, "grad_norm": 7.13554573059082, "learning_rate": 7.730392934935825e-05, "loss": 1.6686, "step": 800 }, { "epoch": 0.15707422296303558, "grad_norm": 14.757647514343262, "learning_rate": 7.708211683634112e-05, "loss": 3.0637, "step": 801 }, { "epoch": 0.15727032061966859, "grad_norm": 6.754796504974365, "learning_rate": 7.686042326814205e-05, "loss": 1.6339, "step": 802 }, { "epoch": 0.1574664182763016, "grad_norm": 4.844980716705322, "learning_rate": 7.663884979536035e-05, "loss": 1.799, "step": 803 }, { "epoch": 0.1576625159329346, "grad_norm": 7.784121990203857, "learning_rate": 7.641739756797202e-05, "loss": 1.4723, "step": 804 }, { "epoch": 0.1578586135895676, "grad_norm": 9.167228698730469, "learning_rate": 7.619606773532386e-05, "loss": 1.3234, "step": 805 }, { "epoch": 0.1580547112462006, "grad_norm": 4.63117790222168, "learning_rate": 7.59748614461274e-05, "loss": 1.2373, "step": 806 }, { "epoch": 0.1582508089028336, "grad_norm": 6.5301690101623535, "learning_rate": 7.5753779848453e-05, "loss": 2.5172, "step": 807 }, { "epoch": 0.1584469065594666, "grad_norm": 6.791456699371338, "learning_rate": 7.553282408972382e-05, "loss": 1.563, "step": 808 }, { "epoch": 0.15864300421609961, "grad_norm": 5.697129249572754, "learning_rate": 7.531199531670988e-05, "loss": 1.6731, "step": 809 }, { "epoch": 0.15883910187273262, "grad_norm": 6.511412143707275, "learning_rate": 7.50912946755223e-05, "loss": 1.2208, "step": 810 }, { "epoch": 0.15903519952936562, "grad_norm": 4.849149703979492, "learning_rate": 7.487072331160696e-05, "loss": 1.9081, "step": 811 }, { "epoch": 0.15923129718599863, "grad_norm": 2.844851016998291, "learning_rate": 7.465028236973897e-05, "loss": 1.2562, "step": 812 }, { "epoch": 0.15942739484263163, "grad_norm": 4.782979965209961, "learning_rate": 7.442997299401652e-05, "loss": 0.8038, "step": 813 }, { "epoch": 0.15962349249926464, "grad_norm": 13.652055740356445, "learning_rate": 7.420979632785483e-05, "loss": 1.3842, "step": 814 }, { "epoch": 0.15981959015589764, "grad_norm": 9.692163467407227, "learning_rate": 7.398975351398053e-05, "loss": 1.3089, "step": 815 }, { "epoch": 0.16001568781253064, "grad_norm": 6.529005527496338, "learning_rate": 7.37698456944254e-05, "loss": 1.8166, "step": 816 }, { "epoch": 0.16021178546916365, "grad_norm": 6.1307454109191895, "learning_rate": 7.355007401052072e-05, "loss": 1.6805, "step": 817 }, { "epoch": 0.16040788312579665, "grad_norm": 23.4924259185791, "learning_rate": 7.333043960289113e-05, "loss": 1.8122, "step": 818 }, { "epoch": 0.16060398078242966, "grad_norm": 5.680008411407471, "learning_rate": 7.311094361144881e-05, "loss": 1.7381, "step": 819 }, { "epoch": 0.16080007843906266, "grad_norm": 6.790538787841797, "learning_rate": 7.289158717538765e-05, "loss": 1.9142, "step": 820 }, { "epoch": 0.16099617609569566, "grad_norm": 7.620028495788574, "learning_rate": 7.267237143317706e-05, "loss": 1.8411, "step": 821 }, { "epoch": 0.16119227375232867, "grad_norm": 5.989513397216797, "learning_rate": 7.245329752255647e-05, "loss": 1.7845, "step": 822 }, { "epoch": 0.16138837140896167, "grad_norm": 5.91652774810791, "learning_rate": 7.223436658052898e-05, "loss": 1.7267, "step": 823 }, { "epoch": 0.16158446906559468, "grad_norm": 6.713134765625, "learning_rate": 7.201557974335583e-05, "loss": 1.6799, "step": 824 }, { "epoch": 0.16178056672222768, "grad_norm": 5.243997097015381, "learning_rate": 7.179693814655026e-05, "loss": 1.7857, "step": 825 }, { "epoch": 0.16197666437886069, "grad_norm": 4.856655120849609, "learning_rate": 7.157844292487173e-05, "loss": 1.0967, "step": 826 }, { "epoch": 0.1621727620354937, "grad_norm": 4.94920015335083, "learning_rate": 7.13600952123201e-05, "loss": 1.4571, "step": 827 }, { "epoch": 0.16236885969212667, "grad_norm": 5.146086692810059, "learning_rate": 7.114189614212944e-05, "loss": 0.6922, "step": 828 }, { "epoch": 0.16256495734875967, "grad_norm": 8.812291145324707, "learning_rate": 7.092384684676262e-05, "loss": 1.2531, "step": 829 }, { "epoch": 0.16276105500539267, "grad_norm": 6.924501419067383, "learning_rate": 7.070594845790497e-05, "loss": 1.3011, "step": 830 }, { "epoch": 0.16295715266202568, "grad_norm": 10.452120780944824, "learning_rate": 7.048820210645862e-05, "loss": 1.1114, "step": 831 }, { "epoch": 0.16315325031865868, "grad_norm": 8.212396621704102, "learning_rate": 7.027060892253679e-05, "loss": 0.9207, "step": 832 }, { "epoch": 0.1633493479752917, "grad_norm": 17.70586585998535, "learning_rate": 7.005317003545754e-05, "loss": 1.3019, "step": 833 }, { "epoch": 0.1635454456319247, "grad_norm": 12.174421310424805, "learning_rate": 6.983588657373833e-05, "loss": 2.1275, "step": 834 }, { "epoch": 0.1637415432885577, "grad_norm": 3.9426016807556152, "learning_rate": 6.96187596650898e-05, "loss": 2.4098, "step": 835 }, { "epoch": 0.1639376409451907, "grad_norm": 6.121764183044434, "learning_rate": 6.940179043641005e-05, "loss": 1.0717, "step": 836 }, { "epoch": 0.1641337386018237, "grad_norm": 4.13683557510376, "learning_rate": 6.918498001377901e-05, "loss": 1.4934, "step": 837 }, { "epoch": 0.1643298362584567, "grad_norm": 4.907169818878174, "learning_rate": 6.896832952245217e-05, "loss": 1.9002, "step": 838 }, { "epoch": 0.1645259339150897, "grad_norm": 8.611984252929688, "learning_rate": 6.875184008685514e-05, "loss": 1.1805, "step": 839 }, { "epoch": 0.16472203157172272, "grad_norm": 8.626840591430664, "learning_rate": 6.853551283057757e-05, "loss": 1.1437, "step": 840 }, { "epoch": 0.16491812922835572, "grad_norm": 6.1718878746032715, "learning_rate": 6.831934887636738e-05, "loss": 1.5465, "step": 841 }, { "epoch": 0.16511422688498872, "grad_norm": 6.6393961906433105, "learning_rate": 6.8103349346125e-05, "loss": 1.0682, "step": 842 }, { "epoch": 0.16531032454162173, "grad_norm": 5.233375072479248, "learning_rate": 6.788751536089739e-05, "loss": 1.8218, "step": 843 }, { "epoch": 0.16550642219825473, "grad_norm": 9.505565643310547, "learning_rate": 6.767184804087248e-05, "loss": 1.3462, "step": 844 }, { "epoch": 0.16570251985488774, "grad_norm": 5.554503440856934, "learning_rate": 6.745634850537302e-05, "loss": 0.7193, "step": 845 }, { "epoch": 0.16589861751152074, "grad_norm": 6.603649139404297, "learning_rate": 6.724101787285114e-05, "loss": 1.3164, "step": 846 }, { "epoch": 0.16609471516815374, "grad_norm": 10.690621376037598, "learning_rate": 6.702585726088222e-05, "loss": 3.3145, "step": 847 }, { "epoch": 0.16629081282478675, "grad_norm": 17.521020889282227, "learning_rate": 6.681086778615922e-05, "loss": 2.3221, "step": 848 }, { "epoch": 0.16648691048141975, "grad_norm": 11.0389986038208, "learning_rate": 6.659605056448702e-05, "loss": 1.2903, "step": 849 }, { "epoch": 0.16668300813805276, "grad_norm": 5.6815900802612305, "learning_rate": 6.638140671077633e-05, "loss": 1.1651, "step": 850 }, { "epoch": 0.16687910579468576, "grad_norm": 14.61434555053711, "learning_rate": 6.616693733903823e-05, "loss": 2.5921, "step": 851 }, { "epoch": 0.16707520345131877, "grad_norm": 6.983891487121582, "learning_rate": 6.595264356237812e-05, "loss": 1.2114, "step": 852 }, { "epoch": 0.16727130110795177, "grad_norm": 4.762997627258301, "learning_rate": 6.573852649299009e-05, "loss": 0.8655, "step": 853 }, { "epoch": 0.16746739876458477, "grad_norm": 8.16657829284668, "learning_rate": 6.552458724215114e-05, "loss": 1.584, "step": 854 }, { "epoch": 0.16766349642121778, "grad_norm": 3.753600597381592, "learning_rate": 6.531082692021532e-05, "loss": 0.7213, "step": 855 }, { "epoch": 0.16785959407785078, "grad_norm": 20.525659561157227, "learning_rate": 6.509724663660813e-05, "loss": 3.2134, "step": 856 }, { "epoch": 0.1680556917344838, "grad_norm": 4.803929805755615, "learning_rate": 6.488384749982053e-05, "loss": 1.0283, "step": 857 }, { "epoch": 0.16825178939111676, "grad_norm": 4.031068325042725, "learning_rate": 6.467063061740345e-05, "loss": 0.86, "step": 858 }, { "epoch": 0.16844788704774977, "grad_norm": 8.304176330566406, "learning_rate": 6.445759709596182e-05, "loss": 0.7242, "step": 859 }, { "epoch": 0.16864398470438277, "grad_norm": 5.160337448120117, "learning_rate": 6.424474804114895e-05, "loss": 1.7592, "step": 860 }, { "epoch": 0.16884008236101578, "grad_norm": 6.138307571411133, "learning_rate": 6.403208455766081e-05, "loss": 2.6926, "step": 861 }, { "epoch": 0.16903618001764878, "grad_norm": 3.878741979598999, "learning_rate": 6.381960774923017e-05, "loss": 1.2294, "step": 862 }, { "epoch": 0.16923227767428178, "grad_norm": 4.720643043518066, "learning_rate": 6.360731871862093e-05, "loss": 1.7562, "step": 863 }, { "epoch": 0.1694283753309148, "grad_norm": 4.653330326080322, "learning_rate": 6.339521856762254e-05, "loss": 1.2522, "step": 864 }, { "epoch": 0.1696244729875478, "grad_norm": 6.003101348876953, "learning_rate": 6.318330839704405e-05, "loss": 1.4006, "step": 865 }, { "epoch": 0.1698205706441808, "grad_norm": 5.075412273406982, "learning_rate": 6.297158930670852e-05, "loss": 0.7885, "step": 866 }, { "epoch": 0.1700166683008138, "grad_norm": 4.062684059143066, "learning_rate": 6.276006239544729e-05, "loss": 1.0201, "step": 867 }, { "epoch": 0.1702127659574468, "grad_norm": 5.678539752960205, "learning_rate": 6.254872876109438e-05, "loss": 1.1822, "step": 868 }, { "epoch": 0.1704088636140798, "grad_norm": 4.655857563018799, "learning_rate": 6.233758950048056e-05, "loss": 1.4193, "step": 869 }, { "epoch": 0.1706049612707128, "grad_norm": 18.205698013305664, "learning_rate": 6.21266457094278e-05, "loss": 0.7067, "step": 870 }, { "epoch": 0.17080105892734582, "grad_norm": 8.73768424987793, "learning_rate": 6.191589848274368e-05, "loss": 1.9048, "step": 871 }, { "epoch": 0.17099715658397882, "grad_norm": 5.723482131958008, "learning_rate": 6.170534891421556e-05, "loss": 0.8121, "step": 872 }, { "epoch": 0.17119325424061183, "grad_norm": 6.16123104095459, "learning_rate": 6.149499809660488e-05, "loss": 2.5358, "step": 873 }, { "epoch": 0.17138935189724483, "grad_norm": 6.115005016326904, "learning_rate": 6.128484712164164e-05, "loss": 1.6489, "step": 874 }, { "epoch": 0.17158544955387783, "grad_norm": 9.149282455444336, "learning_rate": 6.107489708001855e-05, "loss": 1.7977, "step": 875 }, { "epoch": 0.17178154721051084, "grad_norm": 7.190993309020996, "learning_rate": 6.086514906138563e-05, "loss": 1.3526, "step": 876 }, { "epoch": 0.17197764486714384, "grad_norm": 8.292972564697266, "learning_rate": 6.065560415434417e-05, "loss": 1.8477, "step": 877 }, { "epoch": 0.17217374252377685, "grad_norm": 8.858931541442871, "learning_rate": 6.044626344644151e-05, "loss": 1.3605, "step": 878 }, { "epoch": 0.17236984018040985, "grad_norm": 6.186944961547852, "learning_rate": 6.023712802416508e-05, "loss": 1.0484, "step": 879 }, { "epoch": 0.17256593783704285, "grad_norm": 9.702549934387207, "learning_rate": 6.0028198972936836e-05, "loss": 1.3224, "step": 880 }, { "epoch": 0.17276203549367586, "grad_norm": 12.195408821105957, "learning_rate": 5.981947737710779e-05, "loss": 1.8586, "step": 881 }, { "epoch": 0.17295813315030886, "grad_norm": 6.8144025802612305, "learning_rate": 5.9610964319952104e-05, "loss": 1.5967, "step": 882 }, { "epoch": 0.17315423080694187, "grad_norm": 10.37683391571045, "learning_rate": 5.940266088366173e-05, "loss": 1.7278, "step": 883 }, { "epoch": 0.17335032846357487, "grad_norm": 5.85037899017334, "learning_rate": 5.919456814934061e-05, "loss": 1.3165, "step": 884 }, { "epoch": 0.17354642612020788, "grad_norm": 7.210351943969727, "learning_rate": 5.8986687196999135e-05, "loss": 1.1156, "step": 885 }, { "epoch": 0.17374252377684088, "grad_norm": 5.840338706970215, "learning_rate": 5.877901910554862e-05, "loss": 1.4444, "step": 886 }, { "epoch": 0.17393862143347388, "grad_norm": 3.731435775756836, "learning_rate": 5.8571564952795475e-05, "loss": 0.9172, "step": 887 }, { "epoch": 0.17413471909010686, "grad_norm": 13.559743881225586, "learning_rate": 5.8364325815435916e-05, "loss": 2.1312, "step": 888 }, { "epoch": 0.17433081674673986, "grad_norm": 5.5318193435668945, "learning_rate": 5.815730276905014e-05, "loss": 1.735, "step": 889 }, { "epoch": 0.17452691440337287, "grad_norm": 7.40202522277832, "learning_rate": 5.7950496888096795e-05, "loss": 1.42, "step": 890 }, { "epoch": 0.17472301206000587, "grad_norm": 3.6923038959503174, "learning_rate": 5.7743909245907535e-05, "loss": 1.095, "step": 891 }, { "epoch": 0.17491910971663888, "grad_norm": 5.6912713050842285, "learning_rate": 5.753754091468115e-05, "loss": 2.2399, "step": 892 }, { "epoch": 0.17511520737327188, "grad_norm": 3.8538315296173096, "learning_rate": 5.7331392965478493e-05, "loss": 1.2043, "step": 893 }, { "epoch": 0.17531130502990488, "grad_norm": 5.220914840698242, "learning_rate": 5.712546646821627e-05, "loss": 1.0669, "step": 894 }, { "epoch": 0.1755074026865379, "grad_norm": 5.886853218078613, "learning_rate": 5.6919762491662164e-05, "loss": 1.3425, "step": 895 }, { "epoch": 0.1757035003431709, "grad_norm": 7.820699214935303, "learning_rate": 5.671428210342884e-05, "loss": 1.6338, "step": 896 }, { "epoch": 0.1758995979998039, "grad_norm": 9.36957836151123, "learning_rate": 5.650902636996837e-05, "loss": 1.1894, "step": 897 }, { "epoch": 0.1760956956564369, "grad_norm": 3.9427287578582764, "learning_rate": 5.6303996356567177e-05, "loss": 1.179, "step": 898 }, { "epoch": 0.1762917933130699, "grad_norm": 8.343499183654785, "learning_rate": 5.6099193127339864e-05, "loss": 1.9635, "step": 899 }, { "epoch": 0.1764878909697029, "grad_norm": 7.9742279052734375, "learning_rate": 5.589461774522433e-05, "loss": 1.6162, "step": 900 }, { "epoch": 0.1766839886263359, "grad_norm": 4.933746337890625, "learning_rate": 5.5690271271975644e-05, "loss": 1.0064, "step": 901 }, { "epoch": 0.17688008628296892, "grad_norm": 6.986325740814209, "learning_rate": 5.548615476816097e-05, "loss": 0.7052, "step": 902 }, { "epoch": 0.17707618393960192, "grad_norm": 5.324771881103516, "learning_rate": 5.528226929315401e-05, "loss": 1.5658, "step": 903 }, { "epoch": 0.17727228159623493, "grad_norm": 3.9526147842407227, "learning_rate": 5.507861590512916e-05, "loss": 1.0183, "step": 904 }, { "epoch": 0.17746837925286793, "grad_norm": 8.242249488830566, "learning_rate": 5.48751956610566e-05, "loss": 1.9991, "step": 905 }, { "epoch": 0.17766447690950093, "grad_norm": 6.519176483154297, "learning_rate": 5.467200961669619e-05, "loss": 1.2312, "step": 906 }, { "epoch": 0.17786057456613394, "grad_norm": 3.49365496635437, "learning_rate": 5.446905882659243e-05, "loss": 0.5029, "step": 907 }, { "epoch": 0.17805667222276694, "grad_norm": 5.808468818664551, "learning_rate": 5.426634434406883e-05, "loss": 1.1932, "step": 908 }, { "epoch": 0.17825276987939995, "grad_norm": 6.1657328605651855, "learning_rate": 5.40638672212224e-05, "loss": 1.0127, "step": 909 }, { "epoch": 0.17844886753603295, "grad_norm": 8.090532302856445, "learning_rate": 5.3861628508918384e-05, "loss": 1.0638, "step": 910 }, { "epoch": 0.17864496519266596, "grad_norm": 6.026912689208984, "learning_rate": 5.3659629256784424e-05, "loss": 1.1598, "step": 911 }, { "epoch": 0.17884106284929896, "grad_norm": 7.103152751922607, "learning_rate": 5.345787051320564e-05, "loss": 1.9001, "step": 912 }, { "epoch": 0.17903716050593196, "grad_norm": 4.5661163330078125, "learning_rate": 5.325635332531864e-05, "loss": 1.2711, "step": 913 }, { "epoch": 0.17923325816256497, "grad_norm": 6.035799980163574, "learning_rate": 5.305507873900649e-05, "loss": 2.3649, "step": 914 }, { "epoch": 0.17942935581919797, "grad_norm": 6.231631278991699, "learning_rate": 5.2854047798893125e-05, "loss": 2.7058, "step": 915 }, { "epoch": 0.17962545347583098, "grad_norm": 9.146753311157227, "learning_rate": 5.26532615483379e-05, "loss": 1.1388, "step": 916 }, { "epoch": 0.17982155113246398, "grad_norm": 8.76220417022705, "learning_rate": 5.245272102943034e-05, "loss": 1.0848, "step": 917 }, { "epoch": 0.18001764878909696, "grad_norm": 6.711489200592041, "learning_rate": 5.225242728298445e-05, "loss": 1.0747, "step": 918 }, { "epoch": 0.18021374644572996, "grad_norm": 4.66898775100708, "learning_rate": 5.2052381348533564e-05, "loss": 0.6995, "step": 919 }, { "epoch": 0.18040984410236297, "grad_norm": 6.504391193389893, "learning_rate": 5.1852584264324866e-05, "loss": 1.5352, "step": 920 }, { "epoch": 0.18060594175899597, "grad_norm": 6.724686145782471, "learning_rate": 5.165303706731397e-05, "loss": 1.2562, "step": 921 }, { "epoch": 0.18080203941562897, "grad_norm": 7.192296981811523, "learning_rate": 5.1453740793159586e-05, "loss": 1.0158, "step": 922 }, { "epoch": 0.18099813707226198, "grad_norm": 5.173651218414307, "learning_rate": 5.12546964762181e-05, "loss": 1.3406, "step": 923 }, { "epoch": 0.18119423472889498, "grad_norm": 8.315585136413574, "learning_rate": 5.105590514953824e-05, "loss": 2.0057, "step": 924 }, { "epoch": 0.18139033238552799, "grad_norm": 4.940390110015869, "learning_rate": 5.085736784485571e-05, "loss": 1.6468, "step": 925 }, { "epoch": 0.181586430042161, "grad_norm": 6.491610527038574, "learning_rate": 5.065908559258782e-05, "loss": 2.6567, "step": 926 }, { "epoch": 0.181782527698794, "grad_norm": 8.9893798828125, "learning_rate": 5.046105942182815e-05, "loss": 1.2412, "step": 927 }, { "epoch": 0.181978625355427, "grad_norm": 8.103008270263672, "learning_rate": 5.026329036034119e-05, "loss": 2.1319, "step": 928 }, { "epoch": 0.18217472301206, "grad_norm": 8.022109985351562, "learning_rate": 5.006577943455706e-05, "loss": 1.4119, "step": 929 }, { "epoch": 0.182370820668693, "grad_norm": 12.814486503601074, "learning_rate": 4.9868527669566113e-05, "loss": 1.9388, "step": 930 }, { "epoch": 0.182566918325326, "grad_norm": 7.568852424621582, "learning_rate": 4.967153608911366e-05, "loss": 1.2114, "step": 931 }, { "epoch": 0.18276301598195901, "grad_norm": 3.7613790035247803, "learning_rate": 4.947480571559462e-05, "loss": 1.2221, "step": 932 }, { "epoch": 0.18295911363859202, "grad_norm": 8.301228523254395, "learning_rate": 4.927833757004826e-05, "loss": 1.2867, "step": 933 }, { "epoch": 0.18315521129522502, "grad_norm": 5.2592244148254395, "learning_rate": 4.908213267215287e-05, "loss": 2.0937, "step": 934 }, { "epoch": 0.18335130895185803, "grad_norm": 6.635796546936035, "learning_rate": 4.888619204022047e-05, "loss": 1.9856, "step": 935 }, { "epoch": 0.18354740660849103, "grad_norm": 5.507410049438477, "learning_rate": 4.869051669119154e-05, "loss": 1.2305, "step": 936 }, { "epoch": 0.18374350426512404, "grad_norm": 8.553701400756836, "learning_rate": 4.8495107640629675e-05, "loss": 1.6872, "step": 937 }, { "epoch": 0.18393960192175704, "grad_norm": 7.652828693389893, "learning_rate": 4.829996590271646e-05, "loss": 1.4061, "step": 938 }, { "epoch": 0.18413569957839004, "grad_norm": 6.595338821411133, "learning_rate": 4.810509249024604e-05, "loss": 1.2953, "step": 939 }, { "epoch": 0.18433179723502305, "grad_norm": 4.507275581359863, "learning_rate": 4.7910488414619967e-05, "loss": 0.95, "step": 940 }, { "epoch": 0.18452789489165605, "grad_norm": 7.35803747177124, "learning_rate": 4.7716154685841944e-05, "loss": 1.7958, "step": 941 }, { "epoch": 0.18472399254828906, "grad_norm": 5.53951358795166, "learning_rate": 4.752209231251251e-05, "loss": 2.8191, "step": 942 }, { "epoch": 0.18492009020492206, "grad_norm": 4.738945960998535, "learning_rate": 4.73283023018239e-05, "loss": 1.7728, "step": 943 }, { "epoch": 0.18511618786155506, "grad_norm": 5.192052364349365, "learning_rate": 4.713478565955478e-05, "loss": 0.8087, "step": 944 }, { "epoch": 0.18531228551818807, "grad_norm": 5.9963507652282715, "learning_rate": 4.694154339006501e-05, "loss": 1.1223, "step": 945 }, { "epoch": 0.18550838317482107, "grad_norm": 14.023632049560547, "learning_rate": 4.6748576496290356e-05, "loss": 3.0858, "step": 946 }, { "epoch": 0.18570448083145408, "grad_norm": 6.268606662750244, "learning_rate": 4.655588597973754e-05, "loss": 0.8988, "step": 947 }, { "epoch": 0.18590057848808705, "grad_norm": 7.719700336456299, "learning_rate": 4.636347284047877e-05, "loss": 1.5117, "step": 948 }, { "epoch": 0.18609667614472006, "grad_norm": 4.05161714553833, "learning_rate": 4.617133807714666e-05, "loss": 0.7625, "step": 949 }, { "epoch": 0.18629277380135306, "grad_norm": 5.103604793548584, "learning_rate": 4.59794826869291e-05, "loss": 0.8914, "step": 950 }, { "epoch": 0.18648887145798607, "grad_norm": 7.7601847648620605, "learning_rate": 4.578790766556386e-05, "loss": 0.8747, "step": 951 }, { "epoch": 0.18668496911461907, "grad_norm": 12.439096450805664, "learning_rate": 4.559661400733383e-05, "loss": 2.2439, "step": 952 }, { "epoch": 0.18688106677125207, "grad_norm": 7.002286911010742, "learning_rate": 4.5405602705061345e-05, "loss": 1.4917, "step": 953 }, { "epoch": 0.18707716442788508, "grad_norm": 5.178001880645752, "learning_rate": 4.521487475010354e-05, "loss": 1.5737, "step": 954 }, { "epoch": 0.18727326208451808, "grad_norm": 9.555988311767578, "learning_rate": 4.502443113234688e-05, "loss": 1.7495, "step": 955 }, { "epoch": 0.1874693597411511, "grad_norm": 8.907003402709961, "learning_rate": 4.483427284020194e-05, "loss": 1.6053, "step": 956 }, { "epoch": 0.1876654573977841, "grad_norm": 5.562190532684326, "learning_rate": 4.464440086059878e-05, "loss": 1.1854, "step": 957 }, { "epoch": 0.1878615550544171, "grad_norm": 6.078771591186523, "learning_rate": 4.4454816178981115e-05, "loss": 1.3774, "step": 958 }, { "epoch": 0.1880576527110501, "grad_norm": 12.021577835083008, "learning_rate": 4.426551977930191e-05, "loss": 1.6445, "step": 959 }, { "epoch": 0.1882537503676831, "grad_norm": 7.406763553619385, "learning_rate": 4.407651264401763e-05, "loss": 2.0784, "step": 960 }, { "epoch": 0.1884498480243161, "grad_norm": 3.961423635482788, "learning_rate": 4.38877957540837e-05, "loss": 1.5526, "step": 961 }, { "epoch": 0.1886459456809491, "grad_norm": 7.410959243774414, "learning_rate": 4.3699370088949066e-05, "loss": 2.0247, "step": 962 }, { "epoch": 0.18884204333758212, "grad_norm": 5.237401485443115, "learning_rate": 4.3511236626551047e-05, "loss": 2.199, "step": 963 }, { "epoch": 0.18903814099421512, "grad_norm": 4.068509578704834, "learning_rate": 4.3323396343310715e-05, "loss": 2.3318, "step": 964 }, { "epoch": 0.18923423865084812, "grad_norm": 5.523316383361816, "learning_rate": 4.313585021412724e-05, "loss": 1.3474, "step": 965 }, { "epoch": 0.18943033630748113, "grad_norm": 5.052401542663574, "learning_rate": 4.294859921237339e-05, "loss": 1.1039, "step": 966 }, { "epoch": 0.18962643396411413, "grad_norm": 8.468581199645996, "learning_rate": 4.2761644309889946e-05, "loss": 2.9943, "step": 967 }, { "epoch": 0.18982253162074714, "grad_norm": 4.66163444519043, "learning_rate": 4.257498647698107e-05, "loss": 2.0152, "step": 968 }, { "epoch": 0.19001862927738014, "grad_norm": 4.58842658996582, "learning_rate": 4.2388626682409194e-05, "loss": 1.2759, "step": 969 }, { "epoch": 0.19021472693401315, "grad_norm": 3.5642545223236084, "learning_rate": 4.220256589338968e-05, "loss": 0.9877, "step": 970 }, { "epoch": 0.19041082459064615, "grad_norm": 6.506111145019531, "learning_rate": 4.201680507558631e-05, "loss": 1.7171, "step": 971 }, { "epoch": 0.19060692224727915, "grad_norm": 4.93696928024292, "learning_rate": 4.183134519310576e-05, "loss": 0.8798, "step": 972 }, { "epoch": 0.19080301990391216, "grad_norm": 12.539702415466309, "learning_rate": 4.1646187208493005e-05, "loss": 3.0454, "step": 973 }, { "epoch": 0.19099911756054516, "grad_norm": 9.189221382141113, "learning_rate": 4.146133208272608e-05, "loss": 1.9269, "step": 974 }, { "epoch": 0.19119521521717817, "grad_norm": 5.362178802490234, "learning_rate": 4.1276780775211156e-05, "loss": 1.7086, "step": 975 }, { "epoch": 0.19139131287381117, "grad_norm": 4.587559700012207, "learning_rate": 4.109253424377772e-05, "loss": 0.7529, "step": 976 }, { "epoch": 0.19158741053044417, "grad_norm": 7.922691345214844, "learning_rate": 4.090859344467325e-05, "loss": 2.6641, "step": 977 }, { "epoch": 0.19178350818707715, "grad_norm": 6.759201526641846, "learning_rate": 4.072495933255857e-05, "loss": 1.3197, "step": 978 }, { "epoch": 0.19197960584371015, "grad_norm": 5.635506629943848, "learning_rate": 4.054163286050276e-05, "loss": 0.8712, "step": 979 }, { "epoch": 0.19217570350034316, "grad_norm": 4.1558637619018555, "learning_rate": 4.035861497997828e-05, "loss": 2.3436, "step": 980 }, { "epoch": 0.19237180115697616, "grad_norm": 9.821369171142578, "learning_rate": 4.017590664085593e-05, "loss": 1.6202, "step": 981 }, { "epoch": 0.19256789881360917, "grad_norm": 7.816912651062012, "learning_rate": 3.999350879139997e-05, "loss": 1.1312, "step": 982 }, { "epoch": 0.19276399647024217, "grad_norm": 5.585190296173096, "learning_rate": 3.981142237826332e-05, "loss": 1.9218, "step": 983 }, { "epoch": 0.19296009412687518, "grad_norm": 4.785935878753662, "learning_rate": 3.962964834648236e-05, "loss": 0.8693, "step": 984 }, { "epoch": 0.19315619178350818, "grad_norm": 5.3974385261535645, "learning_rate": 3.944818763947231e-05, "loss": 1.1065, "step": 985 }, { "epoch": 0.19335228944014118, "grad_norm": 9.581192016601562, "learning_rate": 3.926704119902219e-05, "loss": 0.867, "step": 986 }, { "epoch": 0.1935483870967742, "grad_norm": 10.229225158691406, "learning_rate": 3.9086209965289965e-05, "loss": 2.8435, "step": 987 }, { "epoch": 0.1937444847534072, "grad_norm": 7.257898807525635, "learning_rate": 3.890569487679766e-05, "loss": 2.4263, "step": 988 }, { "epoch": 0.1939405824100402, "grad_norm": 5.026792526245117, "learning_rate": 3.87254968704265e-05, "loss": 1.2831, "step": 989 }, { "epoch": 0.1941366800666732, "grad_norm": 5.230473041534424, "learning_rate": 3.854561688141205e-05, "loss": 3.3858, "step": 990 }, { "epoch": 0.1943327777233062, "grad_norm": 6.340061187744141, "learning_rate": 3.836605584333931e-05, "loss": 1.5421, "step": 991 }, { "epoch": 0.1945288753799392, "grad_norm": 7.890949726104736, "learning_rate": 3.818681468813794e-05, "loss": 1.7882, "step": 992 }, { "epoch": 0.1947249730365722, "grad_norm": 4.434299945831299, "learning_rate": 3.800789434607741e-05, "loss": 1.274, "step": 993 }, { "epoch": 0.19492107069320522, "grad_norm": 5.311686038970947, "learning_rate": 3.782929574576213e-05, "loss": 1.7605, "step": 994 }, { "epoch": 0.19511716834983822, "grad_norm": 22.297441482543945, "learning_rate": 3.7651019814126654e-05, "loss": 1.9094, "step": 995 }, { "epoch": 0.19531326600647123, "grad_norm": 4.7996826171875, "learning_rate": 3.747306747643089e-05, "loss": 1.7196, "step": 996 }, { "epoch": 0.19550936366310423, "grad_norm": 12.06876277923584, "learning_rate": 3.729543965625526e-05, "loss": 1.2351, "step": 997 }, { "epoch": 0.19570546131973723, "grad_norm": 5.477703094482422, "learning_rate": 3.711813727549594e-05, "loss": 1.3366, "step": 998 }, { "epoch": 0.19590155897637024, "grad_norm": 4.249327659606934, "learning_rate": 3.694116125436007e-05, "loss": 1.9066, "step": 999 }, { "epoch": 0.19609765663300324, "grad_norm": 7.70191764831543, "learning_rate": 3.6764512511360935e-05, "loss": 1.3611, "step": 1000 }, { "epoch": 0.19629375428963625, "grad_norm": 5.822709560394287, "learning_rate": 3.658819196331327e-05, "loss": 2.3203, "step": 1001 }, { "epoch": 0.19648985194626925, "grad_norm": 4.531265735626221, "learning_rate": 3.6412200525328435e-05, "loss": 0.7644, "step": 1002 }, { "epoch": 0.19668594960290225, "grad_norm": 4.714295387268066, "learning_rate": 3.623653911080971e-05, "loss": 1.6555, "step": 1003 }, { "epoch": 0.19688204725953526, "grad_norm": 7.089095592498779, "learning_rate": 3.606120863144753e-05, "loss": 1.6242, "step": 1004 }, { "epoch": 0.19707814491616826, "grad_norm": 2.268071174621582, "learning_rate": 3.588620999721477e-05, "loss": 0.3747, "step": 1005 }, { "epoch": 0.19727424257280127, "grad_norm": 9.244414329528809, "learning_rate": 3.571154411636203e-05, "loss": 1.8298, "step": 1006 }, { "epoch": 0.19747034022943427, "grad_norm": 4.8095808029174805, "learning_rate": 3.5537211895412846e-05, "loss": 0.8923, "step": 1007 }, { "epoch": 0.19766643788606725, "grad_norm": 7.700345516204834, "learning_rate": 3.536321423915913e-05, "loss": 0.691, "step": 1008 }, { "epoch": 0.19786253554270025, "grad_norm": 7.553009510040283, "learning_rate": 3.518955205065632e-05, "loss": 1.1634, "step": 1009 }, { "epoch": 0.19805863319933326, "grad_norm": 5.901485443115234, "learning_rate": 3.5016226231218774e-05, "loss": 2.3853, "step": 1010 }, { "epoch": 0.19825473085596626, "grad_norm": 5.342306137084961, "learning_rate": 3.4843237680415156e-05, "loss": 0.9747, "step": 1011 }, { "epoch": 0.19845082851259926, "grad_norm": 5.793070316314697, "learning_rate": 3.46705872960635e-05, "loss": 0.7877, "step": 1012 }, { "epoch": 0.19864692616923227, "grad_norm": 9.105026245117188, "learning_rate": 3.449827597422698e-05, "loss": 1.5736, "step": 1013 }, { "epoch": 0.19884302382586527, "grad_norm": 7.359753608703613, "learning_rate": 3.432630460920887e-05, "loss": 1.4213, "step": 1014 }, { "epoch": 0.19903912148249828, "grad_norm": 4.728425025939941, "learning_rate": 3.415467409354809e-05, "loss": 0.7005, "step": 1015 }, { "epoch": 0.19923521913913128, "grad_norm": 4.326775074005127, "learning_rate": 3.398338531801457e-05, "loss": 1.0779, "step": 1016 }, { "epoch": 0.19943131679576429, "grad_norm": 5.717733383178711, "learning_rate": 3.381243917160448e-05, "loss": 2.7467, "step": 1017 }, { "epoch": 0.1996274144523973, "grad_norm": 12.189995765686035, "learning_rate": 3.364183654153592e-05, "loss": 1.4596, "step": 1018 }, { "epoch": 0.1998235121090303, "grad_norm": 5.9200239181518555, "learning_rate": 3.3471578313243903e-05, "loss": 1.8764, "step": 1019 }, { "epoch": 0.2000196097656633, "grad_norm": 11.550288200378418, "learning_rate": 3.330166537037618e-05, "loss": 1.3655, "step": 1020 }, { "epoch": 0.2002157074222963, "grad_norm": 4.954752445220947, "learning_rate": 3.313209859478839e-05, "loss": 1.2426, "step": 1021 }, { "epoch": 0.2004118050789293, "grad_norm": 11.455079078674316, "learning_rate": 3.296287886653941e-05, "loss": 2.2854, "step": 1022 }, { "epoch": 0.2006079027355623, "grad_norm": 6.088212966918945, "learning_rate": 3.2794007063887186e-05, "loss": 1.3675, "step": 1023 }, { "epoch": 0.20080400039219531, "grad_norm": 7.098971366882324, "learning_rate": 3.262548406328365e-05, "loss": 1.0983, "step": 1024 }, { "epoch": 0.20100009804882832, "grad_norm": 3.8312485218048096, "learning_rate": 3.245731073937068e-05, "loss": 1.7243, "step": 1025 }, { "epoch": 0.20119619570546132, "grad_norm": 3.807969570159912, "learning_rate": 3.2289487964975076e-05, "loss": 1.0291, "step": 1026 }, { "epoch": 0.20139229336209433, "grad_norm": 4.957508087158203, "learning_rate": 3.212201661110449e-05, "loss": 1.1555, "step": 1027 }, { "epoch": 0.20158839101872733, "grad_norm": 10.736882209777832, "learning_rate": 3.1954897546942584e-05, "loss": 1.9095, "step": 1028 }, { "epoch": 0.20178448867536034, "grad_norm": 9.901065826416016, "learning_rate": 3.1788131639844534e-05, "loss": 2.2914, "step": 1029 }, { "epoch": 0.20198058633199334, "grad_norm": 5.642101764678955, "learning_rate": 3.162171975533282e-05, "loss": 1.3295, "step": 1030 }, { "epoch": 0.20217668398862634, "grad_norm": 6.677104949951172, "learning_rate": 3.1455662757092306e-05, "loss": 1.3554, "step": 1031 }, { "epoch": 0.20237278164525935, "grad_norm": 5.27210807800293, "learning_rate": 3.1289961506966214e-05, "loss": 1.0698, "step": 1032 }, { "epoch": 0.20256887930189235, "grad_norm": 6.362321853637695, "learning_rate": 3.11246168649512e-05, "loss": 1.135, "step": 1033 }, { "epoch": 0.20276497695852536, "grad_norm": 4.513217449188232, "learning_rate": 3.095962968919319e-05, "loss": 1.9138, "step": 1034 }, { "epoch": 0.20296107461515836, "grad_norm": 16.53858184814453, "learning_rate": 3.079500083598297e-05, "loss": 2.4364, "step": 1035 }, { "epoch": 0.20315717227179136, "grad_norm": 5.629922389984131, "learning_rate": 3.063073115975136e-05, "loss": 1.7899, "step": 1036 }, { "epoch": 0.20335326992842437, "grad_norm": 9.979622840881348, "learning_rate": 3.0466821513065314e-05, "loss": 1.7134, "step": 1037 }, { "epoch": 0.20354936758505734, "grad_norm": 8.400696754455566, "learning_rate": 3.030327274662298e-05, "loss": 1.6557, "step": 1038 }, { "epoch": 0.20374546524169035, "grad_norm": 9.457781791687012, "learning_rate": 3.0140085709249667e-05, "loss": 2.3906, "step": 1039 }, { "epoch": 0.20394156289832335, "grad_norm": 4.818408489227295, "learning_rate": 2.997726124789324e-05, "loss": 1.0997, "step": 1040 }, { "epoch": 0.20413766055495636, "grad_norm": 7.07374382019043, "learning_rate": 2.9814800207619774e-05, "loss": 1.441, "step": 1041 }, { "epoch": 0.20433375821158936, "grad_norm": 9.976203918457031, "learning_rate": 2.9652703431609263e-05, "loss": 2.2031, "step": 1042 }, { "epoch": 0.20452985586822237, "grad_norm": 8.051301002502441, "learning_rate": 2.9490971761151e-05, "loss": 1.2837, "step": 1043 }, { "epoch": 0.20472595352485537, "grad_norm": 3.8618202209472656, "learning_rate": 2.9329606035639458e-05, "loss": 1.5356, "step": 1044 }, { "epoch": 0.20472595352485537, "eval_loss": 0.3870697021484375, "eval_runtime": 78.8825, "eval_samples_per_second": 27.23, "eval_steps_per_second": 13.615, "step": 1044 }, { "epoch": 0.20492205118148837, "grad_norm": 4.138796329498291, "learning_rate": 2.9168607092569845e-05, "loss": 0.6768, "step": 1045 }, { "epoch": 0.20511814883812138, "grad_norm": 4.2951250076293945, "learning_rate": 2.9007975767533714e-05, "loss": 1.0667, "step": 1046 }, { "epoch": 0.20531424649475438, "grad_norm": 6.767472743988037, "learning_rate": 2.8847712894214686e-05, "loss": 2.2965, "step": 1047 }, { "epoch": 0.2055103441513874, "grad_norm": 5.017336845397949, "learning_rate": 2.8687819304384066e-05, "loss": 1.457, "step": 1048 }, { "epoch": 0.2057064418080204, "grad_norm": 5.76704740524292, "learning_rate": 2.852829582789669e-05, "loss": 1.3418, "step": 1049 }, { "epoch": 0.2059025394646534, "grad_norm": 5.123453617095947, "learning_rate": 2.8369143292686306e-05, "loss": 1.0328, "step": 1050 }, { "epoch": 0.2060986371212864, "grad_norm": 6.851526260375977, "learning_rate": 2.821036252476156e-05, "loss": 2.0026, "step": 1051 }, { "epoch": 0.2062947347779194, "grad_norm": 5.268173694610596, "learning_rate": 2.8051954348201613e-05, "loss": 0.69, "step": 1052 }, { "epoch": 0.2064908324345524, "grad_norm": 6.25408935546875, "learning_rate": 2.789391958515183e-05, "loss": 1.2385, "step": 1053 }, { "epoch": 0.2066869300911854, "grad_norm": 4.305760383605957, "learning_rate": 2.7736259055819568e-05, "loss": 0.747, "step": 1054 }, { "epoch": 0.20688302774781842, "grad_norm": 9.248473167419434, "learning_rate": 2.757897357846988e-05, "loss": 1.4235, "step": 1055 }, { "epoch": 0.20707912540445142, "grad_norm": 4.8141984939575195, "learning_rate": 2.7422063969421285e-05, "loss": 0.5879, "step": 1056 }, { "epoch": 0.20727522306108442, "grad_norm": 4.95810604095459, "learning_rate": 2.7265531043041535e-05, "loss": 0.8899, "step": 1057 }, { "epoch": 0.20747132071771743, "grad_norm": 8.220407485961914, "learning_rate": 2.710937561174337e-05, "loss": 1.5904, "step": 1058 }, { "epoch": 0.20766741837435043, "grad_norm": 5.406398296356201, "learning_rate": 2.6953598485980336e-05, "loss": 2.2579, "step": 1059 }, { "epoch": 0.20786351603098344, "grad_norm": 7.9236273765563965, "learning_rate": 2.679820047424253e-05, "loss": 1.6289, "step": 1060 }, { "epoch": 0.20805961368761644, "grad_norm": 11.548074722290039, "learning_rate": 2.6643182383052446e-05, "loss": 1.3829, "step": 1061 }, { "epoch": 0.20825571134424944, "grad_norm": 6.566219329833984, "learning_rate": 2.6488545016960776e-05, "loss": 1.5911, "step": 1062 }, { "epoch": 0.20845180900088245, "grad_norm": 6.300790786743164, "learning_rate": 2.6334289178542226e-05, "loss": 0.8637, "step": 1063 }, { "epoch": 0.20864790665751545, "grad_norm": 5.563969135284424, "learning_rate": 2.6180415668391356e-05, "loss": 1.388, "step": 1064 }, { "epoch": 0.20884400431414846, "grad_norm": 6.617629528045654, "learning_rate": 2.602692528511843e-05, "loss": 1.1707, "step": 1065 }, { "epoch": 0.20904010197078146, "grad_norm": 6.338384628295898, "learning_rate": 2.5873818825345254e-05, "loss": 1.3016, "step": 1066 }, { "epoch": 0.20923619962741447, "grad_norm": 5.3434271812438965, "learning_rate": 2.5721097083701084e-05, "loss": 1.5446, "step": 1067 }, { "epoch": 0.20943229728404747, "grad_norm": 28.035884857177734, "learning_rate": 2.556876085281843e-05, "loss": 1.168, "step": 1068 }, { "epoch": 0.20962839494068045, "grad_norm": 14.401495933532715, "learning_rate": 2.5416810923329028e-05, "loss": 1.1242, "step": 1069 }, { "epoch": 0.20982449259731345, "grad_norm": 6.766130447387695, "learning_rate": 2.5265248083859648e-05, "loss": 1.4922, "step": 1070 }, { "epoch": 0.21002059025394645, "grad_norm": 5.8070292472839355, "learning_rate": 2.5114073121028093e-05, "loss": 1.7635, "step": 1071 }, { "epoch": 0.21021668791057946, "grad_norm": 5.450509548187256, "learning_rate": 2.4963286819439037e-05, "loss": 0.7839, "step": 1072 }, { "epoch": 0.21041278556721246, "grad_norm": 4.60319185256958, "learning_rate": 2.4812889961679986e-05, "loss": 1.8491, "step": 1073 }, { "epoch": 0.21060888322384547, "grad_norm": 3.9338133335113525, "learning_rate": 2.4662883328317222e-05, "loss": 0.9618, "step": 1074 }, { "epoch": 0.21080498088047847, "grad_norm": 7.811015605926514, "learning_rate": 2.451326769789176e-05, "loss": 1.5204, "step": 1075 }, { "epoch": 0.21100107853711147, "grad_norm": 6.122691631317139, "learning_rate": 2.4364043846915274e-05, "loss": 1.0331, "step": 1076 }, { "epoch": 0.21119717619374448, "grad_norm": 8.609393119812012, "learning_rate": 2.4215212549866116e-05, "loss": 1.9155, "step": 1077 }, { "epoch": 0.21139327385037748, "grad_norm": 7.292634963989258, "learning_rate": 2.4066774579185158e-05, "loss": 1.2971, "step": 1078 }, { "epoch": 0.2115893715070105, "grad_norm": 5.975107192993164, "learning_rate": 2.3918730705272064e-05, "loss": 1.4156, "step": 1079 }, { "epoch": 0.2117854691636435, "grad_norm": 14.805880546569824, "learning_rate": 2.377108169648098e-05, "loss": 1.0787, "step": 1080 }, { "epoch": 0.2119815668202765, "grad_norm": 5.720880031585693, "learning_rate": 2.3623828319116748e-05, "loss": 1.3329, "step": 1081 }, { "epoch": 0.2121776644769095, "grad_norm": 7.515237331390381, "learning_rate": 2.3476971337430875e-05, "loss": 1.7147, "step": 1082 }, { "epoch": 0.2123737621335425, "grad_norm": 6.027990341186523, "learning_rate": 2.3330511513617448e-05, "loss": 1.9171, "step": 1083 }, { "epoch": 0.2125698597901755, "grad_norm": 4.526289463043213, "learning_rate": 2.318444960780949e-05, "loss": 0.6467, "step": 1084 }, { "epoch": 0.2127659574468085, "grad_norm": 9.184370040893555, "learning_rate": 2.3038786378074574e-05, "loss": 1.6125, "step": 1085 }, { "epoch": 0.21296205510344152, "grad_norm": 7.263781547546387, "learning_rate": 2.289352258041133e-05, "loss": 1.6612, "step": 1086 }, { "epoch": 0.21315815276007452, "grad_norm": 9.886373519897461, "learning_rate": 2.274865896874523e-05, "loss": 2.4533, "step": 1087 }, { "epoch": 0.21335425041670752, "grad_norm": 9.56613540649414, "learning_rate": 2.2604196294924694e-05, "loss": 2.0031, "step": 1088 }, { "epoch": 0.21355034807334053, "grad_norm": 4.8680925369262695, "learning_rate": 2.2460135308717445e-05, "loss": 1.2974, "step": 1089 }, { "epoch": 0.21374644572997353, "grad_norm": 4.032529830932617, "learning_rate": 2.231647675780619e-05, "loss": 0.7491, "step": 1090 }, { "epoch": 0.21394254338660654, "grad_norm": 5.186695098876953, "learning_rate": 2.2173221387785216e-05, "loss": 1.2152, "step": 1091 }, { "epoch": 0.21413864104323954, "grad_norm": 5.538721561431885, "learning_rate": 2.2030369942156072e-05, "loss": 1.6134, "step": 1092 }, { "epoch": 0.21433473869987255, "grad_norm": 8.213644027709961, "learning_rate": 2.1887923162324097e-05, "loss": 1.5288, "step": 1093 }, { "epoch": 0.21453083635650555, "grad_norm": 8.110743522644043, "learning_rate": 2.1745881787594334e-05, "loss": 0.8424, "step": 1094 }, { "epoch": 0.21472693401313855, "grad_norm": 6.066911697387695, "learning_rate": 2.1604246555167638e-05, "loss": 1.9787, "step": 1095 }, { "epoch": 0.21492303166977156, "grad_norm": 3.873046636581421, "learning_rate": 2.1463018200137196e-05, "loss": 1.7871, "step": 1096 }, { "epoch": 0.21511912932640456, "grad_norm": 7.206579208374023, "learning_rate": 2.1322197455484248e-05, "loss": 1.3871, "step": 1097 }, { "epoch": 0.21531522698303757, "grad_norm": 7.201257705688477, "learning_rate": 2.1181785052074756e-05, "loss": 1.2177, "step": 1098 }, { "epoch": 0.21551132463967054, "grad_norm": 5.7325239181518555, "learning_rate": 2.104178171865513e-05, "loss": 1.0547, "step": 1099 }, { "epoch": 0.21570742229630355, "grad_norm": 4.294105052947998, "learning_rate": 2.0902188181848838e-05, "loss": 1.9177, "step": 1100 }, { "epoch": 0.21590351995293655, "grad_norm": 4.344897270202637, "learning_rate": 2.0763005166152517e-05, "loss": 1.2101, "step": 1101 }, { "epoch": 0.21609961760956956, "grad_norm": 10.920205116271973, "learning_rate": 2.0624233393932024e-05, "loss": 1.0783, "step": 1102 }, { "epoch": 0.21629571526620256, "grad_norm": 6.155653953552246, "learning_rate": 2.0485873585419035e-05, "loss": 2.6903, "step": 1103 }, { "epoch": 0.21649181292283556, "grad_norm": 7.211439609527588, "learning_rate": 2.0347926458706945e-05, "loss": 1.8343, "step": 1104 }, { "epoch": 0.21668791057946857, "grad_norm": 8.261204719543457, "learning_rate": 2.021039272974742e-05, "loss": 2.3539, "step": 1105 }, { "epoch": 0.21688400823610157, "grad_norm": 5.709137916564941, "learning_rate": 2.0073273112346526e-05, "loss": 1.0312, "step": 1106 }, { "epoch": 0.21708010589273458, "grad_norm": 6.048385143280029, "learning_rate": 1.9936568318161076e-05, "loss": 1.086, "step": 1107 }, { "epoch": 0.21727620354936758, "grad_norm": 4.2964677810668945, "learning_rate": 1.9800279056695005e-05, "loss": 0.7691, "step": 1108 }, { "epoch": 0.21747230120600058, "grad_norm": 4.8451972007751465, "learning_rate": 1.966440603529549e-05, "loss": 1.4864, "step": 1109 }, { "epoch": 0.2176683988626336, "grad_norm": 5.61348295211792, "learning_rate": 1.952894995914949e-05, "loss": 1.5675, "step": 1110 }, { "epoch": 0.2178644965192666, "grad_norm": 6.666314601898193, "learning_rate": 1.9393911531279974e-05, "loss": 1.3273, "step": 1111 }, { "epoch": 0.2180605941758996, "grad_norm": 5.22393798828125, "learning_rate": 1.9259291452542293e-05, "loss": 1.4175, "step": 1112 }, { "epoch": 0.2182566918325326, "grad_norm": 7.283446311950684, "learning_rate": 1.9125090421620574e-05, "loss": 2.5381, "step": 1113 }, { "epoch": 0.2184527894891656, "grad_norm": 8.553428649902344, "learning_rate": 1.8991309135024004e-05, "loss": 2.0767, "step": 1114 }, { "epoch": 0.2186488871457986, "grad_norm": 6.280401706695557, "learning_rate": 1.8857948287083416e-05, "loss": 0.9069, "step": 1115 }, { "epoch": 0.2188449848024316, "grad_norm": 14.387880325317383, "learning_rate": 1.8725008569947365e-05, "loss": 1.6501, "step": 1116 }, { "epoch": 0.21904108245906462, "grad_norm": 8.282727241516113, "learning_rate": 1.8592490673578843e-05, "loss": 1.645, "step": 1117 }, { "epoch": 0.21923718011569762, "grad_norm": 7.460145950317383, "learning_rate": 1.8460395285751542e-05, "loss": 2.3436, "step": 1118 }, { "epoch": 0.21943327777233063, "grad_norm": 4.364814758300781, "learning_rate": 1.8328723092046317e-05, "loss": 1.1313, "step": 1119 }, { "epoch": 0.21962937542896363, "grad_norm": 16.233600616455078, "learning_rate": 1.8197474775847613e-05, "loss": 2.5969, "step": 1120 }, { "epoch": 0.21982547308559663, "grad_norm": 7.081477165222168, "learning_rate": 1.806665101833994e-05, "loss": 2.4986, "step": 1121 }, { "epoch": 0.22002157074222964, "grad_norm": 4.212589740753174, "learning_rate": 1.7936252498504356e-05, "loss": 1.3044, "step": 1122 }, { "epoch": 0.22021766839886264, "grad_norm": 6.28343391418457, "learning_rate": 1.7806279893114875e-05, "loss": 1.7792, "step": 1123 }, { "epoch": 0.22041376605549565, "grad_norm": 6.392016410827637, "learning_rate": 1.7676733876735018e-05, "loss": 1.6957, "step": 1124 }, { "epoch": 0.22060986371212865, "grad_norm": 8.44856071472168, "learning_rate": 1.754761512171429e-05, "loss": 2.5809, "step": 1125 }, { "epoch": 0.22080596136876166, "grad_norm": 6.063395977020264, "learning_rate": 1.741892429818468e-05, "loss": 1.1537, "step": 1126 }, { "epoch": 0.22100205902539466, "grad_norm": 22.53998374938965, "learning_rate": 1.729066207405722e-05, "loss": 2.6686, "step": 1127 }, { "epoch": 0.22119815668202766, "grad_norm": 4.210419178009033, "learning_rate": 1.7162829115018452e-05, "loss": 1.2635, "step": 1128 }, { "epoch": 0.22139425433866064, "grad_norm": 9.418660163879395, "learning_rate": 1.7035426084527062e-05, "loss": 2.5926, "step": 1129 }, { "epoch": 0.22159035199529364, "grad_norm": 8.299819946289062, "learning_rate": 1.690845364381034e-05, "loss": 1.5026, "step": 1130 }, { "epoch": 0.22178644965192665, "grad_norm": 7.302555084228516, "learning_rate": 1.6781912451860827e-05, "loss": 1.5762, "step": 1131 }, { "epoch": 0.22198254730855965, "grad_norm": 8.947975158691406, "learning_rate": 1.665580316543286e-05, "loss": 0.9516, "step": 1132 }, { "epoch": 0.22217864496519266, "grad_norm": 7.167786121368408, "learning_rate": 1.653012643903915e-05, "loss": 2.3246, "step": 1133 }, { "epoch": 0.22237474262182566, "grad_norm": 8.101836204528809, "learning_rate": 1.640488292494743e-05, "loss": 1.5492, "step": 1134 }, { "epoch": 0.22257084027845866, "grad_norm": 7.395213603973389, "learning_rate": 1.628007327317701e-05, "loss": 1.7188, "step": 1135 }, { "epoch": 0.22276693793509167, "grad_norm": 3.9345545768737793, "learning_rate": 1.6155698131495454e-05, "loss": 0.7864, "step": 1136 }, { "epoch": 0.22296303559172467, "grad_norm": 6.147872447967529, "learning_rate": 1.603175814541522e-05, "loss": 1.7044, "step": 1137 }, { "epoch": 0.22315913324835768, "grad_norm": 5.625443458557129, "learning_rate": 1.5908253958190256e-05, "loss": 2.0135, "step": 1138 }, { "epoch": 0.22335523090499068, "grad_norm": 8.224568367004395, "learning_rate": 1.5785186210812698e-05, "loss": 1.4209, "step": 1139 }, { "epoch": 0.22355132856162369, "grad_norm": 5.80567741394043, "learning_rate": 1.566255554200955e-05, "loss": 0.8693, "step": 1140 }, { "epoch": 0.2237474262182567, "grad_norm": 7.716653347015381, "learning_rate": 1.5540362588239364e-05, "loss": 2.0515, "step": 1141 }, { "epoch": 0.2239435238748897, "grad_norm": 5.845829963684082, "learning_rate": 1.5418607983688927e-05, "loss": 1.3831, "step": 1142 }, { "epoch": 0.2241396215315227, "grad_norm": 5.728200435638428, "learning_rate": 1.529729236026999e-05, "loss": 1.6435, "step": 1143 }, { "epoch": 0.2243357191881557, "grad_norm": 9.004356384277344, "learning_rate": 1.5176416347615885e-05, "loss": 2.4094, "step": 1144 }, { "epoch": 0.2245318168447887, "grad_norm": 7.60123872756958, "learning_rate": 1.5055980573078487e-05, "loss": 1.6575, "step": 1145 }, { "epoch": 0.2247279145014217, "grad_norm": 6.572809219360352, "learning_rate": 1.4935985661724727e-05, "loss": 1.5638, "step": 1146 }, { "epoch": 0.22492401215805471, "grad_norm": 5.095000267028809, "learning_rate": 1.4816432236333444e-05, "loss": 1.6261, "step": 1147 }, { "epoch": 0.22512010981468772, "grad_norm": 5.581939220428467, "learning_rate": 1.4697320917392188e-05, "loss": 1.0154, "step": 1148 }, { "epoch": 0.22531620747132072, "grad_norm": 7.922572135925293, "learning_rate": 1.4578652323093855e-05, "loss": 3.1376, "step": 1149 }, { "epoch": 0.22551230512795373, "grad_norm": 8.814900398254395, "learning_rate": 1.4460427069333726e-05, "loss": 1.8381, "step": 1150 }, { "epoch": 0.22570840278458673, "grad_norm": 8.666311264038086, "learning_rate": 1.4342645769705977e-05, "loss": 0.7864, "step": 1151 }, { "epoch": 0.22590450044121974, "grad_norm": 4.692161560058594, "learning_rate": 1.4225309035500778e-05, "loss": 1.2963, "step": 1152 }, { "epoch": 0.22610059809785274, "grad_norm": 4.93988561630249, "learning_rate": 1.4108417475700908e-05, "loss": 0.8273, "step": 1153 }, { "epoch": 0.22629669575448574, "grad_norm": 5.998587131500244, "learning_rate": 1.3991971696978645e-05, "loss": 2.5023, "step": 1154 }, { "epoch": 0.22649279341111875, "grad_norm": 7.898712635040283, "learning_rate": 1.3875972303692752e-05, "loss": 1.7302, "step": 1155 }, { "epoch": 0.22668889106775175, "grad_norm": 4.3550190925598145, "learning_rate": 1.376041989788508e-05, "loss": 0.6245, "step": 1156 }, { "epoch": 0.22688498872438476, "grad_norm": 9.550539016723633, "learning_rate": 1.3645315079277765e-05, "loss": 1.7358, "step": 1157 }, { "epoch": 0.22708108638101776, "grad_norm": 7.5146164894104, "learning_rate": 1.3530658445269783e-05, "loss": 1.5159, "step": 1158 }, { "epoch": 0.22727718403765074, "grad_norm": 8.449292182922363, "learning_rate": 1.341645059093415e-05, "loss": 2.5644, "step": 1159 }, { "epoch": 0.22747328169428374, "grad_norm": 7.365999698638916, "learning_rate": 1.3302692109014625e-05, "loss": 1.3292, "step": 1160 }, { "epoch": 0.22766937935091675, "grad_norm": 4.936117172241211, "learning_rate": 1.3189383589922665e-05, "loss": 0.636, "step": 1161 }, { "epoch": 0.22786547700754975, "grad_norm": 22.70414161682129, "learning_rate": 1.3076525621734526e-05, "loss": 1.5574, "step": 1162 }, { "epoch": 0.22806157466418275, "grad_norm": 6.079709053039551, "learning_rate": 1.2964118790187929e-05, "loss": 1.8188, "step": 1163 }, { "epoch": 0.22825767232081576, "grad_norm": 6.864228248596191, "learning_rate": 1.2852163678679341e-05, "loss": 1.2751, "step": 1164 }, { "epoch": 0.22845376997744876, "grad_norm": 3.384364366531372, "learning_rate": 1.2740660868260633e-05, "loss": 1.3381, "step": 1165 }, { "epoch": 0.22864986763408177, "grad_norm": 4.77728796005249, "learning_rate": 1.2629610937636283e-05, "loss": 1.9661, "step": 1166 }, { "epoch": 0.22884596529071477, "grad_norm": 4.837019443511963, "learning_rate": 1.251901446316035e-05, "loss": 1.7805, "step": 1167 }, { "epoch": 0.22904206294734777, "grad_norm": 4.927963733673096, "learning_rate": 1.2408872018833296e-05, "loss": 0.8884, "step": 1168 }, { "epoch": 0.22923816060398078, "grad_norm": 7.3830647468566895, "learning_rate": 1.2299184176299339e-05, "loss": 1.1489, "step": 1169 }, { "epoch": 0.22943425826061378, "grad_norm": 4.659049987792969, "learning_rate": 1.2189951504843112e-05, "loss": 0.6723, "step": 1170 }, { "epoch": 0.2296303559172468, "grad_norm": 10.427599906921387, "learning_rate": 1.2081174571386989e-05, "loss": 1.1733, "step": 1171 }, { "epoch": 0.2298264535738798, "grad_norm": 13.781340599060059, "learning_rate": 1.1972853940488015e-05, "loss": 2.0666, "step": 1172 }, { "epoch": 0.2300225512305128, "grad_norm": 4.749293804168701, "learning_rate": 1.1864990174335012e-05, "loss": 1.657, "step": 1173 }, { "epoch": 0.2302186488871458, "grad_norm": 5.438106536865234, "learning_rate": 1.17575838327457e-05, "loss": 2.2866, "step": 1174 }, { "epoch": 0.2304147465437788, "grad_norm": 5.253787517547607, "learning_rate": 1.165063547316363e-05, "loss": 0.7286, "step": 1175 }, { "epoch": 0.2306108442004118, "grad_norm": 5.084885120391846, "learning_rate": 1.1544145650655514e-05, "loss": 1.5842, "step": 1176 }, { "epoch": 0.2308069418570448, "grad_norm": 6.295192718505859, "learning_rate": 1.1438114917908193e-05, "loss": 1.4375, "step": 1177 }, { "epoch": 0.23100303951367782, "grad_norm": 7.397315502166748, "learning_rate": 1.1332543825225806e-05, "loss": 1.5273, "step": 1178 }, { "epoch": 0.23119913717031082, "grad_norm": 4.620631217956543, "learning_rate": 1.122743292052697e-05, "loss": 2.3138, "step": 1179 }, { "epoch": 0.23139523482694382, "grad_norm": 6.895712852478027, "learning_rate": 1.1122782749341843e-05, "loss": 2.7047, "step": 1180 }, { "epoch": 0.23159133248357683, "grad_norm": 5.031332969665527, "learning_rate": 1.1018593854809478e-05, "loss": 1.3471, "step": 1181 }, { "epoch": 0.23178743014020983, "grad_norm": 5.385929584503174, "learning_rate": 1.0914866777674737e-05, "loss": 1.7638, "step": 1182 }, { "epoch": 0.23198352779684284, "grad_norm": 6.311648368835449, "learning_rate": 1.081160205628572e-05, "loss": 1.4686, "step": 1183 }, { "epoch": 0.23217962545347584, "grad_norm": 7.770483016967773, "learning_rate": 1.0708800226590854e-05, "loss": 0.9346, "step": 1184 }, { "epoch": 0.23237572311010884, "grad_norm": 7.618231773376465, "learning_rate": 1.0606461822136137e-05, "loss": 1.0662, "step": 1185 }, { "epoch": 0.23257182076674185, "grad_norm": 8.606494903564453, "learning_rate": 1.0504587374062391e-05, "loss": 1.5704, "step": 1186 }, { "epoch": 0.23276791842337485, "grad_norm": 25.36232566833496, "learning_rate": 1.0403177411102438e-05, "loss": 2.207, "step": 1187 }, { "epoch": 0.23296401608000786, "grad_norm": 8.098196983337402, "learning_rate": 1.0302232459578454e-05, "loss": 2.2944, "step": 1188 }, { "epoch": 0.23316011373664083, "grad_norm": 6.22314977645874, "learning_rate": 1.0201753043399143e-05, "loss": 1.62, "step": 1189 }, { "epoch": 0.23335621139327384, "grad_norm": 9.491933822631836, "learning_rate": 1.0101739684057098e-05, "loss": 1.392, "step": 1190 }, { "epoch": 0.23355230904990684, "grad_norm": 6.178860187530518, "learning_rate": 1.0002192900626028e-05, "loss": 1.6095, "step": 1191 }, { "epoch": 0.23374840670653985, "grad_norm": 5.492360591888428, "learning_rate": 9.903113209758096e-06, "loss": 2.4164, "step": 1192 }, { "epoch": 0.23394450436317285, "grad_norm": 7.380622386932373, "learning_rate": 9.804501125681243e-06, "loss": 1.8485, "step": 1193 }, { "epoch": 0.23414060201980585, "grad_norm": 6.217007637023926, "learning_rate": 9.70635716019651e-06, "loss": 1.5716, "step": 1194 }, { "epoch": 0.23433669967643886, "grad_norm": 3.790039539337158, "learning_rate": 9.608681822675381e-06, "loss": 1.5046, "step": 1195 }, { "epoch": 0.23453279733307186, "grad_norm": 4.153253555297852, "learning_rate": 9.51147562005713e-06, "loss": 0.7896, "step": 1196 }, { "epoch": 0.23472889498970487, "grad_norm": 5.173237323760986, "learning_rate": 9.414739056846222e-06, "loss": 1.1483, "step": 1197 }, { "epoch": 0.23492499264633787, "grad_norm": 6.67634916305542, "learning_rate": 9.318472635109653e-06, "loss": 1.0958, "step": 1198 }, { "epoch": 0.23512109030297088, "grad_norm": 5.527042388916016, "learning_rate": 9.222676854474365e-06, "loss": 2.349, "step": 1199 }, { "epoch": 0.23531718795960388, "grad_norm": 6.437062740325928, "learning_rate": 9.127352212124662e-06, "loss": 2.2577, "step": 1200 }, { "epoch": 0.23551328561623688, "grad_norm": 6.333834171295166, "learning_rate": 9.032499202799628e-06, "loss": 1.5634, "step": 1201 }, { "epoch": 0.2357093832728699, "grad_norm": 8.251330375671387, "learning_rate": 8.938118318790522e-06, "loss": 1.6758, "step": 1202 }, { "epoch": 0.2359054809295029, "grad_norm": 11.161762237548828, "learning_rate": 8.844210049938262e-06, "loss": 0.9043, "step": 1203 }, { "epoch": 0.2361015785861359, "grad_norm": 5.800900936126709, "learning_rate": 8.750774883630908e-06, "loss": 2.1929, "step": 1204 }, { "epoch": 0.2362976762427689, "grad_norm": 4.731690883636475, "learning_rate": 8.657813304801043e-06, "loss": 0.5826, "step": 1205 }, { "epoch": 0.2364937738994019, "grad_norm": 4.208858966827393, "learning_rate": 8.565325795923341e-06, "loss": 0.9173, "step": 1206 }, { "epoch": 0.2366898715560349, "grad_norm": 4.98720645904541, "learning_rate": 8.473312837012026e-06, "loss": 1.7575, "step": 1207 }, { "epoch": 0.2368859692126679, "grad_norm": 6.606943130493164, "learning_rate": 8.3817749056184e-06, "loss": 1.6911, "step": 1208 }, { "epoch": 0.23708206686930092, "grad_norm": 6.240574359893799, "learning_rate": 8.290712476828332e-06, "loss": 2.235, "step": 1209 }, { "epoch": 0.23727816452593392, "grad_norm": 7.585755348205566, "learning_rate": 8.200126023259791e-06, "loss": 1.4902, "step": 1210 }, { "epoch": 0.23747426218256693, "grad_norm": 7.400862693786621, "learning_rate": 8.110016015060484e-06, "loss": 1.9178, "step": 1211 }, { "epoch": 0.23767035983919993, "grad_norm": 9.042954444885254, "learning_rate": 8.020382919905278e-06, "loss": 2.6052, "step": 1212 }, { "epoch": 0.23786645749583293, "grad_norm": 15.022893905639648, "learning_rate": 7.931227202993873e-06, "loss": 1.697, "step": 1213 }, { "epoch": 0.23806255515246594, "grad_norm": 11.485679626464844, "learning_rate": 7.842549327048365e-06, "loss": 2.7337, "step": 1214 }, { "epoch": 0.23825865280909894, "grad_norm": 4.79671049118042, "learning_rate": 7.754349752310752e-06, "loss": 1.7505, "step": 1215 }, { "epoch": 0.23845475046573195, "grad_norm": 4.849589824676514, "learning_rate": 7.666628936540776e-06, "loss": 0.7821, "step": 1216 }, { "epoch": 0.23865084812236495, "grad_norm": 4.220393180847168, "learning_rate": 7.579387335013255e-06, "loss": 0.7812, "step": 1217 }, { "epoch": 0.23884694577899795, "grad_norm": 12.55904769897461, "learning_rate": 7.492625400515951e-06, "loss": 1.8666, "step": 1218 }, { "epoch": 0.23904304343563093, "grad_norm": 5.080347537994385, "learning_rate": 7.406343583347119e-06, "loss": 0.7878, "step": 1219 }, { "epoch": 0.23923914109226393, "grad_norm": 4.184628486633301, "learning_rate": 7.320542331313118e-06, "loss": 1.193, "step": 1220 }, { "epoch": 0.23943523874889694, "grad_norm": 4.931305885314941, "learning_rate": 7.235222089726279e-06, "loss": 0.9402, "step": 1221 }, { "epoch": 0.23963133640552994, "grad_norm": 8.1219482421875, "learning_rate": 7.15038330140233e-06, "loss": 2.1749, "step": 1222 }, { "epoch": 0.23982743406216295, "grad_norm": 5.320800304412842, "learning_rate": 7.066026406658355e-06, "loss": 1.0909, "step": 1223 }, { "epoch": 0.24002353171879595, "grad_norm": 5.9109907150268555, "learning_rate": 6.982151843310281e-06, "loss": 2.0466, "step": 1224 }, { "epoch": 0.24021962937542896, "grad_norm": 4.978448390960693, "learning_rate": 6.898760046670815e-06, "loss": 2.8843, "step": 1225 }, { "epoch": 0.24041572703206196, "grad_norm": 6.749833106994629, "learning_rate": 6.815851449547028e-06, "loss": 0.8198, "step": 1226 }, { "epoch": 0.24061182468869496, "grad_norm": 3.7781665325164795, "learning_rate": 6.7334264822381254e-06, "loss": 0.9111, "step": 1227 }, { "epoch": 0.24080792234532797, "grad_norm": 6.719789028167725, "learning_rate": 6.651485572533378e-06, "loss": 1.9275, "step": 1228 }, { "epoch": 0.24100402000196097, "grad_norm": 7.115839004516602, "learning_rate": 6.570029145709622e-06, "loss": 1.2663, "step": 1229 }, { "epoch": 0.24120011765859398, "grad_norm": 7.567049503326416, "learning_rate": 6.489057624529349e-06, "loss": 2.1002, "step": 1230 }, { "epoch": 0.24139621531522698, "grad_norm": 7.6808180809021, "learning_rate": 6.408571429238253e-06, "loss": 1.0844, "step": 1231 }, { "epoch": 0.24159231297185998, "grad_norm": 15.046584129333496, "learning_rate": 6.328570977563208e-06, "loss": 2.3513, "step": 1232 }, { "epoch": 0.241788410628493, "grad_norm": 9.350245475769043, "learning_rate": 6.24905668471013e-06, "loss": 1.81, "step": 1233 }, { "epoch": 0.241984508285126, "grad_norm": 5.964272499084473, "learning_rate": 6.170028963361618e-06, "loss": 2.0853, "step": 1234 }, { "epoch": 0.242180605941759, "grad_norm": 5.296453475952148, "learning_rate": 6.091488223675057e-06, "loss": 0.8561, "step": 1235 }, { "epoch": 0.242376703598392, "grad_norm": 20.43738555908203, "learning_rate": 6.013434873280288e-06, "loss": 3.0297, "step": 1236 }, { "epoch": 0.242572801255025, "grad_norm": 5.732899188995361, "learning_rate": 5.935869317277643e-06, "loss": 1.4933, "step": 1237 }, { "epoch": 0.242768898911658, "grad_norm": 3.8854384422302246, "learning_rate": 5.858791958235754e-06, "loss": 0.8324, "step": 1238 }, { "epoch": 0.24296499656829101, "grad_norm": 6.640622615814209, "learning_rate": 5.782203196189461e-06, "loss": 1.5419, "step": 1239 }, { "epoch": 0.24316109422492402, "grad_norm": 8.89739990234375, "learning_rate": 5.706103428637865e-06, "loss": 0.7287, "step": 1240 }, { "epoch": 0.24335719188155702, "grad_norm": 3.595503568649292, "learning_rate": 5.630493050542041e-06, "loss": 0.8245, "step": 1241 }, { "epoch": 0.24355328953819003, "grad_norm": 8.166557312011719, "learning_rate": 5.5553724543231825e-06, "loss": 1.5169, "step": 1242 }, { "epoch": 0.24374938719482303, "grad_norm": 11.863232612609863, "learning_rate": 5.480742029860464e-06, "loss": 2.4468, "step": 1243 }, { "epoch": 0.24394548485145603, "grad_norm": 4.747961044311523, "learning_rate": 5.406602164489072e-06, "loss": 1.1186, "step": 1244 }, { "epoch": 0.24414158250808904, "grad_norm": 3.378335952758789, "learning_rate": 5.332953242998151e-06, "loss": 1.222, "step": 1245 }, { "epoch": 0.24433768016472204, "grad_norm": 8.078283309936523, "learning_rate": 5.259795647628818e-06, "loss": 2.1882, "step": 1246 }, { "epoch": 0.24453377782135505, "grad_norm": 6.306114196777344, "learning_rate": 5.1871297580722515e-06, "loss": 1.9767, "step": 1247 }, { "epoch": 0.24472987547798805, "grad_norm": 4.042963981628418, "learning_rate": 5.114955951467537e-06, "loss": 1.6329, "step": 1248 }, { "epoch": 0.24492597313462103, "grad_norm": 4.9465179443359375, "learning_rate": 5.043274602399939e-06, "loss": 1.4165, "step": 1249 }, { "epoch": 0.24512207079125403, "grad_norm": 7.358532428741455, "learning_rate": 4.972086082898775e-06, "loss": 1.5899, "step": 1250 }, { "epoch": 0.24531816844788704, "grad_norm": 4.857394218444824, "learning_rate": 4.901390762435587e-06, "loss": 0.5367, "step": 1251 }, { "epoch": 0.24551426610452004, "grad_norm": 7.886226177215576, "learning_rate": 4.831189007922199e-06, "loss": 2.0356, "step": 1252 }, { "epoch": 0.24571036376115304, "grad_norm": 6.1324872970581055, "learning_rate": 4.761481183708783e-06, "loss": 1.1557, "step": 1253 }, { "epoch": 0.24590646141778605, "grad_norm": 4.222672939300537, "learning_rate": 4.692267651581994e-06, "loss": 1.5599, "step": 1254 }, { "epoch": 0.24610255907441905, "grad_norm": 6.940048694610596, "learning_rate": 4.6235487707631085e-06, "loss": 1.9772, "step": 1255 }, { "epoch": 0.24629865673105206, "grad_norm": 7.745510578155518, "learning_rate": 4.555324897906132e-06, "loss": 1.7293, "step": 1256 }, { "epoch": 0.24649475438768506, "grad_norm": 4.879486560821533, "learning_rate": 4.48759638709596e-06, "loss": 0.5805, "step": 1257 }, { "epoch": 0.24669085204431807, "grad_norm": 8.680879592895508, "learning_rate": 4.42036358984651e-06, "loss": 1.9445, "step": 1258 }, { "epoch": 0.24688694970095107, "grad_norm": 9.775550842285156, "learning_rate": 4.353626855098958e-06, "loss": 1.8064, "step": 1259 }, { "epoch": 0.24708304735758407, "grad_norm": 4.5769805908203125, "learning_rate": 4.287386529219894e-06, "loss": 1.3112, "step": 1260 }, { "epoch": 0.24727914501421708, "grad_norm": 9.00042724609375, "learning_rate": 4.221642955999494e-06, "loss": 1.9532, "step": 1261 }, { "epoch": 0.24747524267085008, "grad_norm": 5.064240455627441, "learning_rate": 4.156396476649782e-06, "loss": 1.262, "step": 1262 }, { "epoch": 0.24767134032748309, "grad_norm": 5.231726169586182, "learning_rate": 4.091647429802869e-06, "loss": 0.9197, "step": 1263 }, { "epoch": 0.2478674379841161, "grad_norm": 4.577012062072754, "learning_rate": 4.027396151509133e-06, "loss": 0.9262, "step": 1264 }, { "epoch": 0.2480635356407491, "grad_norm": 11.125844955444336, "learning_rate": 3.963642975235515e-06, "loss": 0.8612, "step": 1265 }, { "epoch": 0.2482596332973821, "grad_norm": 6.266839981079102, "learning_rate": 3.900388231863805e-06, "loss": 1.8149, "step": 1266 }, { "epoch": 0.2484557309540151, "grad_norm": 11.235099792480469, "learning_rate": 3.8376322496888825e-06, "loss": 1.9527, "step": 1267 }, { "epoch": 0.2486518286106481, "grad_norm": 5.78483247756958, "learning_rate": 3.7753753544170655e-06, "loss": 1.5316, "step": 1268 }, { "epoch": 0.2488479262672811, "grad_norm": 7.571983814239502, "learning_rate": 3.7136178691643433e-06, "loss": 1.8357, "step": 1269 }, { "epoch": 0.24904402392391412, "grad_norm": 10.283147811889648, "learning_rate": 3.6523601144548003e-06, "loss": 1.9926, "step": 1270 }, { "epoch": 0.24924012158054712, "grad_norm": 4.631470203399658, "learning_rate": 3.5916024082188414e-06, "loss": 0.7877, "step": 1271 }, { "epoch": 0.24943621923718012, "grad_norm": 5.498098373413086, "learning_rate": 3.531345065791636e-06, "loss": 3.0685, "step": 1272 }, { "epoch": 0.24963231689381313, "grad_norm": 5.340700149536133, "learning_rate": 3.471588399911441e-06, "loss": 1.7174, "step": 1273 }, { "epoch": 0.24982841455044613, "grad_norm": 8.994786262512207, "learning_rate": 3.4123327207179477e-06, "loss": 1.722, "step": 1274 }, { "epoch": 0.2500245122070791, "grad_norm": 4.499628067016602, "learning_rate": 3.3535783357507624e-06, "loss": 1.3553, "step": 1275 }, { "epoch": 0.25022060986371214, "grad_norm": 13.06672477722168, "learning_rate": 3.29532554994767e-06, "loss": 2.8721, "step": 1276 }, { "epoch": 0.2504167075203451, "grad_norm": 4.79543399810791, "learning_rate": 3.2375746656432284e-06, "loss": 1.0701, "step": 1277 }, { "epoch": 0.25061280517697815, "grad_norm": 5.686835765838623, "learning_rate": 3.180325982567034e-06, "loss": 1.2648, "step": 1278 }, { "epoch": 0.2508089028336111, "grad_norm": 4.657541275024414, "learning_rate": 3.1235797978422687e-06, "loss": 1.3918, "step": 1279 }, { "epoch": 0.25100500049024416, "grad_norm": 10.92974853515625, "learning_rate": 3.0673364059841338e-06, "loss": 2.4811, "step": 1280 }, { "epoch": 0.25120109814687713, "grad_norm": 5.082363128662109, "learning_rate": 3.0115960988982506e-06, "loss": 2.2096, "step": 1281 }, { "epoch": 0.25139719580351017, "grad_norm": 3.3826422691345215, "learning_rate": 2.9563591658793076e-06, "loss": 0.8372, "step": 1282 }, { "epoch": 0.25159329346014314, "grad_norm": 5.3609113693237305, "learning_rate": 2.901625893609361e-06, "loss": 1.2737, "step": 1283 }, { "epoch": 0.2517893911167762, "grad_norm": 4.7862043380737305, "learning_rate": 2.8473965661565347e-06, "loss": 0.7095, "step": 1284 }, { "epoch": 0.25198548877340915, "grad_norm": 8.444502830505371, "learning_rate": 2.793671464973413e-06, "loss": 2.1887, "step": 1285 }, { "epoch": 0.2521815864300422, "grad_norm": 5.742269039154053, "learning_rate": 2.740450868895583e-06, "loss": 1.1412, "step": 1286 }, { "epoch": 0.25237768408667516, "grad_norm": 7.201744079589844, "learning_rate": 2.687735054140317e-06, "loss": 1.3327, "step": 1287 }, { "epoch": 0.2525737817433082, "grad_norm": 7.618087291717529, "learning_rate": 2.63552429430497e-06, "loss": 1.9268, "step": 1288 }, { "epoch": 0.25276987939994117, "grad_norm": 7.360147953033447, "learning_rate": 2.5838188603657056e-06, "loss": 1.0334, "step": 1289 }, { "epoch": 0.2529659770565742, "grad_norm": 5.17711067199707, "learning_rate": 2.5326190206759527e-06, "loss": 1.784, "step": 1290 }, { "epoch": 0.2531620747132072, "grad_norm": 6.017335891723633, "learning_rate": 2.4819250409651607e-06, "loss": 0.9696, "step": 1291 }, { "epoch": 0.2533581723698402, "grad_norm": 6.2475433349609375, "learning_rate": 2.4317371843372904e-06, "loss": 2.4048, "step": 1292 }, { "epoch": 0.2535542700264732, "grad_norm": 6.189507484436035, "learning_rate": 2.3820557112695153e-06, "loss": 1.3096, "step": 1293 }, { "epoch": 0.25375036768310616, "grad_norm": 7.4594011306762695, "learning_rate": 2.3328808796108657e-06, "loss": 0.8887, "step": 1294 }, { "epoch": 0.2539464653397392, "grad_norm": 4.671147346496582, "learning_rate": 2.2842129445808546e-06, "loss": 1.3748, "step": 1295 }, { "epoch": 0.25414256299637217, "grad_norm": 4.567478179931641, "learning_rate": 2.2360521587682313e-06, "loss": 0.8291, "step": 1296 }, { "epoch": 0.2543386606530052, "grad_norm": 7.609241008758545, "learning_rate": 2.188398772129552e-06, "loss": 2.9765, "step": 1297 }, { "epoch": 0.2545347583096382, "grad_norm": 9.003718376159668, "learning_rate": 2.1412530319879887e-06, "loss": 3.6435, "step": 1298 }, { "epoch": 0.2547308559662712, "grad_norm": 6.3176164627075195, "learning_rate": 2.0946151830320224e-06, "loss": 1.2652, "step": 1299 }, { "epoch": 0.2549269536229042, "grad_norm": 12.08651065826416, "learning_rate": 2.0484854673140983e-06, "loss": 2.2016, "step": 1300 }, { "epoch": 0.2551230512795372, "grad_norm": 3.733414649963379, "learning_rate": 2.002864124249504e-06, "loss": 2.3059, "step": 1301 }, { "epoch": 0.2553191489361702, "grad_norm": 8.422008514404297, "learning_rate": 1.9577513906149702e-06, "loss": 1.5297, "step": 1302 }, { "epoch": 0.2555152465928032, "grad_norm": 7.98868989944458, "learning_rate": 1.913147500547574e-06, "loss": 2.1541, "step": 1303 }, { "epoch": 0.2557113442494362, "grad_norm": 4.629753589630127, "learning_rate": 1.869052685543471e-06, "loss": 1.617, "step": 1304 }, { "epoch": 0.25590744190606923, "grad_norm": 4.7464823722839355, "learning_rate": 1.825467174456652e-06, "loss": 0.7396, "step": 1305 }, { "epoch": 0.2561035395627022, "grad_norm": 6.383626937866211, "learning_rate": 1.7823911934978898e-06, "loss": 2.6406, "step": 1306 }, { "epoch": 0.25629963721933524, "grad_norm": 3.751051902770996, "learning_rate": 1.73982496623335e-06, "loss": 2.1239, "step": 1307 }, { "epoch": 0.2564957348759682, "grad_norm": 8.627437591552734, "learning_rate": 1.6977687135836584e-06, "loss": 1.5667, "step": 1308 }, { "epoch": 0.25669183253260125, "grad_norm": 5.191834926605225, "learning_rate": 1.656222653822581e-06, "loss": 1.7599, "step": 1309 }, { "epoch": 0.2568879301892342, "grad_norm": 11.265420913696289, "learning_rate": 1.615187002576013e-06, "loss": 1.6896, "step": 1310 }, { "epoch": 0.25708402784586726, "grad_norm": 7.610350608825684, "learning_rate": 1.574661972820779e-06, "loss": 1.0143, "step": 1311 }, { "epoch": 0.25728012550250023, "grad_norm": 2.748887538909912, "learning_rate": 1.5346477748835354e-06, "loss": 0.3839, "step": 1312 }, { "epoch": 0.25747622315913327, "grad_norm": 9.816802024841309, "learning_rate": 1.4951446164397587e-06, "loss": 1.659, "step": 1313 }, { "epoch": 0.25767232081576624, "grad_norm": 11.259073257446289, "learning_rate": 1.4561527025125476e-06, "loss": 1.9394, "step": 1314 }, { "epoch": 0.2578684184723993, "grad_norm": 5.475277423858643, "learning_rate": 1.4176722354716455e-06, "loss": 1.1844, "step": 1315 }, { "epoch": 0.25806451612903225, "grad_norm": 3.6779868602752686, "learning_rate": 1.379703415032374e-06, "loss": 1.5793, "step": 1316 }, { "epoch": 0.2582606137856653, "grad_norm": 7.077935695648193, "learning_rate": 1.3422464382545797e-06, "loss": 2.019, "step": 1317 }, { "epoch": 0.25845671144229826, "grad_norm": 3.54400634765625, "learning_rate": 1.3053014995415891e-06, "loss": 1.0317, "step": 1318 }, { "epoch": 0.2586528090989313, "grad_norm": 8.784947395324707, "learning_rate": 1.268868790639277e-06, "loss": 0.6858, "step": 1319 }, { "epoch": 0.25884890675556427, "grad_norm": 3.767857789993286, "learning_rate": 1.2329485006349895e-06, "loss": 1.6571, "step": 1320 }, { "epoch": 0.2590450044121973, "grad_norm": 2.997652769088745, "learning_rate": 1.1975408159566103e-06, "loss": 0.7133, "step": 1321 }, { "epoch": 0.2592411020688303, "grad_norm": 13.322317123413086, "learning_rate": 1.1626459203715633e-06, "loss": 2.6333, "step": 1322 }, { "epoch": 0.2594371997254633, "grad_norm": 7.647657871246338, "learning_rate": 1.128263994985901e-06, "loss": 1.3197, "step": 1323 }, { "epoch": 0.2596332973820963, "grad_norm": 5.545785903930664, "learning_rate": 1.0943952182433048e-06, "loss": 1.8365, "step": 1324 }, { "epoch": 0.25982939503872926, "grad_norm": 6.008249759674072, "learning_rate": 1.0610397659242322e-06, "loss": 0.7947, "step": 1325 }, { "epoch": 0.2600254926953623, "grad_norm": 6.118869304656982, "learning_rate": 1.0281978111449375e-06, "loss": 2.1696, "step": 1326 }, { "epoch": 0.26022159035199527, "grad_norm": 5.802155017852783, "learning_rate": 9.958695243565853e-07, "loss": 1.3875, "step": 1327 }, { "epoch": 0.2604176880086283, "grad_norm": 5.157081604003906, "learning_rate": 9.640550733444275e-07, "loss": 1.7365, "step": 1328 }, { "epoch": 0.2606137856652613, "grad_norm": 4.770791053771973, "learning_rate": 9.327546232268392e-07, "loss": 1.0385, "step": 1329 }, { "epoch": 0.2608098833218943, "grad_norm": 6.321690559387207, "learning_rate": 9.019683364545395e-07, "loss": 1.4739, "step": 1330 }, { "epoch": 0.2610059809785273, "grad_norm": 10.451473236083984, "learning_rate": 8.71696372809705e-07, "loss": 2.4985, "step": 1331 }, { "epoch": 0.2612020786351603, "grad_norm": 4.130318641662598, "learning_rate": 8.419388894051472e-07, "loss": 0.9982, "step": 1332 }, { "epoch": 0.2613981762917933, "grad_norm": 6.775696277618408, "learning_rate": 8.126960406835249e-07, "loss": 1.3855, "step": 1333 }, { "epoch": 0.2615942739484263, "grad_norm": 4.182593822479248, "learning_rate": 7.839679784164778e-07, "loss": 1.0039, "step": 1334 }, { "epoch": 0.2617903716050593, "grad_norm": 8.561737060546875, "learning_rate": 7.557548517039381e-07, "loss": 2.7707, "step": 1335 }, { "epoch": 0.26198646926169233, "grad_norm": 7.187052249908447, "learning_rate": 7.28056806973243e-07, "loss": 2.3727, "step": 1336 }, { "epoch": 0.2621825669183253, "grad_norm": 6.421212673187256, "learning_rate": 7.008739879784787e-07, "loss": 1.4712, "step": 1337 }, { "epoch": 0.26237866457495834, "grad_norm": 5.969448089599609, "learning_rate": 6.742065357996486e-07, "loss": 2.0104, "step": 1338 }, { "epoch": 0.2625747622315913, "grad_norm": 3.251781463623047, "learning_rate": 6.480545888420176e-07, "loss": 0.7331, "step": 1339 }, { "epoch": 0.26277085988822435, "grad_norm": 8.010222434997559, "learning_rate": 6.224182828353242e-07, "loss": 2.0611, "step": 1340 }, { "epoch": 0.2629669575448573, "grad_norm": 14.670138359069824, "learning_rate": 5.972977508331368e-07, "loss": 1.6784, "step": 1341 }, { "epoch": 0.26316305520149036, "grad_norm": 3.959542751312256, "learning_rate": 5.726931232120869e-07, "loss": 1.1185, "step": 1342 }, { "epoch": 0.26335915285812334, "grad_norm": 3.680546998977661, "learning_rate": 5.486045276712926e-07, "loss": 1.0173, "step": 1343 }, { "epoch": 0.26355525051475637, "grad_norm": 8.655952453613281, "learning_rate": 5.250320892316252e-07, "loss": 1.4111, "step": 1344 }, { "epoch": 0.26375134817138934, "grad_norm": 5.929543495178223, "learning_rate": 5.019759302350547e-07, "loss": 1.7795, "step": 1345 }, { "epoch": 0.2639474458280224, "grad_norm": 6.462841987609863, "learning_rate": 4.794361703440719e-07, "loss": 0.7667, "step": 1346 }, { "epoch": 0.26414354348465535, "grad_norm": 11.463775634765625, "learning_rate": 4.57412926541001e-07, "loss": 1.8102, "step": 1347 }, { "epoch": 0.2643396411412884, "grad_norm": 8.908552169799805, "learning_rate": 4.3590631312746545e-07, "loss": 1.5951, "step": 1348 }, { "epoch": 0.26453573879792136, "grad_norm": 5.707728862762451, "learning_rate": 4.149164417237117e-07, "loss": 1.287, "step": 1349 }, { "epoch": 0.2647318364545544, "grad_norm": 18.28376579284668, "learning_rate": 3.944434212680981e-07, "loss": 3.0229, "step": 1350 }, { "epoch": 0.26492793411118737, "grad_norm": 7.537192344665527, "learning_rate": 3.744873580165176e-07, "loss": 1.6773, "step": 1351 }, { "epoch": 0.2651240317678204, "grad_norm": 5.802631378173828, "learning_rate": 3.5504835554177605e-07, "loss": 1.5217, "step": 1352 }, { "epoch": 0.2653201294244534, "grad_norm": 5.430902004241943, "learning_rate": 3.361265147331816e-07, "loss": 1.668, "step": 1353 }, { "epoch": 0.26551622708108635, "grad_norm": 8.240734100341797, "learning_rate": 3.177219337958892e-07, "loss": 1.1068, "step": 1354 }, { "epoch": 0.2657123247377194, "grad_norm": 10.376667976379395, "learning_rate": 2.998347082505126e-07, "loss": 2.2403, "step": 1355 }, { "epoch": 0.26590842239435236, "grad_norm": 6.063961505889893, "learning_rate": 2.8246493093250227e-07, "loss": 1.3754, "step": 1356 }, { "epoch": 0.2661045200509854, "grad_norm": 5.9845733642578125, "learning_rate": 2.6561269199179006e-07, "loss": 1.5861, "step": 1357 }, { "epoch": 0.26630061770761837, "grad_norm": 6.585875511169434, "learning_rate": 2.492780788922344e-07, "loss": 2.882, "step": 1358 }, { "epoch": 0.2664967153642514, "grad_norm": 5.5460968017578125, "learning_rate": 2.3346117641116494e-07, "loss": 1.7507, "step": 1359 }, { "epoch": 0.2666928130208844, "grad_norm": 6.179254531860352, "learning_rate": 2.1816206663902717e-07, "loss": 0.9413, "step": 1360 }, { "epoch": 0.2668889106775174, "grad_norm": 18.120824813842773, "learning_rate": 2.0338082897886079e-07, "loss": 2.7844, "step": 1361 }, { "epoch": 0.2670850083341504, "grad_norm": 4.4856157302856445, "learning_rate": 1.891175401459444e-07, "loss": 0.7925, "step": 1362 }, { "epoch": 0.2672811059907834, "grad_norm": 5.911020755767822, "learning_rate": 1.7537227416735135e-07, "loss": 1.7331, "step": 1363 }, { "epoch": 0.2674772036474164, "grad_norm": 4.898580551147461, "learning_rate": 1.6214510238163893e-07, "loss": 1.5288, "step": 1364 }, { "epoch": 0.2676733013040494, "grad_norm": 4.032687664031982, "learning_rate": 1.4943609343839316e-07, "loss": 1.6682, "step": 1365 }, { "epoch": 0.2678693989606824, "grad_norm": 4.88344144821167, "learning_rate": 1.37245313297929e-07, "loss": 0.8977, "step": 1366 }, { "epoch": 0.26806549661731544, "grad_norm": 5.699201583862305, "learning_rate": 1.2557282523094627e-07, "loss": 1.4562, "step": 1367 }, { "epoch": 0.2682615942739484, "grad_norm": 6.58184814453125, "learning_rate": 1.1441868981815207e-07, "loss": 0.9233, "step": 1368 }, { "epoch": 0.26845769193058144, "grad_norm": 7.048121929168701, "learning_rate": 1.0378296494999439e-07, "loss": 2.6469, "step": 1369 }, { "epoch": 0.2686537895872144, "grad_norm": 6.19679069519043, "learning_rate": 9.366570582637346e-08, "loss": 1.6399, "step": 1370 }, { "epoch": 0.26884988724384745, "grad_norm": 6.541812896728516, "learning_rate": 8.406696495627531e-08, "loss": 2.3217, "step": 1371 }, { "epoch": 0.26904598490048043, "grad_norm": 7.404893398284912, "learning_rate": 7.498679215761639e-08, "loss": 1.32, "step": 1372 }, { "epoch": 0.26924208255711346, "grad_norm": 6.199706077575684, "learning_rate": 6.642523455687721e-08, "loss": 1.0584, "step": 1373 }, { "epoch": 0.26943818021374644, "grad_norm": 4.55602502822876, "learning_rate": 5.838233658892467e-08, "loss": 1.0957, "step": 1374 }, { "epoch": 0.26963427787037947, "grad_norm": 8.639842987060547, "learning_rate": 5.08581399967345e-08, "loss": 1.0236, "step": 1375 }, { "epoch": 0.26983037552701244, "grad_norm": 4.639615535736084, "learning_rate": 4.3852683831235866e-08, "loss": 0.8647, "step": 1376 }, { "epoch": 0.2700264731836455, "grad_norm": 4.805739402770996, "learning_rate": 3.736600445104488e-08, "loss": 1.9424, "step": 1377 }, { "epoch": 0.27022257084027845, "grad_norm": 4.504196643829346, "learning_rate": 3.139813552230919e-08, "loss": 0.815, "step": 1378 }, { "epoch": 0.2704186684969115, "grad_norm": 8.973424911499023, "learning_rate": 2.5949108018530342e-08, "loss": 1.7934, "step": 1379 }, { "epoch": 0.27061476615354446, "grad_norm": 5.073940277099609, "learning_rate": 2.101895022040834e-08, "loss": 1.6873, "step": 1380 }, { "epoch": 0.2708108638101775, "grad_norm": 5.69566535949707, "learning_rate": 1.6607687715675113e-08, "loss": 1.5254, "step": 1381 }, { "epoch": 0.27100696146681047, "grad_norm": 2.8405473232269287, "learning_rate": 1.2715343398972402e-08, "loss": 1.5803, "step": 1382 }, { "epoch": 0.2712030591234435, "grad_norm": 3.1316022872924805, "learning_rate": 9.341937471740724e-09, "loss": 0.5618, "step": 1383 }, { "epoch": 0.2713991567800765, "grad_norm": 6.8218793869018555, "learning_rate": 6.487487442097262e-09, "loss": 0.5681, "step": 1384 }, { "epoch": 0.27159525443670945, "grad_norm": 6.514251232147217, "learning_rate": 4.1520081247803375e-09, "loss": 2.2744, "step": 1385 }, { "epoch": 0.2717913520933425, "grad_norm": 8.452823638916016, "learning_rate": 2.335511641005095e-09, "loss": 1.4333, "step": 1386 }, { "epoch": 0.27198744974997546, "grad_norm": 5.929966449737549, "learning_rate": 1.0380074184856981e-09, "loss": 1.6609, "step": 1387 }, { "epoch": 0.2721835474066085, "grad_norm": 3.682739019393921, "learning_rate": 2.595021913243123e-10, "loss": 1.7354, "step": 1388 }, { "epoch": 0.27237964506324147, "grad_norm": 6.1388397216796875, "learning_rate": 0.0, "loss": 0.7588, "step": 1389 } ], "logging_steps": 1, "max_steps": 1389, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 348, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.976036715065508e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }