{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04566455356183518, "eval_steps": 500, "global_step": 1700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.6861502095197165e-05, "grad_norm": 0.9079805016517639, "learning_rate": 2e-05, "loss": 5.1449, "step": 1 }, { "epoch": 5.372300419039433e-05, "grad_norm": 1.1043980121612549, "learning_rate": 4e-05, "loss": 4.9052, "step": 2 }, { "epoch": 8.05845062855915e-05, "grad_norm": 1.1937360763549805, "learning_rate": 6e-05, "loss": 5.1465, "step": 3 }, { "epoch": 0.00010744600838078866, "grad_norm": 3.707120180130005, "learning_rate": 8e-05, "loss": 5.3884, "step": 4 }, { "epoch": 0.00013430751047598582, "grad_norm": 1.2312233448028564, "learning_rate": 0.0001, "loss": 5.7386, "step": 5 }, { "epoch": 0.000161169012571183, "grad_norm": 1.418683648109436, "learning_rate": 9.999999999999938e-05, "loss": 5.7863, "step": 6 }, { "epoch": 0.00018803051466638015, "grad_norm": 1.7924962043762207, "learning_rate": 9.999999999999754e-05, "loss": 5.23, "step": 7 }, { "epoch": 0.00021489201676157732, "grad_norm": 1.8357185125350952, "learning_rate": 9.999999999999445e-05, "loss": 4.9708, "step": 8 }, { "epoch": 0.00024175351885677448, "grad_norm": 1.7411339282989502, "learning_rate": 9.999999999999014e-05, "loss": 4.9414, "step": 9 }, { "epoch": 0.00026861502095197165, "grad_norm": 1.7914339303970337, "learning_rate": 9.999999999998458e-05, "loss": 4.6556, "step": 10 }, { "epoch": 0.0002954765230471688, "grad_norm": 1.8011003732681274, "learning_rate": 9.99999999999778e-05, "loss": 4.2169, "step": 11 }, { "epoch": 0.000322338025142366, "grad_norm": 1.3325257301330566, "learning_rate": 9.999999999996978e-05, "loss": 4.1794, "step": 12 }, { "epoch": 0.00034919952723756314, "grad_norm": 1.4623206853866577, "learning_rate": 9.999999999996053e-05, "loss": 4.4874, "step": 13 }, { "epoch": 0.0003760610293327603, "grad_norm": 1.2885961532592773, "learning_rate": 9.999999999995005e-05, "loss": 4.0216, "step": 14 }, { "epoch": 0.00040292253142795747, "grad_norm": 1.2243579626083374, "learning_rate": 9.999999999993833e-05, "loss": 3.9258, "step": 15 }, { "epoch": 0.00042978403352315463, "grad_norm": 1.3720817565917969, "learning_rate": 9.999999999992537e-05, "loss": 4.0791, "step": 16 }, { "epoch": 0.0004566455356183518, "grad_norm": 1.672309398651123, "learning_rate": 9.999999999991118e-05, "loss": 4.1378, "step": 17 }, { "epoch": 0.00048350703771354896, "grad_norm": 1.2076022624969482, "learning_rate": 9.999999999989576e-05, "loss": 3.5104, "step": 18 }, { "epoch": 0.0005103685398087461, "grad_norm": 1.3783760070800781, "learning_rate": 9.99999999998791e-05, "loss": 3.9612, "step": 19 }, { "epoch": 0.0005372300419039433, "grad_norm": 1.3485159873962402, "learning_rate": 9.999999999986121e-05, "loss": 3.8829, "step": 20 }, { "epoch": 0.0005640915439991404, "grad_norm": 1.6064331531524658, "learning_rate": 9.999999999984209e-05, "loss": 4.1337, "step": 21 }, { "epoch": 0.0005909530460943376, "grad_norm": 1.2591063976287842, "learning_rate": 9.999999999982173e-05, "loss": 3.4452, "step": 22 }, { "epoch": 0.0006178145481895347, "grad_norm": 1.456554651260376, "learning_rate": 9.999999999980015e-05, "loss": 3.49, "step": 23 }, { "epoch": 0.000644676050284732, "grad_norm": 1.4870679378509521, "learning_rate": 9.999999999977731e-05, "loss": 3.829, "step": 24 }, { "epoch": 0.0006715375523799291, "grad_norm": 1.3454641103744507, "learning_rate": 9.999999999975327e-05, "loss": 3.7703, "step": 25 }, { "epoch": 0.0006983990544751263, "grad_norm": 1.5091837644577026, "learning_rate": 9.999999999972798e-05, "loss": 3.7927, "step": 26 }, { "epoch": 0.0007252605565703234, "grad_norm": 1.527295708656311, "learning_rate": 9.999999999970144e-05, "loss": 3.6802, "step": 27 }, { "epoch": 0.0007521220586655206, "grad_norm": 1.5174264907836914, "learning_rate": 9.999999999967369e-05, "loss": 3.684, "step": 28 }, { "epoch": 0.0007789835607607177, "grad_norm": 1.5124770402908325, "learning_rate": 9.99999999996447e-05, "loss": 3.8185, "step": 29 }, { "epoch": 0.0008058450628559149, "grad_norm": 1.4548962116241455, "learning_rate": 9.999999999961446e-05, "loss": 3.7584, "step": 30 }, { "epoch": 0.000832706564951112, "grad_norm": 1.470106840133667, "learning_rate": 9.999999999958302e-05, "loss": 3.5163, "step": 31 }, { "epoch": 0.0008595680670463093, "grad_norm": 1.4147369861602783, "learning_rate": 9.999999999955032e-05, "loss": 3.6908, "step": 32 }, { "epoch": 0.0008864295691415064, "grad_norm": 1.5790003538131714, "learning_rate": 9.99999999995164e-05, "loss": 3.589, "step": 33 }, { "epoch": 0.0009132910712367036, "grad_norm": 1.5375813245773315, "learning_rate": 9.999999999948123e-05, "loss": 3.629, "step": 34 }, { "epoch": 0.0009401525733319007, "grad_norm": 1.5038788318634033, "learning_rate": 9.999999999944484e-05, "loss": 3.691, "step": 35 }, { "epoch": 0.0009670140754270979, "grad_norm": 1.4857861995697021, "learning_rate": 9.999999999940722e-05, "loss": 3.7313, "step": 36 }, { "epoch": 0.0009938755775222951, "grad_norm": 1.533119559288025, "learning_rate": 9.999999999936835e-05, "loss": 3.925, "step": 37 }, { "epoch": 0.0010207370796174921, "grad_norm": 1.562211036682129, "learning_rate": 9.999999999932825e-05, "loss": 3.8762, "step": 38 }, { "epoch": 0.0010475985817126894, "grad_norm": 1.3962745666503906, "learning_rate": 9.999999999928693e-05, "loss": 3.3351, "step": 39 }, { "epoch": 0.0010744600838078866, "grad_norm": 1.3966686725616455, "learning_rate": 9.999999999924437e-05, "loss": 3.7117, "step": 40 }, { "epoch": 0.0011013215859030838, "grad_norm": 1.4492931365966797, "learning_rate": 9.999999999920057e-05, "loss": 3.715, "step": 41 }, { "epoch": 0.0011281830879982808, "grad_norm": 1.4874073266983032, "learning_rate": 9.999999999915553e-05, "loss": 3.6364, "step": 42 }, { "epoch": 0.001155044590093478, "grad_norm": 1.5005569458007812, "learning_rate": 9.999999999910928e-05, "loss": 3.712, "step": 43 }, { "epoch": 0.0011819060921886752, "grad_norm": 1.3945828676223755, "learning_rate": 9.999999999906178e-05, "loss": 3.6228, "step": 44 }, { "epoch": 0.0012087675942838725, "grad_norm": 1.4671238660812378, "learning_rate": 9.999999999901304e-05, "loss": 3.5543, "step": 45 }, { "epoch": 0.0012356290963790695, "grad_norm": 1.5679062604904175, "learning_rate": 9.999999999896307e-05, "loss": 3.4235, "step": 46 }, { "epoch": 0.0012624905984742667, "grad_norm": 1.389269471168518, "learning_rate": 9.999999999891187e-05, "loss": 3.5772, "step": 47 }, { "epoch": 0.001289352100569464, "grad_norm": 1.628616452217102, "learning_rate": 9.999999999885945e-05, "loss": 3.9661, "step": 48 }, { "epoch": 0.001316213602664661, "grad_norm": 1.5256599187850952, "learning_rate": 9.999999999880578e-05, "loss": 3.6918, "step": 49 }, { "epoch": 0.0013430751047598581, "grad_norm": 1.6094127893447876, "learning_rate": 9.999999999875088e-05, "loss": 3.5672, "step": 50 }, { "epoch": 0.0013699366068550553, "grad_norm": 1.349199891090393, "learning_rate": 9.999999999869475e-05, "loss": 3.6309, "step": 51 }, { "epoch": 0.0013967981089502526, "grad_norm": 1.3306453227996826, "learning_rate": 9.999999999863738e-05, "loss": 3.3533, "step": 52 }, { "epoch": 0.0014236596110454496, "grad_norm": 1.3546539545059204, "learning_rate": 9.999999999857879e-05, "loss": 3.2053, "step": 53 }, { "epoch": 0.0014505211131406468, "grad_norm": 1.1287821531295776, "learning_rate": 9.999999999851894e-05, "loss": 3.3292, "step": 54 }, { "epoch": 0.001477382615235844, "grad_norm": 1.1314492225646973, "learning_rate": 9.999999999845788e-05, "loss": 2.9421, "step": 55 }, { "epoch": 0.0015042441173310412, "grad_norm": 1.3284823894500732, "learning_rate": 9.999999999839558e-05, "loss": 3.5217, "step": 56 }, { "epoch": 0.0015311056194262382, "grad_norm": 1.223236322402954, "learning_rate": 9.999999999833204e-05, "loss": 3.376, "step": 57 }, { "epoch": 0.0015579671215214354, "grad_norm": 1.043579339981079, "learning_rate": 9.999999999826727e-05, "loss": 3.1108, "step": 58 }, { "epoch": 0.0015848286236166327, "grad_norm": 1.1521614789962769, "learning_rate": 9.999999999820126e-05, "loss": 3.5073, "step": 59 }, { "epoch": 0.0016116901257118299, "grad_norm": 1.1598647832870483, "learning_rate": 9.999999999813403e-05, "loss": 3.0585, "step": 60 }, { "epoch": 0.0016385516278070269, "grad_norm": 1.1426210403442383, "learning_rate": 9.999999999806556e-05, "loss": 3.5695, "step": 61 }, { "epoch": 0.001665413129902224, "grad_norm": 1.1063036918640137, "learning_rate": 9.999999999799586e-05, "loss": 3.5192, "step": 62 }, { "epoch": 0.0016922746319974213, "grad_norm": 1.1488440036773682, "learning_rate": 9.999999999792492e-05, "loss": 3.3374, "step": 63 }, { "epoch": 0.0017191361340926185, "grad_norm": 1.1304515600204468, "learning_rate": 9.999999999785274e-05, "loss": 3.3304, "step": 64 }, { "epoch": 0.0017459976361878155, "grad_norm": 1.1385020017623901, "learning_rate": 9.999999999777934e-05, "loss": 3.2574, "step": 65 }, { "epoch": 0.0017728591382830128, "grad_norm": 1.1962132453918457, "learning_rate": 9.999999999770471e-05, "loss": 3.5773, "step": 66 }, { "epoch": 0.00179972064037821, "grad_norm": 1.2136801481246948, "learning_rate": 9.999999999762884e-05, "loss": 3.3551, "step": 67 }, { "epoch": 0.0018265821424734072, "grad_norm": 1.208842158317566, "learning_rate": 9.999999999755171e-05, "loss": 3.126, "step": 68 }, { "epoch": 0.0018534436445686042, "grad_norm": 1.1841886043548584, "learning_rate": 9.999999999747338e-05, "loss": 3.3473, "step": 69 }, { "epoch": 0.0018803051466638014, "grad_norm": 1.213433861732483, "learning_rate": 9.999999999739381e-05, "loss": 3.1907, "step": 70 }, { "epoch": 0.0019071666487589986, "grad_norm": 1.2414772510528564, "learning_rate": 9.9999999997313e-05, "loss": 3.5086, "step": 71 }, { "epoch": 0.0019340281508541959, "grad_norm": 1.267667293548584, "learning_rate": 9.999999999723097e-05, "loss": 3.2744, "step": 72 }, { "epoch": 0.001960889652949393, "grad_norm": 1.3424843549728394, "learning_rate": 9.999999999714769e-05, "loss": 3.5996, "step": 73 }, { "epoch": 0.0019877511550445903, "grad_norm": 1.2784979343414307, "learning_rate": 9.999999999706317e-05, "loss": 3.4, "step": 74 }, { "epoch": 0.0020146126571397873, "grad_norm": 1.4537830352783203, "learning_rate": 9.999999999697744e-05, "loss": 3.3768, "step": 75 }, { "epoch": 0.0020414741592349843, "grad_norm": 1.3794150352478027, "learning_rate": 9.999999999689046e-05, "loss": 3.1867, "step": 76 }, { "epoch": 0.0020683356613301817, "grad_norm": 1.411429762840271, "learning_rate": 9.999999999680225e-05, "loss": 3.4702, "step": 77 }, { "epoch": 0.0020951971634253787, "grad_norm": 1.3160576820373535, "learning_rate": 9.999999999671281e-05, "loss": 3.4361, "step": 78 }, { "epoch": 0.0021220586655205757, "grad_norm": 1.2753205299377441, "learning_rate": 9.999999999662213e-05, "loss": 3.3278, "step": 79 }, { "epoch": 0.002148920167615773, "grad_norm": 1.4154719114303589, "learning_rate": 9.999999999653022e-05, "loss": 3.7775, "step": 80 }, { "epoch": 0.00217578166971097, "grad_norm": 1.354750156402588, "learning_rate": 9.999999999643707e-05, "loss": 3.5165, "step": 81 }, { "epoch": 0.0022026431718061676, "grad_norm": 1.316260814666748, "learning_rate": 9.999999999634269e-05, "loss": 3.0653, "step": 82 }, { "epoch": 0.0022295046739013646, "grad_norm": 1.3837639093399048, "learning_rate": 9.999999999624708e-05, "loss": 3.3219, "step": 83 }, { "epoch": 0.0022563661759965616, "grad_norm": 1.3703302145004272, "learning_rate": 9.999999999615023e-05, "loss": 3.2097, "step": 84 }, { "epoch": 0.002283227678091759, "grad_norm": 1.4257659912109375, "learning_rate": 9.999999999605217e-05, "loss": 3.669, "step": 85 }, { "epoch": 0.002310089180186956, "grad_norm": 1.4506181478500366, "learning_rate": 9.999999999595284e-05, "loss": 3.6931, "step": 86 }, { "epoch": 0.002336950682282153, "grad_norm": 1.5057146549224854, "learning_rate": 9.99999999958523e-05, "loss": 3.3992, "step": 87 }, { "epoch": 0.0023638121843773505, "grad_norm": 1.5274763107299805, "learning_rate": 9.999999999575052e-05, "loss": 3.476, "step": 88 }, { "epoch": 0.0023906736864725475, "grad_norm": 1.5464972257614136, "learning_rate": 9.99999999956475e-05, "loss": 3.5653, "step": 89 }, { "epoch": 0.002417535188567745, "grad_norm": 1.5150189399719238, "learning_rate": 9.999999999554326e-05, "loss": 3.5639, "step": 90 }, { "epoch": 0.002444396690662942, "grad_norm": 1.5306520462036133, "learning_rate": 9.999999999543778e-05, "loss": 3.7718, "step": 91 }, { "epoch": 0.002471258192758139, "grad_norm": 1.562620997428894, "learning_rate": 9.999999999533106e-05, "loss": 3.5158, "step": 92 }, { "epoch": 0.0024981196948533364, "grad_norm": 1.4864206314086914, "learning_rate": 9.999999999522312e-05, "loss": 3.6478, "step": 93 }, { "epoch": 0.0025249811969485334, "grad_norm": 1.6090619564056396, "learning_rate": 9.999999999511394e-05, "loss": 3.5618, "step": 94 }, { "epoch": 0.0025518426990437304, "grad_norm": 1.538235068321228, "learning_rate": 9.999999999500351e-05, "loss": 3.3743, "step": 95 }, { "epoch": 0.002578704201138928, "grad_norm": 1.647749900817871, "learning_rate": 9.999999999489187e-05, "loss": 3.77, "step": 96 }, { "epoch": 0.002605565703234125, "grad_norm": 1.4851956367492676, "learning_rate": 9.999999999477898e-05, "loss": 3.3571, "step": 97 }, { "epoch": 0.002632427205329322, "grad_norm": 1.6766194105148315, "learning_rate": 9.999999999466486e-05, "loss": 3.7478, "step": 98 }, { "epoch": 0.0026592887074245192, "grad_norm": 1.7860498428344727, "learning_rate": 9.999999999454952e-05, "loss": 3.7822, "step": 99 }, { "epoch": 0.0026861502095197162, "grad_norm": 1.7103192806243896, "learning_rate": 9.999999999443292e-05, "loss": 3.4418, "step": 100 }, { "epoch": 0.0027130117116149137, "grad_norm": 1.218453288078308, "learning_rate": 9.999999999431511e-05, "loss": 3.5775, "step": 101 }, { "epoch": 0.0027398732137101107, "grad_norm": 1.3814231157302856, "learning_rate": 9.999999999419605e-05, "loss": 3.5856, "step": 102 }, { "epoch": 0.0027667347158053077, "grad_norm": 1.2434844970703125, "learning_rate": 9.999999999407576e-05, "loss": 3.0429, "step": 103 }, { "epoch": 0.002793596217900505, "grad_norm": 1.2515223026275635, "learning_rate": 9.999999999395425e-05, "loss": 3.2445, "step": 104 }, { "epoch": 0.002820457719995702, "grad_norm": 1.14759361743927, "learning_rate": 9.999999999383149e-05, "loss": 3.1756, "step": 105 }, { "epoch": 0.002847319222090899, "grad_norm": 1.1015360355377197, "learning_rate": 9.99999999937075e-05, "loss": 2.8039, "step": 106 }, { "epoch": 0.0028741807241860966, "grad_norm": 1.5735284090042114, "learning_rate": 9.99999999935823e-05, "loss": 3.6006, "step": 107 }, { "epoch": 0.0029010422262812936, "grad_norm": 1.27744460105896, "learning_rate": 9.999999999345584e-05, "loss": 3.1294, "step": 108 }, { "epoch": 0.002927903728376491, "grad_norm": 1.2897201776504517, "learning_rate": 9.999999999332814e-05, "loss": 3.217, "step": 109 }, { "epoch": 0.002954765230471688, "grad_norm": 1.176347255706787, "learning_rate": 9.999999999319922e-05, "loss": 3.1679, "step": 110 }, { "epoch": 0.002981626732566885, "grad_norm": 1.3510855436325073, "learning_rate": 9.999999999306907e-05, "loss": 3.3798, "step": 111 }, { "epoch": 0.0030084882346620824, "grad_norm": 1.310192346572876, "learning_rate": 9.999999999293767e-05, "loss": 3.3365, "step": 112 }, { "epoch": 0.0030353497367572794, "grad_norm": 1.339492678642273, "learning_rate": 9.999999999280506e-05, "loss": 3.1089, "step": 113 }, { "epoch": 0.0030622112388524764, "grad_norm": 1.184141755104065, "learning_rate": 9.99999999926712e-05, "loss": 3.2258, "step": 114 }, { "epoch": 0.003089072740947674, "grad_norm": 1.149617314338684, "learning_rate": 9.999999999253611e-05, "loss": 2.82, "step": 115 }, { "epoch": 0.003115934243042871, "grad_norm": 1.402675747871399, "learning_rate": 9.99999999923998e-05, "loss": 3.486, "step": 116 }, { "epoch": 0.0031427957451380683, "grad_norm": 1.5151649713516235, "learning_rate": 9.999999999226224e-05, "loss": 3.4411, "step": 117 }, { "epoch": 0.0031696572472332653, "grad_norm": 1.33055579662323, "learning_rate": 9.999999999212344e-05, "loss": 3.2941, "step": 118 }, { "epoch": 0.0031965187493284623, "grad_norm": 1.2644603252410889, "learning_rate": 9.999999999198341e-05, "loss": 2.8132, "step": 119 }, { "epoch": 0.0032233802514236598, "grad_norm": 1.3672884702682495, "learning_rate": 9.999999999184216e-05, "loss": 3.1897, "step": 120 }, { "epoch": 0.0032502417535188568, "grad_norm": 1.3743053674697876, "learning_rate": 9.999999999169966e-05, "loss": 3.4031, "step": 121 }, { "epoch": 0.0032771032556140538, "grad_norm": 1.4384634494781494, "learning_rate": 9.999999999155594e-05, "loss": 3.4043, "step": 122 }, { "epoch": 0.003303964757709251, "grad_norm": 1.497501015663147, "learning_rate": 9.999999999141097e-05, "loss": 3.1186, "step": 123 }, { "epoch": 0.003330826259804448, "grad_norm": 1.39578378200531, "learning_rate": 9.999999999126478e-05, "loss": 3.4345, "step": 124 }, { "epoch": 0.0033576877618996456, "grad_norm": 1.3490184545516968, "learning_rate": 9.999999999111735e-05, "loss": 3.3325, "step": 125 }, { "epoch": 0.0033845492639948426, "grad_norm": 1.410489797592163, "learning_rate": 9.99999999909687e-05, "loss": 3.3977, "step": 126 }, { "epoch": 0.0034114107660900396, "grad_norm": 1.3900330066680908, "learning_rate": 9.999999999081879e-05, "loss": 3.4452, "step": 127 }, { "epoch": 0.003438272268185237, "grad_norm": 1.3278859853744507, "learning_rate": 9.999999999066767e-05, "loss": 3.15, "step": 128 }, { "epoch": 0.003465133770280434, "grad_norm": 1.5235861539840698, "learning_rate": 9.999999999051531e-05, "loss": 3.5005, "step": 129 }, { "epoch": 0.003491995272375631, "grad_norm": 1.5267044305801392, "learning_rate": 9.999999999036172e-05, "loss": 3.5311, "step": 130 }, { "epoch": 0.0035188567744708285, "grad_norm": 1.4808764457702637, "learning_rate": 9.999999999020688e-05, "loss": 3.5163, "step": 131 }, { "epoch": 0.0035457182765660255, "grad_norm": 1.438101887702942, "learning_rate": 9.999999999005082e-05, "loss": 3.2459, "step": 132 }, { "epoch": 0.003572579778661223, "grad_norm": 1.4126555919647217, "learning_rate": 9.999999998989352e-05, "loss": 3.3198, "step": 133 }, { "epoch": 0.00359944128075642, "grad_norm": 1.53532874584198, "learning_rate": 9.9999999989735e-05, "loss": 3.3744, "step": 134 }, { "epoch": 0.003626302782851617, "grad_norm": 1.4298460483551025, "learning_rate": 9.999999998957524e-05, "loss": 3.5602, "step": 135 }, { "epoch": 0.0036531642849468144, "grad_norm": 1.5221728086471558, "learning_rate": 9.999999998941423e-05, "loss": 3.4989, "step": 136 }, { "epoch": 0.0036800257870420114, "grad_norm": 1.5173331499099731, "learning_rate": 9.9999999989252e-05, "loss": 3.2325, "step": 137 }, { "epoch": 0.0037068872891372084, "grad_norm": 1.512844204902649, "learning_rate": 9.999999998908853e-05, "loss": 3.5864, "step": 138 }, { "epoch": 0.003733748791232406, "grad_norm": 1.5138494968414307, "learning_rate": 9.999999998892384e-05, "loss": 3.2989, "step": 139 }, { "epoch": 0.003760610293327603, "grad_norm": 1.484354019165039, "learning_rate": 9.999999998875791e-05, "loss": 3.2201, "step": 140 }, { "epoch": 0.0037874717954228, "grad_norm": 1.5164704322814941, "learning_rate": 9.999999998859073e-05, "loss": 3.5061, "step": 141 }, { "epoch": 0.0038143332975179973, "grad_norm": 1.4222763776779175, "learning_rate": 9.999999998842234e-05, "loss": 3.3215, "step": 142 }, { "epoch": 0.0038411947996131943, "grad_norm": 1.4848490953445435, "learning_rate": 9.99999999882527e-05, "loss": 3.2796, "step": 143 }, { "epoch": 0.0038680563017083917, "grad_norm": 1.4712228775024414, "learning_rate": 9.999999998808183e-05, "loss": 3.4786, "step": 144 }, { "epoch": 0.0038949178038035887, "grad_norm": 1.654551386833191, "learning_rate": 9.999999998790973e-05, "loss": 3.4385, "step": 145 }, { "epoch": 0.003921779305898786, "grad_norm": 1.503765344619751, "learning_rate": 9.999999998773639e-05, "loss": 3.3224, "step": 146 }, { "epoch": 0.003948640807993983, "grad_norm": 1.7049174308776855, "learning_rate": 9.999999998756183e-05, "loss": 3.589, "step": 147 }, { "epoch": 0.003975502310089181, "grad_norm": 1.5706239938735962, "learning_rate": 9.999999998738603e-05, "loss": 3.5896, "step": 148 }, { "epoch": 0.004002363812184377, "grad_norm": 1.7528107166290283, "learning_rate": 9.9999999987209e-05, "loss": 3.8007, "step": 149 }, { "epoch": 0.004029225314279575, "grad_norm": 1.8006619215011597, "learning_rate": 9.999999998703072e-05, "loss": 3.7126, "step": 150 }, { "epoch": 0.004056086816374772, "grad_norm": 1.501328706741333, "learning_rate": 9.999999998685121e-05, "loss": 3.5138, "step": 151 }, { "epoch": 0.004082948318469969, "grad_norm": 1.5186468362808228, "learning_rate": 9.999999998667048e-05, "loss": 2.9103, "step": 152 }, { "epoch": 0.004109809820565166, "grad_norm": 1.4595462083816528, "learning_rate": 9.99999999864885e-05, "loss": 3.4846, "step": 153 }, { "epoch": 0.0041366713226603635, "grad_norm": 1.4289087057113647, "learning_rate": 9.99999999863053e-05, "loss": 3.3445, "step": 154 }, { "epoch": 0.00416353282475556, "grad_norm": 1.2521617412567139, "learning_rate": 9.999999998612086e-05, "loss": 3.1745, "step": 155 }, { "epoch": 0.0041903943268507575, "grad_norm": 1.2283687591552734, "learning_rate": 9.999999998593519e-05, "loss": 2.9809, "step": 156 }, { "epoch": 0.004217255828945955, "grad_norm": 1.2780406475067139, "learning_rate": 9.999999998574829e-05, "loss": 3.2447, "step": 157 }, { "epoch": 0.0042441173310411515, "grad_norm": 1.2048006057739258, "learning_rate": 9.999999998556015e-05, "loss": 3.0817, "step": 158 }, { "epoch": 0.004270978833136349, "grad_norm": 4.955997467041016, "learning_rate": 9.999999998537077e-05, "loss": 2.7906, "step": 159 }, { "epoch": 0.004297840335231546, "grad_norm": 1.3788132667541504, "learning_rate": 9.999999998518017e-05, "loss": 3.2995, "step": 160 }, { "epoch": 0.004324701837326743, "grad_norm": 1.4886622428894043, "learning_rate": 9.999999998498832e-05, "loss": 3.0405, "step": 161 }, { "epoch": 0.00435156333942194, "grad_norm": 1.3938782215118408, "learning_rate": 9.999999998479526e-05, "loss": 3.1436, "step": 162 }, { "epoch": 0.004378424841517138, "grad_norm": 1.3613883256912231, "learning_rate": 9.999999998460095e-05, "loss": 2.8424, "step": 163 }, { "epoch": 0.004405286343612335, "grad_norm": 1.4670555591583252, "learning_rate": 9.999999998440541e-05, "loss": 3.4129, "step": 164 }, { "epoch": 0.004432147845707532, "grad_norm": 1.307345986366272, "learning_rate": 9.999999998420864e-05, "loss": 3.1923, "step": 165 }, { "epoch": 0.004459009347802729, "grad_norm": 1.3759874105453491, "learning_rate": 9.999999998401061e-05, "loss": 3.2051, "step": 166 }, { "epoch": 0.004485870849897927, "grad_norm": 1.3621212244033813, "learning_rate": 9.999999998381138e-05, "loss": 3.3061, "step": 167 }, { "epoch": 0.004512732351993123, "grad_norm": 1.3702600002288818, "learning_rate": 9.99999999836109e-05, "loss": 3.1421, "step": 168 }, { "epoch": 0.004539593854088321, "grad_norm": 1.4759025573730469, "learning_rate": 9.999999998340919e-05, "loss": 3.517, "step": 169 }, { "epoch": 0.004566455356183518, "grad_norm": 1.5046229362487793, "learning_rate": 9.999999998320626e-05, "loss": 3.3313, "step": 170 }, { "epoch": 0.004593316858278715, "grad_norm": 1.4678044319152832, "learning_rate": 9.999999998300207e-05, "loss": 3.32, "step": 171 }, { "epoch": 0.004620178360373912, "grad_norm": 1.4216737747192383, "learning_rate": 9.999999998279666e-05, "loss": 2.9374, "step": 172 }, { "epoch": 0.0046470398624691095, "grad_norm": 1.5586341619491577, "learning_rate": 9.999999998259002e-05, "loss": 3.2402, "step": 173 }, { "epoch": 0.004673901364564306, "grad_norm": 1.494938850402832, "learning_rate": 9.999999998238214e-05, "loss": 3.0836, "step": 174 }, { "epoch": 0.0047007628666595035, "grad_norm": 1.4996769428253174, "learning_rate": 9.999999998217302e-05, "loss": 3.0837, "step": 175 }, { "epoch": 0.004727624368754701, "grad_norm": 1.468011736869812, "learning_rate": 9.999999998196268e-05, "loss": 3.2219, "step": 176 }, { "epoch": 0.0047544858708498975, "grad_norm": 1.4905102252960205, "learning_rate": 9.99999999817511e-05, "loss": 3.4999, "step": 177 }, { "epoch": 0.004781347372945095, "grad_norm": 1.6069837808609009, "learning_rate": 9.999999998153828e-05, "loss": 3.3729, "step": 178 }, { "epoch": 0.004808208875040292, "grad_norm": 1.5482449531555176, "learning_rate": 9.999999998132423e-05, "loss": 3.5421, "step": 179 }, { "epoch": 0.00483507037713549, "grad_norm": 1.5595391988754272, "learning_rate": 9.999999998110896e-05, "loss": 3.5452, "step": 180 }, { "epoch": 0.004861931879230686, "grad_norm": 1.505689263343811, "learning_rate": 9.999999998089245e-05, "loss": 3.4279, "step": 181 }, { "epoch": 0.004888793381325884, "grad_norm": 1.5317928791046143, "learning_rate": 9.999999998067468e-05, "loss": 3.1678, "step": 182 }, { "epoch": 0.004915654883421081, "grad_norm": 1.6715329885482788, "learning_rate": 9.99999999804557e-05, "loss": 3.8234, "step": 183 }, { "epoch": 0.004942516385516278, "grad_norm": 1.5834782123565674, "learning_rate": 9.999999998023549e-05, "loss": 3.4183, "step": 184 }, { "epoch": 0.004969377887611475, "grad_norm": 1.5933635234832764, "learning_rate": 9.999999998001405e-05, "loss": 3.1948, "step": 185 }, { "epoch": 0.004996239389706673, "grad_norm": 1.658353567123413, "learning_rate": 9.999999997979137e-05, "loss": 3.6501, "step": 186 }, { "epoch": 0.005023100891801869, "grad_norm": 1.5532222986221313, "learning_rate": 9.999999997956744e-05, "loss": 3.4074, "step": 187 }, { "epoch": 0.005049962393897067, "grad_norm": 1.7190356254577637, "learning_rate": 9.99999999793423e-05, "loss": 3.7267, "step": 188 }, { "epoch": 0.005076823895992264, "grad_norm": 1.4863312244415283, "learning_rate": 9.999999997911591e-05, "loss": 3.3407, "step": 189 }, { "epoch": 0.005103685398087461, "grad_norm": 1.798384428024292, "learning_rate": 9.99999999788883e-05, "loss": 3.5679, "step": 190 }, { "epoch": 0.005130546900182658, "grad_norm": 1.6841833591461182, "learning_rate": 9.999999997865944e-05, "loss": 3.2323, "step": 191 }, { "epoch": 0.005157408402277856, "grad_norm": 1.7028827667236328, "learning_rate": 9.999999997842934e-05, "loss": 3.6724, "step": 192 }, { "epoch": 0.005184269904373052, "grad_norm": 1.623820424079895, "learning_rate": 9.999999997819804e-05, "loss": 3.4745, "step": 193 }, { "epoch": 0.00521113140646825, "grad_norm": 1.8269304037094116, "learning_rate": 9.999999997796548e-05, "loss": 3.839, "step": 194 }, { "epoch": 0.005237992908563447, "grad_norm": 1.530657410621643, "learning_rate": 9.999999997773171e-05, "loss": 3.4414, "step": 195 }, { "epoch": 0.005264854410658644, "grad_norm": 1.580862045288086, "learning_rate": 9.999999997749667e-05, "loss": 3.2415, "step": 196 }, { "epoch": 0.005291715912753841, "grad_norm": 1.924195408821106, "learning_rate": 9.999999997726042e-05, "loss": 3.3448, "step": 197 }, { "epoch": 0.0053185774148490385, "grad_norm": 1.864040493965149, "learning_rate": 9.999999997702294e-05, "loss": 3.5684, "step": 198 }, { "epoch": 0.005345438916944236, "grad_norm": 1.8680702447891235, "learning_rate": 9.999999997678421e-05, "loss": 3.6054, "step": 199 }, { "epoch": 0.0053723004190394325, "grad_norm": 1.8232849836349487, "learning_rate": 9.999999997654425e-05, "loss": 3.4409, "step": 200 }, { "epoch": 0.00539916192113463, "grad_norm": 1.6235407590866089, "learning_rate": 9.999999997630307e-05, "loss": 3.2162, "step": 201 }, { "epoch": 0.005426023423229827, "grad_norm": 1.53889799118042, "learning_rate": 9.999999997606064e-05, "loss": 3.0395, "step": 202 }, { "epoch": 0.005452884925325024, "grad_norm": 1.5615484714508057, "learning_rate": 9.9999999975817e-05, "loss": 3.2514, "step": 203 }, { "epoch": 0.005479746427420221, "grad_norm": 1.3152657747268677, "learning_rate": 9.99999999755721e-05, "loss": 2.9712, "step": 204 }, { "epoch": 0.005506607929515419, "grad_norm": 1.3440253734588623, "learning_rate": 9.999999997532599e-05, "loss": 3.1026, "step": 205 }, { "epoch": 0.005533469431610615, "grad_norm": 1.3807686567306519, "learning_rate": 9.999999997507863e-05, "loss": 3.3082, "step": 206 }, { "epoch": 0.005560330933705813, "grad_norm": 1.3441410064697266, "learning_rate": 9.999999997483002e-05, "loss": 3.2609, "step": 207 }, { "epoch": 0.00558719243580101, "grad_norm": 1.3188197612762451, "learning_rate": 9.999999997458021e-05, "loss": 2.8421, "step": 208 }, { "epoch": 0.005614053937896207, "grad_norm": 1.566558599472046, "learning_rate": 9.999999997432915e-05, "loss": 3.1037, "step": 209 }, { "epoch": 0.005640915439991404, "grad_norm": 1.3136351108551025, "learning_rate": 9.999999997407685e-05, "loss": 3.0866, "step": 210 }, { "epoch": 0.005667776942086602, "grad_norm": 1.2555242776870728, "learning_rate": 9.999999997382334e-05, "loss": 2.9204, "step": 211 }, { "epoch": 0.005694638444181798, "grad_norm": 1.334439754486084, "learning_rate": 9.999999997356857e-05, "loss": 3.1215, "step": 212 }, { "epoch": 0.005721499946276996, "grad_norm": 1.4756368398666382, "learning_rate": 9.999999997331257e-05, "loss": 3.4476, "step": 213 }, { "epoch": 0.005748361448372193, "grad_norm": 1.2498222589492798, "learning_rate": 9.999999997305536e-05, "loss": 2.86, "step": 214 }, { "epoch": 0.0057752229504673906, "grad_norm": 1.3437496423721313, "learning_rate": 9.99999999727969e-05, "loss": 3.1574, "step": 215 }, { "epoch": 0.005802084452562587, "grad_norm": 1.3369652032852173, "learning_rate": 9.999999997253719e-05, "loss": 3.0527, "step": 216 }, { "epoch": 0.0058289459546577846, "grad_norm": 1.3278571367263794, "learning_rate": 9.999999997227626e-05, "loss": 3.1739, "step": 217 }, { "epoch": 0.005855807456752982, "grad_norm": 1.4438539743423462, "learning_rate": 9.999999997201412e-05, "loss": 3.35, "step": 218 }, { "epoch": 0.0058826689588481786, "grad_norm": 1.4686214923858643, "learning_rate": 9.999999997175071e-05, "loss": 3.2776, "step": 219 }, { "epoch": 0.005909530460943376, "grad_norm": 1.4738445281982422, "learning_rate": 9.999999997148608e-05, "loss": 3.2764, "step": 220 }, { "epoch": 0.005936391963038573, "grad_norm": 1.3048352003097534, "learning_rate": 9.999999997122022e-05, "loss": 3.007, "step": 221 }, { "epoch": 0.00596325346513377, "grad_norm": 1.4230749607086182, "learning_rate": 9.999999997095313e-05, "loss": 3.0977, "step": 222 }, { "epoch": 0.0059901149672289674, "grad_norm": 1.462730884552002, "learning_rate": 9.99999999706848e-05, "loss": 3.3268, "step": 223 }, { "epoch": 0.006016976469324165, "grad_norm": 1.6287310123443604, "learning_rate": 9.999999997041523e-05, "loss": 3.3482, "step": 224 }, { "epoch": 0.0060438379714193614, "grad_norm": 1.577708125114441, "learning_rate": 9.999999997014444e-05, "loss": 3.2947, "step": 225 }, { "epoch": 0.006070699473514559, "grad_norm": 1.4368644952774048, "learning_rate": 9.99999999698724e-05, "loss": 3.1511, "step": 226 }, { "epoch": 0.006097560975609756, "grad_norm": 1.7501544952392578, "learning_rate": 9.999999996959914e-05, "loss": 3.6171, "step": 227 }, { "epoch": 0.006124422477704953, "grad_norm": 1.531460165977478, "learning_rate": 9.999999996932465e-05, "loss": 3.3854, "step": 228 }, { "epoch": 0.00615128397980015, "grad_norm": 1.4792691469192505, "learning_rate": 9.99999999690489e-05, "loss": 3.2036, "step": 229 }, { "epoch": 0.006178145481895348, "grad_norm": 1.4752593040466309, "learning_rate": 9.999999996877195e-05, "loss": 3.0966, "step": 230 }, { "epoch": 0.006205006983990545, "grad_norm": 1.560064435005188, "learning_rate": 9.999999996849374e-05, "loss": 3.2259, "step": 231 }, { "epoch": 0.006231868486085742, "grad_norm": 1.5049961805343628, "learning_rate": 9.999999996821431e-05, "loss": 3.255, "step": 232 }, { "epoch": 0.006258729988180939, "grad_norm": 1.6931719779968262, "learning_rate": 9.999999996793364e-05, "loss": 3.2948, "step": 233 }, { "epoch": 0.006285591490276137, "grad_norm": 1.6067354679107666, "learning_rate": 9.999999996765175e-05, "loss": 3.3771, "step": 234 }, { "epoch": 0.006312452992371333, "grad_norm": 1.5622678995132446, "learning_rate": 9.999999996736861e-05, "loss": 3.2915, "step": 235 }, { "epoch": 0.006339314494466531, "grad_norm": 1.6007894277572632, "learning_rate": 9.999999996708425e-05, "loss": 3.4445, "step": 236 }, { "epoch": 0.006366175996561728, "grad_norm": 1.4729998111724854, "learning_rate": 9.999999996679864e-05, "loss": 3.3731, "step": 237 }, { "epoch": 0.006393037498656925, "grad_norm": 1.5431838035583496, "learning_rate": 9.99999999665118e-05, "loss": 3.2531, "step": 238 }, { "epoch": 0.006419899000752122, "grad_norm": 1.636124849319458, "learning_rate": 9.999999996622373e-05, "loss": 3.5058, "step": 239 }, { "epoch": 0.0064467605028473195, "grad_norm": 1.5989148616790771, "learning_rate": 9.999999996593443e-05, "loss": 3.2455, "step": 240 }, { "epoch": 0.006473622004942516, "grad_norm": 1.642051339149475, "learning_rate": 9.999999996564389e-05, "loss": 3.1518, "step": 241 }, { "epoch": 0.0065004835070377135, "grad_norm": 1.5991356372833252, "learning_rate": 9.999999996535212e-05, "loss": 3.1274, "step": 242 }, { "epoch": 0.006527345009132911, "grad_norm": 1.601693868637085, "learning_rate": 9.999999996505912e-05, "loss": 3.584, "step": 243 }, { "epoch": 0.0065542065112281075, "grad_norm": 1.5522338151931763, "learning_rate": 9.999999996476488e-05, "loss": 3.3002, "step": 244 }, { "epoch": 0.006581068013323305, "grad_norm": 1.7774641513824463, "learning_rate": 9.99999999644694e-05, "loss": 3.7639, "step": 245 }, { "epoch": 0.006607929515418502, "grad_norm": 1.7426048517227173, "learning_rate": 9.99999999641727e-05, "loss": 3.6212, "step": 246 }, { "epoch": 0.006634791017513699, "grad_norm": 1.7567298412322998, "learning_rate": 9.999999996387477e-05, "loss": 3.5256, "step": 247 }, { "epoch": 0.006661652519608896, "grad_norm": 1.7526336908340454, "learning_rate": 9.99999999635756e-05, "loss": 3.5622, "step": 248 }, { "epoch": 0.006688514021704094, "grad_norm": 1.787500262260437, "learning_rate": 9.999999996327518e-05, "loss": 3.5157, "step": 249 }, { "epoch": 0.006715375523799291, "grad_norm": 2.0876896381378174, "learning_rate": 9.999999996297355e-05, "loss": 3.3864, "step": 250 }, { "epoch": 0.006742237025894488, "grad_norm": 1.5887773036956787, "learning_rate": 9.999999996267068e-05, "loss": 2.6411, "step": 251 }, { "epoch": 0.006769098527989685, "grad_norm": 1.7187421321868896, "learning_rate": 9.999999996236657e-05, "loss": 2.9251, "step": 252 }, { "epoch": 0.006795960030084883, "grad_norm": 1.5112552642822266, "learning_rate": 9.999999996206122e-05, "loss": 3.1039, "step": 253 }, { "epoch": 0.006822821532180079, "grad_norm": 1.426690697669983, "learning_rate": 9.999999996175465e-05, "loss": 3.2291, "step": 254 }, { "epoch": 0.006849683034275277, "grad_norm": 1.2940804958343506, "learning_rate": 9.999999996144685e-05, "loss": 2.9074, "step": 255 }, { "epoch": 0.006876544536370474, "grad_norm": 1.4958122968673706, "learning_rate": 9.999999996113781e-05, "loss": 3.4205, "step": 256 }, { "epoch": 0.006903406038465671, "grad_norm": 1.314099669456482, "learning_rate": 9.999999996082752e-05, "loss": 3.1381, "step": 257 }, { "epoch": 0.006930267540560868, "grad_norm": 1.276431679725647, "learning_rate": 9.9999999960516e-05, "loss": 3.1831, "step": 258 }, { "epoch": 0.006957129042656066, "grad_norm": 1.3868881464004517, "learning_rate": 9.999999996020327e-05, "loss": 3.0969, "step": 259 }, { "epoch": 0.006983990544751262, "grad_norm": 1.3772238492965698, "learning_rate": 9.99999999598893e-05, "loss": 3.3114, "step": 260 }, { "epoch": 0.00701085204684646, "grad_norm": 1.3541710376739502, "learning_rate": 9.999999995957409e-05, "loss": 2.8867, "step": 261 }, { "epoch": 0.007037713548941657, "grad_norm": 1.481025218963623, "learning_rate": 9.999999995925764e-05, "loss": 3.2412, "step": 262 }, { "epoch": 0.007064575051036854, "grad_norm": 1.4574871063232422, "learning_rate": 9.999999995893995e-05, "loss": 3.1844, "step": 263 }, { "epoch": 0.007091436553132051, "grad_norm": 1.5566786527633667, "learning_rate": 9.999999995862105e-05, "loss": 3.2533, "step": 264 }, { "epoch": 0.0071182980552272485, "grad_norm": 1.298317551612854, "learning_rate": 9.999999995830091e-05, "loss": 3.1958, "step": 265 }, { "epoch": 0.007145159557322446, "grad_norm": 1.3086849451065063, "learning_rate": 9.999999995797953e-05, "loss": 3.1858, "step": 266 }, { "epoch": 0.0071720210594176425, "grad_norm": 1.4598180055618286, "learning_rate": 9.999999995765692e-05, "loss": 3.3412, "step": 267 }, { "epoch": 0.00719888256151284, "grad_norm": 1.3468729257583618, "learning_rate": 9.999999995733307e-05, "loss": 3.1933, "step": 268 }, { "epoch": 0.007225744063608037, "grad_norm": 1.5812733173370361, "learning_rate": 9.9999999957008e-05, "loss": 3.0884, "step": 269 }, { "epoch": 0.007252605565703234, "grad_norm": 1.3265537023544312, "learning_rate": 9.999999995668168e-05, "loss": 2.8339, "step": 270 }, { "epoch": 0.007279467067798431, "grad_norm": 1.4566750526428223, "learning_rate": 9.999999995635412e-05, "loss": 3.0427, "step": 271 }, { "epoch": 0.007306328569893629, "grad_norm": 1.4008580446243286, "learning_rate": 9.999999995602533e-05, "loss": 2.9818, "step": 272 }, { "epoch": 0.007333190071988825, "grad_norm": 1.4401434659957886, "learning_rate": 9.999999995569533e-05, "loss": 3.1373, "step": 273 }, { "epoch": 0.007360051574084023, "grad_norm": 1.3396170139312744, "learning_rate": 9.999999995536409e-05, "loss": 3.1087, "step": 274 }, { "epoch": 0.00738691307617922, "grad_norm": 1.440423846244812, "learning_rate": 9.99999999550316e-05, "loss": 3.2071, "step": 275 }, { "epoch": 0.007413774578274417, "grad_norm": 1.3632419109344482, "learning_rate": 9.999999995469789e-05, "loss": 3.024, "step": 276 }, { "epoch": 0.007440636080369614, "grad_norm": 1.4464221000671387, "learning_rate": 9.999999995436293e-05, "loss": 3.0848, "step": 277 }, { "epoch": 0.007467497582464812, "grad_norm": 1.3937352895736694, "learning_rate": 9.999999995402674e-05, "loss": 3.1426, "step": 278 }, { "epoch": 0.007494359084560008, "grad_norm": 1.3715002536773682, "learning_rate": 9.999999995368934e-05, "loss": 3.1451, "step": 279 }, { "epoch": 0.007521220586655206, "grad_norm": 1.6965054273605347, "learning_rate": 9.999999995335067e-05, "loss": 3.4319, "step": 280 }, { "epoch": 0.007548082088750403, "grad_norm": 1.5740978717803955, "learning_rate": 9.999999995301079e-05, "loss": 3.4204, "step": 281 }, { "epoch": 0.0075749435908456, "grad_norm": 1.5966700315475464, "learning_rate": 9.999999995266967e-05, "loss": 3.3832, "step": 282 }, { "epoch": 0.007601805092940797, "grad_norm": 1.4009469747543335, "learning_rate": 9.999999995232731e-05, "loss": 3.0976, "step": 283 }, { "epoch": 0.0076286665950359945, "grad_norm": 1.580001950263977, "learning_rate": 9.999999995198374e-05, "loss": 3.0962, "step": 284 }, { "epoch": 0.007655528097131192, "grad_norm": 1.5711950063705444, "learning_rate": 9.999999995163891e-05, "loss": 3.2862, "step": 285 }, { "epoch": 0.0076823895992263885, "grad_norm": 1.6567752361297607, "learning_rate": 9.999999995129287e-05, "loss": 3.387, "step": 286 }, { "epoch": 0.007709251101321586, "grad_norm": 1.5363513231277466, "learning_rate": 9.999999995094558e-05, "loss": 3.3296, "step": 287 }, { "epoch": 0.007736112603416783, "grad_norm": 1.628709316253662, "learning_rate": 9.999999995059706e-05, "loss": 3.4314, "step": 288 }, { "epoch": 0.00776297410551198, "grad_norm": 1.683313250541687, "learning_rate": 9.999999995024731e-05, "loss": 3.2991, "step": 289 }, { "epoch": 0.007789835607607177, "grad_norm": 1.7183477878570557, "learning_rate": 9.999999994989631e-05, "loss": 3.366, "step": 290 }, { "epoch": 0.007816697109702375, "grad_norm": 1.825942039489746, "learning_rate": 9.99999999495441e-05, "loss": 3.933, "step": 291 }, { "epoch": 0.007843558611797571, "grad_norm": 1.5922329425811768, "learning_rate": 9.999999994919063e-05, "loss": 3.3191, "step": 292 }, { "epoch": 0.00787042011389277, "grad_norm": 1.6675676107406616, "learning_rate": 9.999999994883595e-05, "loss": 3.7358, "step": 293 }, { "epoch": 0.007897281615987966, "grad_norm": 1.6281962394714355, "learning_rate": 9.999999994848003e-05, "loss": 3.1607, "step": 294 }, { "epoch": 0.007924143118083163, "grad_norm": 1.5776563882827759, "learning_rate": 9.999999994812287e-05, "loss": 3.4825, "step": 295 }, { "epoch": 0.007951004620178361, "grad_norm": 1.6171278953552246, "learning_rate": 9.999999994776447e-05, "loss": 3.285, "step": 296 }, { "epoch": 0.007977866122273558, "grad_norm": 1.6754804849624634, "learning_rate": 9.999999994740486e-05, "loss": 3.4483, "step": 297 }, { "epoch": 0.008004727624368754, "grad_norm": 1.6712759733200073, "learning_rate": 9.999999994704399e-05, "loss": 3.578, "step": 298 }, { "epoch": 0.008031589126463953, "grad_norm": 1.7725811004638672, "learning_rate": 9.999999994668191e-05, "loss": 3.4371, "step": 299 }, { "epoch": 0.00805845062855915, "grad_norm": 1.9692758321762085, "learning_rate": 9.99999999463186e-05, "loss": 3.8032, "step": 300 }, { "epoch": 0.008085312130654346, "grad_norm": 1.2866451740264893, "learning_rate": 9.999999994595403e-05, "loss": 3.2611, "step": 301 }, { "epoch": 0.008112173632749544, "grad_norm": 1.4625946283340454, "learning_rate": 9.999999994558823e-05, "loss": 3.0714, "step": 302 }, { "epoch": 0.00813903513484474, "grad_norm": 1.3855098485946655, "learning_rate": 9.99999999452212e-05, "loss": 3.1154, "step": 303 }, { "epoch": 0.008165896636939937, "grad_norm": 1.3074418306350708, "learning_rate": 9.999999994485294e-05, "loss": 3.2195, "step": 304 }, { "epoch": 0.008192758139035135, "grad_norm": 1.3705579042434692, "learning_rate": 9.999999994448346e-05, "loss": 3.1222, "step": 305 }, { "epoch": 0.008219619641130332, "grad_norm": 1.3016101121902466, "learning_rate": 9.999999994411273e-05, "loss": 3.0081, "step": 306 }, { "epoch": 0.008246481143225529, "grad_norm": 1.5367039442062378, "learning_rate": 9.999999994374077e-05, "loss": 3.3542, "step": 307 }, { "epoch": 0.008273342645320727, "grad_norm": 1.352465271949768, "learning_rate": 9.999999994336757e-05, "loss": 3.2663, "step": 308 }, { "epoch": 0.008300204147415923, "grad_norm": 1.0709216594696045, "learning_rate": 9.999999994299314e-05, "loss": 2.5144, "step": 309 }, { "epoch": 0.00832706564951112, "grad_norm": 1.2512985467910767, "learning_rate": 9.999999994261748e-05, "loss": 2.9406, "step": 310 }, { "epoch": 0.008353927151606318, "grad_norm": 1.4661740064620972, "learning_rate": 9.999999994224058e-05, "loss": 3.3997, "step": 311 }, { "epoch": 0.008380788653701515, "grad_norm": 1.3103265762329102, "learning_rate": 9.999999994186246e-05, "loss": 3.0447, "step": 312 }, { "epoch": 0.008407650155796711, "grad_norm": 1.5357261896133423, "learning_rate": 9.999999994148309e-05, "loss": 3.2413, "step": 313 }, { "epoch": 0.00843451165789191, "grad_norm": 1.3907216787338257, "learning_rate": 9.99999999411025e-05, "loss": 3.1204, "step": 314 }, { "epoch": 0.008461373159987106, "grad_norm": 1.3356871604919434, "learning_rate": 9.999999994072067e-05, "loss": 3.0059, "step": 315 }, { "epoch": 0.008488234662082303, "grad_norm": 1.3362263441085815, "learning_rate": 9.999999994033759e-05, "loss": 3.1497, "step": 316 }, { "epoch": 0.008515096164177501, "grad_norm": 1.3796724081039429, "learning_rate": 9.999999993995331e-05, "loss": 3.1882, "step": 317 }, { "epoch": 0.008541957666272698, "grad_norm": 1.4778923988342285, "learning_rate": 9.999999993956777e-05, "loss": 2.8893, "step": 318 }, { "epoch": 0.008568819168367894, "grad_norm": 1.4544243812561035, "learning_rate": 9.9999999939181e-05, "loss": 3.1249, "step": 319 }, { "epoch": 0.008595680670463093, "grad_norm": 1.3993685245513916, "learning_rate": 9.999999993879301e-05, "loss": 3.0868, "step": 320 }, { "epoch": 0.00862254217255829, "grad_norm": 1.455314040184021, "learning_rate": 9.999999993840377e-05, "loss": 3.2488, "step": 321 }, { "epoch": 0.008649403674653486, "grad_norm": 1.535010576248169, "learning_rate": 9.99999999380133e-05, "loss": 3.211, "step": 322 }, { "epoch": 0.008676265176748684, "grad_norm": 1.4369255304336548, "learning_rate": 9.999999993762161e-05, "loss": 3.268, "step": 323 }, { "epoch": 0.00870312667884388, "grad_norm": 2.008056163787842, "learning_rate": 9.999999993722867e-05, "loss": 2.9857, "step": 324 }, { "epoch": 0.008729988180939077, "grad_norm": 1.3951343297958374, "learning_rate": 9.999999993683451e-05, "loss": 3.2106, "step": 325 }, { "epoch": 0.008756849683034276, "grad_norm": 1.472520112991333, "learning_rate": 9.99999999364391e-05, "loss": 3.181, "step": 326 }, { "epoch": 0.008783711185129472, "grad_norm": 1.491974949836731, "learning_rate": 9.999999993604247e-05, "loss": 3.2049, "step": 327 }, { "epoch": 0.00881057268722467, "grad_norm": 1.5334579944610596, "learning_rate": 9.99999999356446e-05, "loss": 3.3579, "step": 328 }, { "epoch": 0.008837434189319867, "grad_norm": 1.509615421295166, "learning_rate": 9.999999993524549e-05, "loss": 3.5056, "step": 329 }, { "epoch": 0.008864295691415064, "grad_norm": 1.575596570968628, "learning_rate": 9.999999993484517e-05, "loss": 3.1586, "step": 330 }, { "epoch": 0.008891157193510262, "grad_norm": 1.5231512784957886, "learning_rate": 9.99999999344436e-05, "loss": 3.0471, "step": 331 }, { "epoch": 0.008918018695605458, "grad_norm": 1.5751467943191528, "learning_rate": 9.999999993404079e-05, "loss": 3.3389, "step": 332 }, { "epoch": 0.008944880197700655, "grad_norm": 1.436456561088562, "learning_rate": 9.999999993363676e-05, "loss": 3.0194, "step": 333 }, { "epoch": 0.008971741699795853, "grad_norm": 1.59831702709198, "learning_rate": 9.999999993323147e-05, "loss": 3.112, "step": 334 }, { "epoch": 0.00899860320189105, "grad_norm": 1.468543529510498, "learning_rate": 9.999999993282498e-05, "loss": 3.3302, "step": 335 }, { "epoch": 0.009025464703986246, "grad_norm": 1.5483806133270264, "learning_rate": 9.999999993241724e-05, "loss": 3.2404, "step": 336 }, { "epoch": 0.009052326206081445, "grad_norm": 1.6208584308624268, "learning_rate": 9.999999993200826e-05, "loss": 3.0803, "step": 337 }, { "epoch": 0.009079187708176641, "grad_norm": 1.4745221138000488, "learning_rate": 9.999999993159806e-05, "loss": 3.4383, "step": 338 }, { "epoch": 0.009106049210271838, "grad_norm": 1.5517362356185913, "learning_rate": 9.999999993118663e-05, "loss": 3.2555, "step": 339 }, { "epoch": 0.009132910712367036, "grad_norm": 1.5948114395141602, "learning_rate": 9.999999993077394e-05, "loss": 3.1166, "step": 340 }, { "epoch": 0.009159772214462233, "grad_norm": 1.6069583892822266, "learning_rate": 9.999999993036005e-05, "loss": 3.3697, "step": 341 }, { "epoch": 0.00918663371655743, "grad_norm": 1.6950137615203857, "learning_rate": 9.99999999299449e-05, "loss": 3.2143, "step": 342 }, { "epoch": 0.009213495218652628, "grad_norm": 1.7324154376983643, "learning_rate": 9.999999992952853e-05, "loss": 3.6276, "step": 343 }, { "epoch": 0.009240356720747824, "grad_norm": 1.6985573768615723, "learning_rate": 9.999999992911092e-05, "loss": 3.3153, "step": 344 }, { "epoch": 0.00926721822284302, "grad_norm": 1.7411456108093262, "learning_rate": 9.999999992869208e-05, "loss": 3.7364, "step": 345 }, { "epoch": 0.009294079724938219, "grad_norm": 1.7979718446731567, "learning_rate": 9.999999992827201e-05, "loss": 3.3799, "step": 346 }, { "epoch": 0.009320941227033416, "grad_norm": 1.7107107639312744, "learning_rate": 9.99999999278507e-05, "loss": 3.3509, "step": 347 }, { "epoch": 0.009347802729128612, "grad_norm": 1.7422338724136353, "learning_rate": 9.999999992742815e-05, "loss": 3.2159, "step": 348 }, { "epoch": 0.00937466423122381, "grad_norm": 1.9913054704666138, "learning_rate": 9.999999992700437e-05, "loss": 3.5324, "step": 349 }, { "epoch": 0.009401525733319007, "grad_norm": 2.0499820709228516, "learning_rate": 9.999999992657935e-05, "loss": 3.2073, "step": 350 }, { "epoch": 0.009428387235414204, "grad_norm": 1.312639832496643, "learning_rate": 9.999999992615312e-05, "loss": 3.1846, "step": 351 }, { "epoch": 0.009455248737509402, "grad_norm": 1.534393310546875, "learning_rate": 9.999999992572564e-05, "loss": 3.1861, "step": 352 }, { "epoch": 0.009482110239604599, "grad_norm": 1.592322587966919, "learning_rate": 9.999999992529694e-05, "loss": 3.202, "step": 353 }, { "epoch": 0.009508971741699795, "grad_norm": 1.6027579307556152, "learning_rate": 9.999999992486699e-05, "loss": 3.3655, "step": 354 }, { "epoch": 0.009535833243794993, "grad_norm": 1.460989236831665, "learning_rate": 9.99999999244358e-05, "loss": 3.0499, "step": 355 }, { "epoch": 0.00956269474589019, "grad_norm": 1.4667208194732666, "learning_rate": 9.99999999240034e-05, "loss": 3.0652, "step": 356 }, { "epoch": 0.009589556247985387, "grad_norm": 1.4265388250350952, "learning_rate": 9.999999992356974e-05, "loss": 3.0453, "step": 357 }, { "epoch": 0.009616417750080585, "grad_norm": 1.4427202939987183, "learning_rate": 9.999999992313487e-05, "loss": 3.1117, "step": 358 }, { "epoch": 0.009643279252175781, "grad_norm": 1.3827509880065918, "learning_rate": 9.999999992269875e-05, "loss": 3.0258, "step": 359 }, { "epoch": 0.00967014075427098, "grad_norm": 3.8355908393859863, "learning_rate": 9.999999992226141e-05, "loss": 3.1564, "step": 360 }, { "epoch": 0.009697002256366176, "grad_norm": 1.4933533668518066, "learning_rate": 9.999999992182283e-05, "loss": 3.2878, "step": 361 }, { "epoch": 0.009723863758461373, "grad_norm": 1.4010118246078491, "learning_rate": 9.999999992138301e-05, "loss": 2.9805, "step": 362 }, { "epoch": 0.009750725260556571, "grad_norm": 1.441159725189209, "learning_rate": 9.999999992094196e-05, "loss": 3.1448, "step": 363 }, { "epoch": 0.009777586762651768, "grad_norm": 1.6072075366973877, "learning_rate": 9.999999992049969e-05, "loss": 3.1391, "step": 364 }, { "epoch": 0.009804448264746964, "grad_norm": 1.3306028842926025, "learning_rate": 9.999999992005618e-05, "loss": 2.9318, "step": 365 }, { "epoch": 0.009831309766842163, "grad_norm": 1.4072214365005493, "learning_rate": 9.999999991961141e-05, "loss": 3.0531, "step": 366 }, { "epoch": 0.00985817126893736, "grad_norm": 1.4834871292114258, "learning_rate": 9.999999991916544e-05, "loss": 2.9282, "step": 367 }, { "epoch": 0.009885032771032556, "grad_norm": 1.5989537239074707, "learning_rate": 9.999999991871822e-05, "loss": 3.3957, "step": 368 }, { "epoch": 0.009911894273127754, "grad_norm": 1.4718068838119507, "learning_rate": 9.999999991826977e-05, "loss": 3.3333, "step": 369 }, { "epoch": 0.00993875577522295, "grad_norm": 1.4870935678482056, "learning_rate": 9.999999991782008e-05, "loss": 3.3692, "step": 370 }, { "epoch": 0.009965617277318147, "grad_norm": 1.3331711292266846, "learning_rate": 9.999999991736916e-05, "loss": 3.0298, "step": 371 }, { "epoch": 0.009992478779413345, "grad_norm": 1.485838770866394, "learning_rate": 9.999999991691702e-05, "loss": 3.2719, "step": 372 }, { "epoch": 0.010019340281508542, "grad_norm": 1.5010319948196411, "learning_rate": 9.999999991646362e-05, "loss": 3.0529, "step": 373 }, { "epoch": 0.010046201783603739, "grad_norm": 1.4825512170791626, "learning_rate": 9.999999991600902e-05, "loss": 3.0555, "step": 374 }, { "epoch": 0.010073063285698937, "grad_norm": 1.4355058670043945, "learning_rate": 9.999999991555315e-05, "loss": 2.9217, "step": 375 }, { "epoch": 0.010099924787794133, "grad_norm": 1.448727011680603, "learning_rate": 9.999999991509608e-05, "loss": 3.1127, "step": 376 }, { "epoch": 0.01012678628988933, "grad_norm": 1.4077942371368408, "learning_rate": 9.999999991463776e-05, "loss": 3.0063, "step": 377 }, { "epoch": 0.010153647791984528, "grad_norm": 1.4879511594772339, "learning_rate": 9.999999991417819e-05, "loss": 3.0618, "step": 378 }, { "epoch": 0.010180509294079725, "grad_norm": 1.4629594087600708, "learning_rate": 9.999999991371741e-05, "loss": 3.0791, "step": 379 }, { "epoch": 0.010207370796174921, "grad_norm": 1.5846166610717773, "learning_rate": 9.999999991325539e-05, "loss": 3.3179, "step": 380 }, { "epoch": 0.01023423229827012, "grad_norm": 1.6092251539230347, "learning_rate": 9.999999991279214e-05, "loss": 3.1621, "step": 381 }, { "epoch": 0.010261093800365316, "grad_norm": 1.3157117366790771, "learning_rate": 9.999999991232765e-05, "loss": 3.0647, "step": 382 }, { "epoch": 0.010287955302460513, "grad_norm": 1.5434112548828125, "learning_rate": 9.999999991186193e-05, "loss": 3.1533, "step": 383 }, { "epoch": 0.010314816804555711, "grad_norm": 1.445216178894043, "learning_rate": 9.999999991139496e-05, "loss": 3.2086, "step": 384 }, { "epoch": 0.010341678306650908, "grad_norm": 1.4799890518188477, "learning_rate": 9.999999991092678e-05, "loss": 3.0183, "step": 385 }, { "epoch": 0.010368539808746104, "grad_norm": 1.5840139389038086, "learning_rate": 9.999999991045736e-05, "loss": 3.3566, "step": 386 }, { "epoch": 0.010395401310841303, "grad_norm": 1.7577760219573975, "learning_rate": 9.99999999099867e-05, "loss": 3.687, "step": 387 }, { "epoch": 0.0104222628129365, "grad_norm": 1.7453808784484863, "learning_rate": 9.999999990951481e-05, "loss": 3.4915, "step": 388 }, { "epoch": 0.010449124315031696, "grad_norm": 1.5384831428527832, "learning_rate": 9.999999990904169e-05, "loss": 3.2543, "step": 389 }, { "epoch": 0.010475985817126894, "grad_norm": 1.5846140384674072, "learning_rate": 9.999999990856732e-05, "loss": 3.5913, "step": 390 }, { "epoch": 0.01050284731922209, "grad_norm": 1.6113022565841675, "learning_rate": 9.999999990809174e-05, "loss": 3.2361, "step": 391 }, { "epoch": 0.010529708821317287, "grad_norm": 1.611128330230713, "learning_rate": 9.999999990761491e-05, "loss": 3.3629, "step": 392 }, { "epoch": 0.010556570323412486, "grad_norm": 1.6922285556793213, "learning_rate": 9.999999990713684e-05, "loss": 3.4664, "step": 393 }, { "epoch": 0.010583431825507682, "grad_norm": 1.6885075569152832, "learning_rate": 9.999999990665756e-05, "loss": 3.3561, "step": 394 }, { "epoch": 0.01061029332760288, "grad_norm": 1.6231778860092163, "learning_rate": 9.999999990617704e-05, "loss": 3.2889, "step": 395 }, { "epoch": 0.010637154829698077, "grad_norm": 1.790924310684204, "learning_rate": 9.999999990569526e-05, "loss": 3.7997, "step": 396 }, { "epoch": 0.010664016331793274, "grad_norm": 1.620592713356018, "learning_rate": 9.999999990521227e-05, "loss": 3.109, "step": 397 }, { "epoch": 0.010690877833888472, "grad_norm": 1.6597546339035034, "learning_rate": 9.999999990472804e-05, "loss": 3.4153, "step": 398 }, { "epoch": 0.010717739335983668, "grad_norm": 2.0343093872070312, "learning_rate": 9.999999990424258e-05, "loss": 3.6979, "step": 399 }, { "epoch": 0.010744600838078865, "grad_norm": 1.827853798866272, "learning_rate": 9.999999990375589e-05, "loss": 3.4915, "step": 400 }, { "epoch": 0.010771462340174063, "grad_norm": 1.4068386554718018, "learning_rate": 9.999999990326796e-05, "loss": 3.139, "step": 401 }, { "epoch": 0.01079832384226926, "grad_norm": 1.4641603231430054, "learning_rate": 9.99999999027788e-05, "loss": 3.0299, "step": 402 }, { "epoch": 0.010825185344364456, "grad_norm": 1.3369860649108887, "learning_rate": 9.999999990228841e-05, "loss": 2.801, "step": 403 }, { "epoch": 0.010852046846459655, "grad_norm": 1.3419314622879028, "learning_rate": 9.999999990179678e-05, "loss": 2.8149, "step": 404 }, { "epoch": 0.010878908348554851, "grad_norm": 1.5593358278274536, "learning_rate": 9.999999990130392e-05, "loss": 3.4785, "step": 405 }, { "epoch": 0.010905769850650048, "grad_norm": 1.4393575191497803, "learning_rate": 9.999999990080982e-05, "loss": 3.0239, "step": 406 }, { "epoch": 0.010932631352745246, "grad_norm": 1.4988009929656982, "learning_rate": 9.999999990031447e-05, "loss": 3.2712, "step": 407 }, { "epoch": 0.010959492854840443, "grad_norm": 1.4504315853118896, "learning_rate": 9.999999989981792e-05, "loss": 3.1714, "step": 408 }, { "epoch": 0.01098635435693564, "grad_norm": 1.4853525161743164, "learning_rate": 9.999999989932012e-05, "loss": 3.418, "step": 409 }, { "epoch": 0.011013215859030838, "grad_norm": 1.2990763187408447, "learning_rate": 9.999999989882109e-05, "loss": 2.9569, "step": 410 }, { "epoch": 0.011040077361126034, "grad_norm": 1.3418400287628174, "learning_rate": 9.999999989832083e-05, "loss": 3.0375, "step": 411 }, { "epoch": 0.01106693886322123, "grad_norm": 1.4683629274368286, "learning_rate": 9.999999989781932e-05, "loss": 3.2594, "step": 412 }, { "epoch": 0.011093800365316429, "grad_norm": 1.411016583442688, "learning_rate": 9.999999989731659e-05, "loss": 3.2121, "step": 413 }, { "epoch": 0.011120661867411626, "grad_norm": 1.4058409929275513, "learning_rate": 9.999999989681262e-05, "loss": 3.0771, "step": 414 }, { "epoch": 0.011147523369506822, "grad_norm": 1.346800684928894, "learning_rate": 9.999999989630741e-05, "loss": 2.9472, "step": 415 }, { "epoch": 0.01117438487160202, "grad_norm": 1.4477880001068115, "learning_rate": 9.999999989580099e-05, "loss": 3.0263, "step": 416 }, { "epoch": 0.011201246373697217, "grad_norm": 1.4545361995697021, "learning_rate": 9.999999989529332e-05, "loss": 3.2445, "step": 417 }, { "epoch": 0.011228107875792414, "grad_norm": 1.5225719213485718, "learning_rate": 9.999999989478442e-05, "loss": 3.1295, "step": 418 }, { "epoch": 0.011254969377887612, "grad_norm": 1.4374668598175049, "learning_rate": 9.999999989427429e-05, "loss": 3.0734, "step": 419 }, { "epoch": 0.011281830879982808, "grad_norm": 1.5443471670150757, "learning_rate": 9.999999989376292e-05, "loss": 2.9032, "step": 420 }, { "epoch": 0.011308692382078005, "grad_norm": 1.4011187553405762, "learning_rate": 9.999999989325031e-05, "loss": 2.7283, "step": 421 }, { "epoch": 0.011335553884173203, "grad_norm": 1.5621442794799805, "learning_rate": 9.999999989273647e-05, "loss": 3.0155, "step": 422 }, { "epoch": 0.0113624153862684, "grad_norm": 1.4946808815002441, "learning_rate": 9.999999989222139e-05, "loss": 3.2491, "step": 423 }, { "epoch": 0.011389276888363596, "grad_norm": 1.5118460655212402, "learning_rate": 9.99999998917051e-05, "loss": 2.917, "step": 424 }, { "epoch": 0.011416138390458795, "grad_norm": 1.5413126945495605, "learning_rate": 9.999999989118755e-05, "loss": 3.2541, "step": 425 }, { "epoch": 0.011442999892553991, "grad_norm": 1.6237248182296753, "learning_rate": 9.99999998906688e-05, "loss": 3.2308, "step": 426 }, { "epoch": 0.011469861394649188, "grad_norm": 1.5881716012954712, "learning_rate": 9.999999989014879e-05, "loss": 3.1807, "step": 427 }, { "epoch": 0.011496722896744386, "grad_norm": 1.5346410274505615, "learning_rate": 9.999999988962755e-05, "loss": 3.1304, "step": 428 }, { "epoch": 0.011523584398839583, "grad_norm": 1.464072346687317, "learning_rate": 9.999999988910507e-05, "loss": 3.0906, "step": 429 }, { "epoch": 0.011550445900934781, "grad_norm": 1.582999587059021, "learning_rate": 9.999999988858137e-05, "loss": 3.0908, "step": 430 }, { "epoch": 0.011577307403029978, "grad_norm": 1.4797403812408447, "learning_rate": 9.999999988805643e-05, "loss": 3.1175, "step": 431 }, { "epoch": 0.011604168905125174, "grad_norm": 1.5761067867279053, "learning_rate": 9.999999988753027e-05, "loss": 3.3944, "step": 432 }, { "epoch": 0.011631030407220373, "grad_norm": 1.6513606309890747, "learning_rate": 9.999999988700286e-05, "loss": 3.3121, "step": 433 }, { "epoch": 0.011657891909315569, "grad_norm": 1.7659647464752197, "learning_rate": 9.99999998864742e-05, "loss": 3.277, "step": 434 }, { "epoch": 0.011684753411410766, "grad_norm": 1.5365813970565796, "learning_rate": 9.999999988594434e-05, "loss": 3.2994, "step": 435 }, { "epoch": 0.011711614913505964, "grad_norm": 1.5114079713821411, "learning_rate": 9.999999988541323e-05, "loss": 3.1791, "step": 436 }, { "epoch": 0.01173847641560116, "grad_norm": 1.73617422580719, "learning_rate": 9.999999988488088e-05, "loss": 3.5268, "step": 437 }, { "epoch": 0.011765337917696357, "grad_norm": 1.7058979272842407, "learning_rate": 9.99999998843473e-05, "loss": 3.1726, "step": 438 }, { "epoch": 0.011792199419791555, "grad_norm": 1.5015861988067627, "learning_rate": 9.99999998838125e-05, "loss": 3.2026, "step": 439 }, { "epoch": 0.011819060921886752, "grad_norm": 1.6483575105667114, "learning_rate": 9.999999988327645e-05, "loss": 3.3578, "step": 440 }, { "epoch": 0.011845922423981949, "grad_norm": 1.7205007076263428, "learning_rate": 9.999999988273918e-05, "loss": 3.5342, "step": 441 }, { "epoch": 0.011872783926077147, "grad_norm": 1.6439756155014038, "learning_rate": 9.999999988220067e-05, "loss": 3.3699, "step": 442 }, { "epoch": 0.011899645428172343, "grad_norm": 1.710311770439148, "learning_rate": 9.999999988166092e-05, "loss": 3.5054, "step": 443 }, { "epoch": 0.01192650693026754, "grad_norm": 1.7583166360855103, "learning_rate": 9.999999988111995e-05, "loss": 3.4383, "step": 444 }, { "epoch": 0.011953368432362738, "grad_norm": 1.5195735692977905, "learning_rate": 9.999999988057774e-05, "loss": 2.9549, "step": 445 }, { "epoch": 0.011980229934457935, "grad_norm": 1.7824205160140991, "learning_rate": 9.999999988003428e-05, "loss": 3.1322, "step": 446 }, { "epoch": 0.012007091436553131, "grad_norm": 1.7998090982437134, "learning_rate": 9.99999998794896e-05, "loss": 3.6423, "step": 447 }, { "epoch": 0.01203395293864833, "grad_norm": 1.6461634635925293, "learning_rate": 9.99999998789437e-05, "loss": 3.4975, "step": 448 }, { "epoch": 0.012060814440743526, "grad_norm": 1.8779922723770142, "learning_rate": 9.999999987839654e-05, "loss": 3.4399, "step": 449 }, { "epoch": 0.012087675942838723, "grad_norm": 1.8773419857025146, "learning_rate": 9.999999987784816e-05, "loss": 3.4724, "step": 450 }, { "epoch": 0.012114537444933921, "grad_norm": 1.4512723684310913, "learning_rate": 9.999999987729855e-05, "loss": 3.4861, "step": 451 }, { "epoch": 0.012141398947029118, "grad_norm": 1.4454329013824463, "learning_rate": 9.999999987674771e-05, "loss": 2.9476, "step": 452 }, { "epoch": 0.012168260449124314, "grad_norm": 1.5218937397003174, "learning_rate": 9.999999987619562e-05, "loss": 3.3391, "step": 453 }, { "epoch": 0.012195121951219513, "grad_norm": 1.4006235599517822, "learning_rate": 9.99999998756423e-05, "loss": 3.1386, "step": 454 }, { "epoch": 0.01222198345331471, "grad_norm": 1.5272748470306396, "learning_rate": 9.999999987508777e-05, "loss": 3.2105, "step": 455 }, { "epoch": 0.012248844955409906, "grad_norm": 1.381325125694275, "learning_rate": 9.999999987453198e-05, "loss": 3.0307, "step": 456 }, { "epoch": 0.012275706457505104, "grad_norm": 1.5673829317092896, "learning_rate": 9.999999987397497e-05, "loss": 3.2447, "step": 457 }, { "epoch": 0.0123025679596003, "grad_norm": 1.5407352447509766, "learning_rate": 9.99999998734167e-05, "loss": 3.0539, "step": 458 }, { "epoch": 0.012329429461695497, "grad_norm": 1.4541857242584229, "learning_rate": 9.999999987285723e-05, "loss": 3.212, "step": 459 }, { "epoch": 0.012356290963790695, "grad_norm": 1.3955122232437134, "learning_rate": 9.99999998722965e-05, "loss": 2.8864, "step": 460 }, { "epoch": 0.012383152465885892, "grad_norm": 1.4555715322494507, "learning_rate": 9.999999987173456e-05, "loss": 3.1965, "step": 461 }, { "epoch": 0.01241001396798109, "grad_norm": 1.4746215343475342, "learning_rate": 9.999999987117138e-05, "loss": 3.0635, "step": 462 }, { "epoch": 0.012436875470076287, "grad_norm": 1.5263087749481201, "learning_rate": 9.999999987060695e-05, "loss": 2.9506, "step": 463 }, { "epoch": 0.012463736972171484, "grad_norm": 1.4185150861740112, "learning_rate": 9.999999987004131e-05, "loss": 2.6964, "step": 464 }, { "epoch": 0.012490598474266682, "grad_norm": 1.4453799724578857, "learning_rate": 9.999999986947441e-05, "loss": 3.1973, "step": 465 }, { "epoch": 0.012517459976361878, "grad_norm": 1.4354561567306519, "learning_rate": 9.99999998689063e-05, "loss": 2.9152, "step": 466 }, { "epoch": 0.012544321478457075, "grad_norm": 1.3852994441986084, "learning_rate": 9.999999986833695e-05, "loss": 2.7374, "step": 467 }, { "epoch": 0.012571182980552273, "grad_norm": 1.4055900573730469, "learning_rate": 9.999999986776636e-05, "loss": 2.9589, "step": 468 }, { "epoch": 0.01259804448264747, "grad_norm": 1.6043518781661987, "learning_rate": 9.999999986719455e-05, "loss": 2.8888, "step": 469 }, { "epoch": 0.012624905984742666, "grad_norm": 1.423304796218872, "learning_rate": 9.999999986662148e-05, "loss": 2.9067, "step": 470 }, { "epoch": 0.012651767486837865, "grad_norm": 1.4667428731918335, "learning_rate": 9.99999998660472e-05, "loss": 3.1209, "step": 471 }, { "epoch": 0.012678628988933061, "grad_norm": 1.458530068397522, "learning_rate": 9.999999986547168e-05, "loss": 3.1869, "step": 472 }, { "epoch": 0.012705490491028258, "grad_norm": 1.5382295846939087, "learning_rate": 9.999999986489492e-05, "loss": 2.9852, "step": 473 }, { "epoch": 0.012732351993123456, "grad_norm": 1.6067780256271362, "learning_rate": 9.999999986431694e-05, "loss": 3.4255, "step": 474 }, { "epoch": 0.012759213495218653, "grad_norm": 1.3856332302093506, "learning_rate": 9.999999986373772e-05, "loss": 2.9723, "step": 475 }, { "epoch": 0.01278607499731385, "grad_norm": 1.409085750579834, "learning_rate": 9.999999986315725e-05, "loss": 2.9678, "step": 476 }, { "epoch": 0.012812936499409048, "grad_norm": 1.5785013437271118, "learning_rate": 9.999999986257556e-05, "loss": 3.0543, "step": 477 }, { "epoch": 0.012839798001504244, "grad_norm": 1.5739150047302246, "learning_rate": 9.999999986199264e-05, "loss": 3.0962, "step": 478 }, { "epoch": 0.01286665950359944, "grad_norm": 1.715063452720642, "learning_rate": 9.999999986140849e-05, "loss": 3.4156, "step": 479 }, { "epoch": 0.012893521005694639, "grad_norm": 1.5047529935836792, "learning_rate": 9.999999986082309e-05, "loss": 3.2267, "step": 480 }, { "epoch": 0.012920382507789836, "grad_norm": 1.5376882553100586, "learning_rate": 9.999999986023646e-05, "loss": 3.3192, "step": 481 }, { "epoch": 0.012947244009885032, "grad_norm": 1.6161760091781616, "learning_rate": 9.999999985964861e-05, "loss": 3.1084, "step": 482 }, { "epoch": 0.01297410551198023, "grad_norm": 1.5804671049118042, "learning_rate": 9.999999985905952e-05, "loss": 3.1579, "step": 483 }, { "epoch": 0.013000967014075427, "grad_norm": 1.6535561084747314, "learning_rate": 9.99999998584692e-05, "loss": 3.1284, "step": 484 }, { "epoch": 0.013027828516170624, "grad_norm": 1.6472439765930176, "learning_rate": 9.999999985787762e-05, "loss": 3.2893, "step": 485 }, { "epoch": 0.013054690018265822, "grad_norm": 1.5657492876052856, "learning_rate": 9.999999985728483e-05, "loss": 3.3764, "step": 486 }, { "epoch": 0.013081551520361018, "grad_norm": 1.6238974332809448, "learning_rate": 9.999999985669081e-05, "loss": 3.3152, "step": 487 }, { "epoch": 0.013108413022456215, "grad_norm": 1.579158067703247, "learning_rate": 9.999999985609554e-05, "loss": 3.5067, "step": 488 }, { "epoch": 0.013135274524551413, "grad_norm": 1.7080647945404053, "learning_rate": 9.999999985549906e-05, "loss": 3.1746, "step": 489 }, { "epoch": 0.01316213602664661, "grad_norm": 1.7463475465774536, "learning_rate": 9.999999985490132e-05, "loss": 3.3698, "step": 490 }, { "epoch": 0.013188997528741806, "grad_norm": 1.751397967338562, "learning_rate": 9.999999985430236e-05, "loss": 3.5693, "step": 491 }, { "epoch": 0.013215859030837005, "grad_norm": 1.6995598077774048, "learning_rate": 9.999999985370217e-05, "loss": 3.4366, "step": 492 }, { "epoch": 0.013242720532932201, "grad_norm": 1.7562099695205688, "learning_rate": 9.999999985310073e-05, "loss": 3.5061, "step": 493 }, { "epoch": 0.013269582035027398, "grad_norm": 1.6325514316558838, "learning_rate": 9.999999985249808e-05, "loss": 3.2136, "step": 494 }, { "epoch": 0.013296443537122596, "grad_norm": 1.7133991718292236, "learning_rate": 9.999999985189418e-05, "loss": 3.5505, "step": 495 }, { "epoch": 0.013323305039217793, "grad_norm": 1.8352411985397339, "learning_rate": 9.999999985128905e-05, "loss": 3.5474, "step": 496 }, { "epoch": 0.013350166541312991, "grad_norm": 1.6895605325698853, "learning_rate": 9.999999985068269e-05, "loss": 3.2428, "step": 497 }, { "epoch": 0.013377028043408188, "grad_norm": 1.8699710369110107, "learning_rate": 9.999999985007509e-05, "loss": 3.3019, "step": 498 }, { "epoch": 0.013403889545503384, "grad_norm": 1.8091951608657837, "learning_rate": 9.999999984946626e-05, "loss": 3.3801, "step": 499 }, { "epoch": 0.013430751047598583, "grad_norm": 1.845009207725525, "learning_rate": 9.999999984885619e-05, "loss": 3.4538, "step": 500 }, { "epoch": 0.013457612549693779, "grad_norm": 1.5051531791687012, "learning_rate": 9.999999984824489e-05, "loss": 3.2036, "step": 501 }, { "epoch": 0.013484474051788976, "grad_norm": 1.5916658639907837, "learning_rate": 9.999999984763236e-05, "loss": 2.9292, "step": 502 }, { "epoch": 0.013511335553884174, "grad_norm": 1.5836458206176758, "learning_rate": 9.99999998470186e-05, "loss": 3.3527, "step": 503 }, { "epoch": 0.01353819705597937, "grad_norm": 1.5160671472549438, "learning_rate": 9.999999984640359e-05, "loss": 2.9836, "step": 504 }, { "epoch": 0.013565058558074567, "grad_norm": 1.4878536462783813, "learning_rate": 9.999999984578736e-05, "loss": 3.1274, "step": 505 }, { "epoch": 0.013591920060169765, "grad_norm": 1.5014201402664185, "learning_rate": 9.999999984516989e-05, "loss": 2.9355, "step": 506 }, { "epoch": 0.013618781562264962, "grad_norm": 1.4217443466186523, "learning_rate": 9.999999984455119e-05, "loss": 3.0484, "step": 507 }, { "epoch": 0.013645643064360159, "grad_norm": 1.5473932027816772, "learning_rate": 9.999999984393126e-05, "loss": 3.2235, "step": 508 }, { "epoch": 0.013672504566455357, "grad_norm": 1.4834808111190796, "learning_rate": 9.999999984331009e-05, "loss": 3.1755, "step": 509 }, { "epoch": 0.013699366068550553, "grad_norm": 1.636918306350708, "learning_rate": 9.999999984268769e-05, "loss": 3.3078, "step": 510 }, { "epoch": 0.01372622757064575, "grad_norm": 1.439376711845398, "learning_rate": 9.999999984206406e-05, "loss": 3.0135, "step": 511 }, { "epoch": 0.013753089072740948, "grad_norm": 1.612460732460022, "learning_rate": 9.999999984143918e-05, "loss": 3.1161, "step": 512 }, { "epoch": 0.013779950574836145, "grad_norm": 1.2844470739364624, "learning_rate": 9.999999984081308e-05, "loss": 2.8637, "step": 513 }, { "epoch": 0.013806812076931341, "grad_norm": 1.4732033014297485, "learning_rate": 9.999999984018573e-05, "loss": 3.1481, "step": 514 }, { "epoch": 0.01383367357902654, "grad_norm": 1.3747831583023071, "learning_rate": 9.999999983955717e-05, "loss": 2.9905, "step": 515 }, { "epoch": 0.013860535081121736, "grad_norm": 1.550026297569275, "learning_rate": 9.999999983892737e-05, "loss": 3.2305, "step": 516 }, { "epoch": 0.013887396583216933, "grad_norm": 1.468211054801941, "learning_rate": 9.999999983829633e-05, "loss": 2.9263, "step": 517 }, { "epoch": 0.013914258085312131, "grad_norm": 1.5542832612991333, "learning_rate": 9.999999983766405e-05, "loss": 3.0559, "step": 518 }, { "epoch": 0.013941119587407328, "grad_norm": 1.5159426927566528, "learning_rate": 9.999999983703054e-05, "loss": 2.8209, "step": 519 }, { "epoch": 0.013967981089502524, "grad_norm": 1.4121785163879395, "learning_rate": 9.999999983639581e-05, "loss": 3.2127, "step": 520 }, { "epoch": 0.013994842591597723, "grad_norm": 1.531212329864502, "learning_rate": 9.999999983575983e-05, "loss": 3.0407, "step": 521 }, { "epoch": 0.01402170409369292, "grad_norm": 1.4030855894088745, "learning_rate": 9.999999983512263e-05, "loss": 2.8646, "step": 522 }, { "epoch": 0.014048565595788116, "grad_norm": 1.542206883430481, "learning_rate": 9.999999983448418e-05, "loss": 3.1509, "step": 523 }, { "epoch": 0.014075427097883314, "grad_norm": 1.414257526397705, "learning_rate": 9.999999983384452e-05, "loss": 3.0755, "step": 524 }, { "epoch": 0.01410228859997851, "grad_norm": 1.5378986597061157, "learning_rate": 9.99999998332036e-05, "loss": 3.0178, "step": 525 }, { "epoch": 0.014129150102073707, "grad_norm": 1.573044776916504, "learning_rate": 9.999999983256146e-05, "loss": 3.1776, "step": 526 }, { "epoch": 0.014156011604168905, "grad_norm": 1.5487502813339233, "learning_rate": 9.99999998319181e-05, "loss": 3.3601, "step": 527 }, { "epoch": 0.014182873106264102, "grad_norm": 1.622823715209961, "learning_rate": 9.999999983127348e-05, "loss": 3.1433, "step": 528 }, { "epoch": 0.014209734608359299, "grad_norm": 1.4159866571426392, "learning_rate": 9.999999983062765e-05, "loss": 3.0213, "step": 529 }, { "epoch": 0.014236596110454497, "grad_norm": 1.5231095552444458, "learning_rate": 9.999999982998056e-05, "loss": 3.0939, "step": 530 }, { "epoch": 0.014263457612549693, "grad_norm": 1.5016103982925415, "learning_rate": 9.999999982933226e-05, "loss": 3.3296, "step": 531 }, { "epoch": 0.014290319114644892, "grad_norm": 1.5221531391143799, "learning_rate": 9.999999982868271e-05, "loss": 3.2847, "step": 532 }, { "epoch": 0.014317180616740088, "grad_norm": 1.6948050260543823, "learning_rate": 9.999999982803193e-05, "loss": 3.1667, "step": 533 }, { "epoch": 0.014344042118835285, "grad_norm": 1.4808393716812134, "learning_rate": 9.999999982737992e-05, "loss": 2.9595, "step": 534 }, { "epoch": 0.014370903620930483, "grad_norm": 1.594460129737854, "learning_rate": 9.999999982672668e-05, "loss": 3.0397, "step": 535 }, { "epoch": 0.01439776512302568, "grad_norm": 1.7172738313674927, "learning_rate": 9.99999998260722e-05, "loss": 3.1509, "step": 536 }, { "epoch": 0.014424626625120876, "grad_norm": 1.5233323574066162, "learning_rate": 9.999999982541649e-05, "loss": 3.1804, "step": 537 }, { "epoch": 0.014451488127216075, "grad_norm": 1.71015465259552, "learning_rate": 9.999999982475954e-05, "loss": 3.2557, "step": 538 }, { "epoch": 0.014478349629311271, "grad_norm": 1.7213354110717773, "learning_rate": 9.999999982410135e-05, "loss": 3.474, "step": 539 }, { "epoch": 0.014505211131406468, "grad_norm": 1.5233113765716553, "learning_rate": 9.999999982344195e-05, "loss": 3.193, "step": 540 }, { "epoch": 0.014532072633501666, "grad_norm": 1.641194462776184, "learning_rate": 9.99999998227813e-05, "loss": 3.2334, "step": 541 }, { "epoch": 0.014558934135596863, "grad_norm": 1.7879207134246826, "learning_rate": 9.999999982211941e-05, "loss": 3.3306, "step": 542 }, { "epoch": 0.01458579563769206, "grad_norm": 1.6878842115402222, "learning_rate": 9.99999998214563e-05, "loss": 3.4826, "step": 543 }, { "epoch": 0.014612657139787258, "grad_norm": 1.5964375734329224, "learning_rate": 9.999999982079196e-05, "loss": 3.1481, "step": 544 }, { "epoch": 0.014639518641882454, "grad_norm": 1.7950385808944702, "learning_rate": 9.999999982012638e-05, "loss": 3.4581, "step": 545 }, { "epoch": 0.01466638014397765, "grad_norm": 1.717629075050354, "learning_rate": 9.999999981945955e-05, "loss": 3.1694, "step": 546 }, { "epoch": 0.014693241646072849, "grad_norm": 1.9837907552719116, "learning_rate": 9.999999981879151e-05, "loss": 3.4366, "step": 547 }, { "epoch": 0.014720103148168046, "grad_norm": 1.6364156007766724, "learning_rate": 9.999999981812223e-05, "loss": 3.1522, "step": 548 }, { "epoch": 0.014746964650263242, "grad_norm": 2.122551202774048, "learning_rate": 9.999999981745172e-05, "loss": 3.4088, "step": 549 }, { "epoch": 0.01477382615235844, "grad_norm": 2.0202836990356445, "learning_rate": 9.999999981677996e-05, "loss": 3.3569, "step": 550 }, { "epoch": 0.014800687654453637, "grad_norm": 1.4466760158538818, "learning_rate": 9.999999981610697e-05, "loss": 2.767, "step": 551 }, { "epoch": 0.014827549156548834, "grad_norm": 1.565651774406433, "learning_rate": 9.999999981543276e-05, "loss": 3.2593, "step": 552 }, { "epoch": 0.014854410658644032, "grad_norm": 1.4580707550048828, "learning_rate": 9.99999998147573e-05, "loss": 3.1369, "step": 553 }, { "epoch": 0.014881272160739228, "grad_norm": 1.5976176261901855, "learning_rate": 9.999999981408062e-05, "loss": 3.3074, "step": 554 }, { "epoch": 0.014908133662834425, "grad_norm": 1.3057708740234375, "learning_rate": 9.999999981340271e-05, "loss": 3.0269, "step": 555 }, { "epoch": 0.014934995164929623, "grad_norm": 1.2500848770141602, "learning_rate": 9.999999981272354e-05, "loss": 2.6988, "step": 556 }, { "epoch": 0.01496185666702482, "grad_norm": 1.383833408355713, "learning_rate": 9.999999981204316e-05, "loss": 3.1782, "step": 557 }, { "epoch": 0.014988718169120016, "grad_norm": 1.4894264936447144, "learning_rate": 9.999999981136154e-05, "loss": 3.0019, "step": 558 }, { "epoch": 0.015015579671215215, "grad_norm": 1.4194146394729614, "learning_rate": 9.99999998106787e-05, "loss": 2.7848, "step": 559 }, { "epoch": 0.015042441173310411, "grad_norm": 1.6750750541687012, "learning_rate": 9.99999998099946e-05, "loss": 2.9919, "step": 560 }, { "epoch": 0.015069302675405608, "grad_norm": 1.4904073476791382, "learning_rate": 9.999999980930928e-05, "loss": 2.9473, "step": 561 }, { "epoch": 0.015096164177500806, "grad_norm": 1.3552346229553223, "learning_rate": 9.999999980862274e-05, "loss": 2.9918, "step": 562 }, { "epoch": 0.015123025679596003, "grad_norm": 1.4699270725250244, "learning_rate": 9.999999980793495e-05, "loss": 3.0069, "step": 563 }, { "epoch": 0.0151498871816912, "grad_norm": 1.4307541847229004, "learning_rate": 9.999999980724592e-05, "loss": 3.0416, "step": 564 }, { "epoch": 0.015176748683786398, "grad_norm": 1.4240820407867432, "learning_rate": 9.999999980655565e-05, "loss": 3.0761, "step": 565 }, { "epoch": 0.015203610185881594, "grad_norm": 1.514182448387146, "learning_rate": 9.999999980586417e-05, "loss": 3.3871, "step": 566 }, { "epoch": 0.015230471687976792, "grad_norm": 1.4674021005630493, "learning_rate": 9.999999980517145e-05, "loss": 3.0696, "step": 567 }, { "epoch": 0.015257333190071989, "grad_norm": 1.5257642269134521, "learning_rate": 9.999999980447749e-05, "loss": 3.1464, "step": 568 }, { "epoch": 0.015284194692167186, "grad_norm": 1.277586579322815, "learning_rate": 9.99999998037823e-05, "loss": 2.8407, "step": 569 }, { "epoch": 0.015311056194262384, "grad_norm": 1.529081106185913, "learning_rate": 9.999999980308588e-05, "loss": 3.0011, "step": 570 }, { "epoch": 0.01533791769635758, "grad_norm": 1.4303085803985596, "learning_rate": 9.999999980238821e-05, "loss": 2.9766, "step": 571 }, { "epoch": 0.015364779198452777, "grad_norm": 1.4762227535247803, "learning_rate": 9.999999980168933e-05, "loss": 2.8978, "step": 572 }, { "epoch": 0.015391640700547975, "grad_norm": 1.5402470827102661, "learning_rate": 9.99999998009892e-05, "loss": 3.0596, "step": 573 }, { "epoch": 0.015418502202643172, "grad_norm": 1.673646092414856, "learning_rate": 9.999999980028785e-05, "loss": 3.0161, "step": 574 }, { "epoch": 0.015445363704738369, "grad_norm": 1.5393816232681274, "learning_rate": 9.999999979958525e-05, "loss": 2.9339, "step": 575 }, { "epoch": 0.015472225206833567, "grad_norm": 1.6110318899154663, "learning_rate": 9.999999979888141e-05, "loss": 3.0425, "step": 576 }, { "epoch": 0.015499086708928763, "grad_norm": 1.70695161819458, "learning_rate": 9.999999979817635e-05, "loss": 3.2202, "step": 577 }, { "epoch": 0.01552594821102396, "grad_norm": 1.6060736179351807, "learning_rate": 9.999999979747006e-05, "loss": 3.1667, "step": 578 }, { "epoch": 0.015552809713119158, "grad_norm": 1.648018717765808, "learning_rate": 9.999999979676254e-05, "loss": 3.1444, "step": 579 }, { "epoch": 0.015579671215214355, "grad_norm": 1.6265192031860352, "learning_rate": 9.999999979605378e-05, "loss": 3.0009, "step": 580 }, { "epoch": 0.015606532717309551, "grad_norm": 1.5863219499588013, "learning_rate": 9.999999979534378e-05, "loss": 3.4085, "step": 581 }, { "epoch": 0.01563339421940475, "grad_norm": 1.7096445560455322, "learning_rate": 9.999999979463256e-05, "loss": 3.1331, "step": 582 }, { "epoch": 0.015660255721499948, "grad_norm": 1.4504519701004028, "learning_rate": 9.99999997939201e-05, "loss": 3.0117, "step": 583 }, { "epoch": 0.015687117223595143, "grad_norm": 1.5081751346588135, "learning_rate": 9.99999997932064e-05, "loss": 3.2206, "step": 584 }, { "epoch": 0.01571397872569034, "grad_norm": 1.7219492197036743, "learning_rate": 9.999999979249146e-05, "loss": 3.0363, "step": 585 }, { "epoch": 0.01574084022778554, "grad_norm": 1.6603717803955078, "learning_rate": 9.99999997917753e-05, "loss": 3.2665, "step": 586 }, { "epoch": 0.015767701729880734, "grad_norm": 1.6417179107666016, "learning_rate": 9.99999997910579e-05, "loss": 2.9291, "step": 587 }, { "epoch": 0.015794563231975933, "grad_norm": 1.5284661054611206, "learning_rate": 9.999999979033927e-05, "loss": 3.2111, "step": 588 }, { "epoch": 0.01582142473407113, "grad_norm": 1.5470607280731201, "learning_rate": 9.999999978961941e-05, "loss": 2.9447, "step": 589 }, { "epoch": 0.015848286236166326, "grad_norm": 1.7079750299453735, "learning_rate": 9.99999997888983e-05, "loss": 3.3994, "step": 590 }, { "epoch": 0.015875147738261524, "grad_norm": 1.6720783710479736, "learning_rate": 9.999999978817598e-05, "loss": 3.1815, "step": 591 }, { "epoch": 0.015902009240356722, "grad_norm": 1.5435919761657715, "learning_rate": 9.999999978745241e-05, "loss": 2.8047, "step": 592 }, { "epoch": 0.015928870742451917, "grad_norm": 1.6838616132736206, "learning_rate": 9.999999978672761e-05, "loss": 3.1796, "step": 593 }, { "epoch": 0.015955732244547115, "grad_norm": 1.8546950817108154, "learning_rate": 9.999999978600159e-05, "loss": 3.3696, "step": 594 }, { "epoch": 0.015982593746642314, "grad_norm": 1.722764015197754, "learning_rate": 9.999999978527432e-05, "loss": 3.1535, "step": 595 }, { "epoch": 0.01600945524873751, "grad_norm": 1.6362978219985962, "learning_rate": 9.999999978454582e-05, "loss": 3.3235, "step": 596 }, { "epoch": 0.016036316750832707, "grad_norm": 1.7171472311019897, "learning_rate": 9.99999997838161e-05, "loss": 3.2714, "step": 597 }, { "epoch": 0.016063178252927905, "grad_norm": 1.8612757921218872, "learning_rate": 9.999999978308511e-05, "loss": 3.3938, "step": 598 }, { "epoch": 0.0160900397550231, "grad_norm": 2.0596330165863037, "learning_rate": 9.999999978235291e-05, "loss": 3.3465, "step": 599 }, { "epoch": 0.0161169012571183, "grad_norm": 1.7264341115951538, "learning_rate": 9.999999978161947e-05, "loss": 3.2179, "step": 600 }, { "epoch": 0.016143762759213497, "grad_norm": 1.4845199584960938, "learning_rate": 9.99999997808848e-05, "loss": 3.2596, "step": 601 }, { "epoch": 0.01617062426130869, "grad_norm": 1.5728617906570435, "learning_rate": 9.999999978014892e-05, "loss": 3.0341, "step": 602 }, { "epoch": 0.01619748576340389, "grad_norm": 1.4966508150100708, "learning_rate": 9.999999977941177e-05, "loss": 3.1194, "step": 603 }, { "epoch": 0.016224347265499088, "grad_norm": 1.623679518699646, "learning_rate": 9.99999997786734e-05, "loss": 2.9306, "step": 604 }, { "epoch": 0.016251208767594283, "grad_norm": 1.5571153163909912, "learning_rate": 9.99999997779338e-05, "loss": 3.1209, "step": 605 }, { "epoch": 0.01627807026968948, "grad_norm": 1.5426580905914307, "learning_rate": 9.999999977719296e-05, "loss": 3.2077, "step": 606 }, { "epoch": 0.01630493177178468, "grad_norm": 1.5416814088821411, "learning_rate": 9.999999977645089e-05, "loss": 3.2349, "step": 607 }, { "epoch": 0.016331793273879874, "grad_norm": 1.580945611000061, "learning_rate": 9.999999977570758e-05, "loss": 2.8518, "step": 608 }, { "epoch": 0.016358654775975073, "grad_norm": 1.3710176944732666, "learning_rate": 9.999999977496304e-05, "loss": 2.9742, "step": 609 }, { "epoch": 0.01638551627807027, "grad_norm": 1.4292913675308228, "learning_rate": 9.999999977421727e-05, "loss": 2.9606, "step": 610 }, { "epoch": 0.016412377780165466, "grad_norm": 1.366773247718811, "learning_rate": 9.999999977347026e-05, "loss": 2.9488, "step": 611 }, { "epoch": 0.016439239282260664, "grad_norm": 1.4914782047271729, "learning_rate": 9.999999977272204e-05, "loss": 3.087, "step": 612 }, { "epoch": 0.016466100784355862, "grad_norm": 1.4016969203948975, "learning_rate": 9.999999977197256e-05, "loss": 2.9716, "step": 613 }, { "epoch": 0.016492962286451057, "grad_norm": 1.3989452123641968, "learning_rate": 9.999999977122184e-05, "loss": 3.0153, "step": 614 }, { "epoch": 0.016519823788546256, "grad_norm": 1.3405635356903076, "learning_rate": 9.99999997704699e-05, "loss": 2.6192, "step": 615 }, { "epoch": 0.016546685290641454, "grad_norm": 1.4095908403396606, "learning_rate": 9.999999976971672e-05, "loss": 3.0062, "step": 616 }, { "epoch": 0.01657354679273665, "grad_norm": 1.4745733737945557, "learning_rate": 9.999999976896232e-05, "loss": 3.125, "step": 617 }, { "epoch": 0.016600408294831847, "grad_norm": 1.4414845705032349, "learning_rate": 9.999999976820667e-05, "loss": 2.9896, "step": 618 }, { "epoch": 0.016627269796927045, "grad_norm": 1.6898787021636963, "learning_rate": 9.99999997674498e-05, "loss": 3.1397, "step": 619 }, { "epoch": 0.01665413129902224, "grad_norm": 1.5939667224884033, "learning_rate": 9.99999997666917e-05, "loss": 3.0917, "step": 620 }, { "epoch": 0.01668099280111744, "grad_norm": 1.5486644506454468, "learning_rate": 9.999999976593236e-05, "loss": 3.0241, "step": 621 }, { "epoch": 0.016707854303212637, "grad_norm": 1.4304726123809814, "learning_rate": 9.999999976517178e-05, "loss": 3.0435, "step": 622 }, { "epoch": 0.01673471580530783, "grad_norm": 1.4396814107894897, "learning_rate": 9.999999976440997e-05, "loss": 3.3262, "step": 623 }, { "epoch": 0.01676157730740303, "grad_norm": 1.3595683574676514, "learning_rate": 9.999999976364691e-05, "loss": 2.9957, "step": 624 }, { "epoch": 0.016788438809498228, "grad_norm": 1.4666507244110107, "learning_rate": 9.999999976288263e-05, "loss": 3.1825, "step": 625 }, { "epoch": 0.016815300311593423, "grad_norm": 1.5233851671218872, "learning_rate": 9.999999976211713e-05, "loss": 3.266, "step": 626 }, { "epoch": 0.01684216181368862, "grad_norm": 1.5205273628234863, "learning_rate": 9.999999976135038e-05, "loss": 3.114, "step": 627 }, { "epoch": 0.01686902331578382, "grad_norm": 1.4155679941177368, "learning_rate": 9.99999997605824e-05, "loss": 3.0824, "step": 628 }, { "epoch": 0.016895884817879014, "grad_norm": 1.4930715560913086, "learning_rate": 9.999999975981319e-05, "loss": 3.0347, "step": 629 }, { "epoch": 0.016922746319974213, "grad_norm": 1.642045021057129, "learning_rate": 9.999999975904275e-05, "loss": 3.339, "step": 630 }, { "epoch": 0.01694960782206941, "grad_norm": 1.6314470767974854, "learning_rate": 9.999999975827107e-05, "loss": 3.4392, "step": 631 }, { "epoch": 0.016976469324164606, "grad_norm": 1.479723572731018, "learning_rate": 9.999999975749816e-05, "loss": 3.4092, "step": 632 }, { "epoch": 0.017003330826259804, "grad_norm": 1.4937212467193604, "learning_rate": 9.999999975672401e-05, "loss": 3.1713, "step": 633 }, { "epoch": 0.017030192328355002, "grad_norm": 1.5711121559143066, "learning_rate": 9.999999975594863e-05, "loss": 3.2504, "step": 634 }, { "epoch": 0.017057053830450197, "grad_norm": 1.7083232402801514, "learning_rate": 9.9999999755172e-05, "loss": 3.5004, "step": 635 }, { "epoch": 0.017083915332545396, "grad_norm": 1.6591018438339233, "learning_rate": 9.999999975439416e-05, "loss": 3.2098, "step": 636 }, { "epoch": 0.017110776834640594, "grad_norm": 1.6006439924240112, "learning_rate": 9.999999975361507e-05, "loss": 3.1391, "step": 637 }, { "epoch": 0.01713763833673579, "grad_norm": 1.6198042631149292, "learning_rate": 9.999999975283476e-05, "loss": 3.303, "step": 638 }, { "epoch": 0.017164499838830987, "grad_norm": 1.6083146333694458, "learning_rate": 9.999999975205321e-05, "loss": 3.3684, "step": 639 }, { "epoch": 0.017191361340926185, "grad_norm": 1.8380709886550903, "learning_rate": 9.999999975127043e-05, "loss": 3.5626, "step": 640 }, { "epoch": 0.01721822284302138, "grad_norm": 1.5533370971679688, "learning_rate": 9.999999975048642e-05, "loss": 3.2367, "step": 641 }, { "epoch": 0.01724508434511658, "grad_norm": 1.57891845703125, "learning_rate": 9.999999974970116e-05, "loss": 3.0813, "step": 642 }, { "epoch": 0.017271945847211777, "grad_norm": 1.556593418121338, "learning_rate": 9.999999974891468e-05, "loss": 3.3402, "step": 643 }, { "epoch": 0.01729880734930697, "grad_norm": 1.5907540321350098, "learning_rate": 9.999999974812697e-05, "loss": 3.3009, "step": 644 }, { "epoch": 0.01732566885140217, "grad_norm": 1.5855154991149902, "learning_rate": 9.9999999747338e-05, "loss": 3.4019, "step": 645 }, { "epoch": 0.017352530353497368, "grad_norm": 1.5891433954238892, "learning_rate": 9.999999974654781e-05, "loss": 3.3439, "step": 646 }, { "epoch": 0.017379391855592563, "grad_norm": 1.7367231845855713, "learning_rate": 9.99999997457564e-05, "loss": 3.4537, "step": 647 }, { "epoch": 0.01740625335768776, "grad_norm": 1.702995777130127, "learning_rate": 9.999999974496375e-05, "loss": 3.3761, "step": 648 }, { "epoch": 0.01743311485978296, "grad_norm": 1.714220643043518, "learning_rate": 9.999999974416986e-05, "loss": 3.4498, "step": 649 }, { "epoch": 0.017459976361878154, "grad_norm": 1.9178133010864258, "learning_rate": 9.999999974337474e-05, "loss": 3.5324, "step": 650 }, { "epoch": 0.017486837863973353, "grad_norm": 1.398617148399353, "learning_rate": 9.999999974257838e-05, "loss": 3.1473, "step": 651 }, { "epoch": 0.01751369936606855, "grad_norm": 1.5064873695373535, "learning_rate": 9.99999997417808e-05, "loss": 3.0855, "step": 652 }, { "epoch": 0.01754056086816375, "grad_norm": 1.5795984268188477, "learning_rate": 9.999999974098198e-05, "loss": 2.8589, "step": 653 }, { "epoch": 0.017567422370258944, "grad_norm": 1.505240797996521, "learning_rate": 9.999999974018193e-05, "loss": 2.99, "step": 654 }, { "epoch": 0.017594283872354143, "grad_norm": 1.434888243675232, "learning_rate": 9.999999973938064e-05, "loss": 2.9245, "step": 655 }, { "epoch": 0.01762114537444934, "grad_norm": 1.3589751720428467, "learning_rate": 9.999999973857811e-05, "loss": 2.6756, "step": 656 }, { "epoch": 0.017648006876544536, "grad_norm": 1.432234764099121, "learning_rate": 9.999999973777435e-05, "loss": 3.0035, "step": 657 }, { "epoch": 0.017674868378639734, "grad_norm": 1.450545072555542, "learning_rate": 9.999999973696936e-05, "loss": 3.1422, "step": 658 }, { "epoch": 0.017701729880734932, "grad_norm": 1.426239013671875, "learning_rate": 9.999999973616315e-05, "loss": 3.1334, "step": 659 }, { "epoch": 0.017728591382830127, "grad_norm": 1.541068196296692, "learning_rate": 9.999999973535567e-05, "loss": 3.0724, "step": 660 }, { "epoch": 0.017755452884925325, "grad_norm": 1.3603521585464478, "learning_rate": 9.9999999734547e-05, "loss": 2.9346, "step": 661 }, { "epoch": 0.017782314387020524, "grad_norm": 1.3395049571990967, "learning_rate": 9.999999973373705e-05, "loss": 3.0172, "step": 662 }, { "epoch": 0.01780917588911572, "grad_norm": 1.4169708490371704, "learning_rate": 9.999999973292591e-05, "loss": 3.08, "step": 663 }, { "epoch": 0.017836037391210917, "grad_norm": 1.4493162631988525, "learning_rate": 9.999999973211352e-05, "loss": 3.0241, "step": 664 }, { "epoch": 0.017862898893306115, "grad_norm": 1.4321480989456177, "learning_rate": 9.99999997312999e-05, "loss": 2.7355, "step": 665 }, { "epoch": 0.01788976039540131, "grad_norm": 1.4396843910217285, "learning_rate": 9.999999973048504e-05, "loss": 3.1973, "step": 666 }, { "epoch": 0.01791662189749651, "grad_norm": 1.5023831129074097, "learning_rate": 9.999999972966894e-05, "loss": 2.8972, "step": 667 }, { "epoch": 0.017943483399591707, "grad_norm": 1.574151635169983, "learning_rate": 9.999999972885161e-05, "loss": 3.25, "step": 668 }, { "epoch": 0.0179703449016869, "grad_norm": 1.382725477218628, "learning_rate": 9.999999972803304e-05, "loss": 2.8051, "step": 669 }, { "epoch": 0.0179972064037821, "grad_norm": 1.4401991367340088, "learning_rate": 9.999999972721326e-05, "loss": 3.0833, "step": 670 }, { "epoch": 0.018024067905877298, "grad_norm": 1.5565216541290283, "learning_rate": 9.999999972639222e-05, "loss": 2.9725, "step": 671 }, { "epoch": 0.018050929407972493, "grad_norm": 1.396734595298767, "learning_rate": 9.999999972556998e-05, "loss": 2.8631, "step": 672 }, { "epoch": 0.01807779091006769, "grad_norm": 1.5907701253890991, "learning_rate": 9.999999972474647e-05, "loss": 3.4372, "step": 673 }, { "epoch": 0.01810465241216289, "grad_norm": 1.479804515838623, "learning_rate": 9.999999972392173e-05, "loss": 2.9795, "step": 674 }, { "epoch": 0.018131513914258084, "grad_norm": 1.3441416025161743, "learning_rate": 9.999999972309577e-05, "loss": 2.8914, "step": 675 }, { "epoch": 0.018158375416353283, "grad_norm": 1.435198426246643, "learning_rate": 9.99999997222686e-05, "loss": 3.1491, "step": 676 }, { "epoch": 0.01818523691844848, "grad_norm": 1.602691411972046, "learning_rate": 9.999999972144016e-05, "loss": 3.148, "step": 677 }, { "epoch": 0.018212098420543676, "grad_norm": 1.5204836130142212, "learning_rate": 9.99999997206105e-05, "loss": 3.2252, "step": 678 }, { "epoch": 0.018238959922638874, "grad_norm": 1.5765471458435059, "learning_rate": 9.999999971977959e-05, "loss": 2.9243, "step": 679 }, { "epoch": 0.018265821424734072, "grad_norm": 1.6346088647842407, "learning_rate": 9.999999971894745e-05, "loss": 3.3296, "step": 680 }, { "epoch": 0.018292682926829267, "grad_norm": 1.4339308738708496, "learning_rate": 9.99999997181141e-05, "loss": 3.091, "step": 681 }, { "epoch": 0.018319544428924465, "grad_norm": 1.6199924945831299, "learning_rate": 9.999999971727949e-05, "loss": 3.2955, "step": 682 }, { "epoch": 0.018346405931019664, "grad_norm": 1.4859867095947266, "learning_rate": 9.999999971644366e-05, "loss": 3.166, "step": 683 }, { "epoch": 0.01837326743311486, "grad_norm": 1.512736439704895, "learning_rate": 9.999999971560659e-05, "loss": 3.2106, "step": 684 }, { "epoch": 0.018400128935210057, "grad_norm": 1.5425671339035034, "learning_rate": 9.99999997147683e-05, "loss": 3.2207, "step": 685 }, { "epoch": 0.018426990437305255, "grad_norm": 1.5040771961212158, "learning_rate": 9.999999971392876e-05, "loss": 2.9827, "step": 686 }, { "epoch": 0.01845385193940045, "grad_norm": 1.5521918535232544, "learning_rate": 9.9999999713088e-05, "loss": 3.3105, "step": 687 }, { "epoch": 0.01848071344149565, "grad_norm": 1.67476487159729, "learning_rate": 9.9999999712246e-05, "loss": 2.9965, "step": 688 }, { "epoch": 0.018507574943590847, "grad_norm": 1.6089451313018799, "learning_rate": 9.999999971140276e-05, "loss": 3.2392, "step": 689 }, { "epoch": 0.01853443644568604, "grad_norm": 1.599771499633789, "learning_rate": 9.999999971055829e-05, "loss": 2.8795, "step": 690 }, { "epoch": 0.01856129794778124, "grad_norm": 1.6681711673736572, "learning_rate": 9.999999970971258e-05, "loss": 3.567, "step": 691 }, { "epoch": 0.018588159449876438, "grad_norm": 1.7651498317718506, "learning_rate": 9.999999970886565e-05, "loss": 3.2726, "step": 692 }, { "epoch": 0.018615020951971633, "grad_norm": 1.6382040977478027, "learning_rate": 9.999999970801748e-05, "loss": 3.2873, "step": 693 }, { "epoch": 0.01864188245406683, "grad_norm": 1.6960110664367676, "learning_rate": 9.999999970716808e-05, "loss": 3.1896, "step": 694 }, { "epoch": 0.01866874395616203, "grad_norm": 1.8641104698181152, "learning_rate": 9.999999970631743e-05, "loss": 3.4889, "step": 695 }, { "epoch": 0.018695605458257224, "grad_norm": 1.672577142715454, "learning_rate": 9.999999970546556e-05, "loss": 3.5504, "step": 696 }, { "epoch": 0.018722466960352423, "grad_norm": 1.8528612852096558, "learning_rate": 9.999999970461247e-05, "loss": 3.5622, "step": 697 }, { "epoch": 0.01874932846244762, "grad_norm": 1.8413525819778442, "learning_rate": 9.999999970375813e-05, "loss": 3.7652, "step": 698 }, { "epoch": 0.018776189964542816, "grad_norm": 1.9651743173599243, "learning_rate": 9.999999970290256e-05, "loss": 3.6158, "step": 699 }, { "epoch": 0.018803051466638014, "grad_norm": 1.8817933797836304, "learning_rate": 9.999999970204576e-05, "loss": 3.3605, "step": 700 }, { "epoch": 0.018829912968733212, "grad_norm": 1.4275424480438232, "learning_rate": 9.999999970118772e-05, "loss": 3.0604, "step": 701 }, { "epoch": 0.018856774470828407, "grad_norm": 1.4607408046722412, "learning_rate": 9.999999970032844e-05, "loss": 3.1583, "step": 702 }, { "epoch": 0.018883635972923606, "grad_norm": 1.4945420026779175, "learning_rate": 9.999999969946794e-05, "loss": 3.4831, "step": 703 }, { "epoch": 0.018910497475018804, "grad_norm": 1.649383306503296, "learning_rate": 9.99999996986062e-05, "loss": 3.1655, "step": 704 }, { "epoch": 0.018937358977114, "grad_norm": 1.4347844123840332, "learning_rate": 9.999999969774322e-05, "loss": 3.0471, "step": 705 }, { "epoch": 0.018964220479209197, "grad_norm": 1.4316843748092651, "learning_rate": 9.9999999696879e-05, "loss": 2.8433, "step": 706 }, { "epoch": 0.018991081981304395, "grad_norm": 1.3896877765655518, "learning_rate": 9.999999969601357e-05, "loss": 2.913, "step": 707 }, { "epoch": 0.01901794348339959, "grad_norm": 1.5291829109191895, "learning_rate": 9.99999996951469e-05, "loss": 3.0895, "step": 708 }, { "epoch": 0.01904480498549479, "grad_norm": 1.4520114660263062, "learning_rate": 9.999999969427898e-05, "loss": 2.9309, "step": 709 }, { "epoch": 0.019071666487589987, "grad_norm": 1.4198780059814453, "learning_rate": 9.999999969340986e-05, "loss": 3.0022, "step": 710 }, { "epoch": 0.01909852798968518, "grad_norm": 1.501908779144287, "learning_rate": 9.999999969253947e-05, "loss": 3.2037, "step": 711 }, { "epoch": 0.01912538949178038, "grad_norm": 1.5056076049804688, "learning_rate": 9.999999969166786e-05, "loss": 3.1124, "step": 712 }, { "epoch": 0.019152250993875578, "grad_norm": 1.327278733253479, "learning_rate": 9.9999999690795e-05, "loss": 2.8448, "step": 713 }, { "epoch": 0.019179112495970773, "grad_norm": 1.3927946090698242, "learning_rate": 9.999999968992093e-05, "loss": 3.0786, "step": 714 }, { "epoch": 0.01920597399806597, "grad_norm": 1.6739174127578735, "learning_rate": 9.999999968904563e-05, "loss": 3.3298, "step": 715 }, { "epoch": 0.01923283550016117, "grad_norm": 1.6534708738327026, "learning_rate": 9.99999996881691e-05, "loss": 3.4348, "step": 716 }, { "epoch": 0.019259697002256364, "grad_norm": 1.492730736732483, "learning_rate": 9.99999996872913e-05, "loss": 3.0935, "step": 717 }, { "epoch": 0.019286558504351563, "grad_norm": 1.4301120042800903, "learning_rate": 9.99999996864123e-05, "loss": 2.872, "step": 718 }, { "epoch": 0.01931342000644676, "grad_norm": 1.351636290550232, "learning_rate": 9.999999968553204e-05, "loss": 2.7152, "step": 719 }, { "epoch": 0.01934028150854196, "grad_norm": 1.4593547582626343, "learning_rate": 9.999999968465057e-05, "loss": 3.1069, "step": 720 }, { "epoch": 0.019367143010637154, "grad_norm": 1.588448405265808, "learning_rate": 9.999999968376786e-05, "loss": 3.278, "step": 721 }, { "epoch": 0.019394004512732353, "grad_norm": 1.5615174770355225, "learning_rate": 9.999999968288391e-05, "loss": 3.0554, "step": 722 }, { "epoch": 0.01942086601482755, "grad_norm": 1.5523321628570557, "learning_rate": 9.999999968199872e-05, "loss": 3.3334, "step": 723 }, { "epoch": 0.019447727516922746, "grad_norm": 1.4447002410888672, "learning_rate": 9.999999968111231e-05, "loss": 3.0282, "step": 724 }, { "epoch": 0.019474589019017944, "grad_norm": 1.560368299484253, "learning_rate": 9.999999968022467e-05, "loss": 2.9756, "step": 725 }, { "epoch": 0.019501450521113142, "grad_norm": 1.681187629699707, "learning_rate": 9.999999967933577e-05, "loss": 3.496, "step": 726 }, { "epoch": 0.019528312023208337, "grad_norm": 1.6926003694534302, "learning_rate": 9.999999967844567e-05, "loss": 3.3941, "step": 727 }, { "epoch": 0.019555173525303535, "grad_norm": 1.4728666543960571, "learning_rate": 9.999999967755431e-05, "loss": 3.0974, "step": 728 }, { "epoch": 0.019582035027398734, "grad_norm": 1.5718094110488892, "learning_rate": 9.999999967666174e-05, "loss": 3.3441, "step": 729 }, { "epoch": 0.01960889652949393, "grad_norm": 1.541793704032898, "learning_rate": 9.999999967576792e-05, "loss": 3.2327, "step": 730 }, { "epoch": 0.019635758031589127, "grad_norm": 1.555477261543274, "learning_rate": 9.999999967487287e-05, "loss": 3.1065, "step": 731 }, { "epoch": 0.019662619533684325, "grad_norm": 1.6656402349472046, "learning_rate": 9.999999967397659e-05, "loss": 3.0119, "step": 732 }, { "epoch": 0.01968948103577952, "grad_norm": 1.548744559288025, "learning_rate": 9.999999967307907e-05, "loss": 3.1914, "step": 733 }, { "epoch": 0.01971634253787472, "grad_norm": 1.5926529169082642, "learning_rate": 9.999999967218032e-05, "loss": 3.1765, "step": 734 }, { "epoch": 0.019743204039969917, "grad_norm": 1.5121639966964722, "learning_rate": 9.999999967128033e-05, "loss": 2.8757, "step": 735 }, { "epoch": 0.01977006554206511, "grad_norm": 1.5775986909866333, "learning_rate": 9.999999967037911e-05, "loss": 3.1299, "step": 736 }, { "epoch": 0.01979692704416031, "grad_norm": 1.5838713645935059, "learning_rate": 9.999999966947666e-05, "loss": 3.2553, "step": 737 }, { "epoch": 0.019823788546255508, "grad_norm": 1.5859333276748657, "learning_rate": 9.999999966857298e-05, "loss": 3.4363, "step": 738 }, { "epoch": 0.019850650048350703, "grad_norm": 1.6026620864868164, "learning_rate": 9.999999966766805e-05, "loss": 3.2706, "step": 739 }, { "epoch": 0.0198775115504459, "grad_norm": 1.58791184425354, "learning_rate": 9.99999996667619e-05, "loss": 3.2601, "step": 740 }, { "epoch": 0.0199043730525411, "grad_norm": 1.5763542652130127, "learning_rate": 9.999999966585452e-05, "loss": 3.1218, "step": 741 }, { "epoch": 0.019931234554636294, "grad_norm": 1.6854625940322876, "learning_rate": 9.999999966494589e-05, "loss": 3.4081, "step": 742 }, { "epoch": 0.019958096056731493, "grad_norm": 1.802384614944458, "learning_rate": 9.999999966403604e-05, "loss": 3.1813, "step": 743 }, { "epoch": 0.01998495755882669, "grad_norm": 1.7802817821502686, "learning_rate": 9.999999966312495e-05, "loss": 3.4648, "step": 744 }, { "epoch": 0.020011819060921886, "grad_norm": 1.629107117652893, "learning_rate": 9.999999966221263e-05, "loss": 3.173, "step": 745 }, { "epoch": 0.020038680563017084, "grad_norm": 1.7447935342788696, "learning_rate": 9.999999966129907e-05, "loss": 2.9959, "step": 746 }, { "epoch": 0.020065542065112282, "grad_norm": 1.965346097946167, "learning_rate": 9.999999966038429e-05, "loss": 3.7127, "step": 747 }, { "epoch": 0.020092403567207477, "grad_norm": 1.9311946630477905, "learning_rate": 9.999999965946826e-05, "loss": 3.7032, "step": 748 }, { "epoch": 0.020119265069302675, "grad_norm": 2.012167453765869, "learning_rate": 9.999999965855101e-05, "loss": 3.0979, "step": 749 }, { "epoch": 0.020146126571397874, "grad_norm": 1.8708699941635132, "learning_rate": 9.999999965763251e-05, "loss": 3.4835, "step": 750 }, { "epoch": 0.02017298807349307, "grad_norm": 1.5525647401809692, "learning_rate": 9.999999965671278e-05, "loss": 3.4838, "step": 751 }, { "epoch": 0.020199849575588267, "grad_norm": 1.684830904006958, "learning_rate": 9.999999965579184e-05, "loss": 3.4157, "step": 752 }, { "epoch": 0.020226711077683465, "grad_norm": 1.487632155418396, "learning_rate": 9.999999965486964e-05, "loss": 3.3135, "step": 753 }, { "epoch": 0.02025357257977866, "grad_norm": 1.4215151071548462, "learning_rate": 9.999999965394623e-05, "loss": 2.806, "step": 754 }, { "epoch": 0.02028043408187386, "grad_norm": 1.4384040832519531, "learning_rate": 9.999999965302156e-05, "loss": 2.725, "step": 755 }, { "epoch": 0.020307295583969057, "grad_norm": 1.519808292388916, "learning_rate": 9.999999965209566e-05, "loss": 3.0673, "step": 756 }, { "epoch": 0.02033415708606425, "grad_norm": 1.5362298488616943, "learning_rate": 9.999999965116853e-05, "loss": 2.9519, "step": 757 }, { "epoch": 0.02036101858815945, "grad_norm": 1.3809818029403687, "learning_rate": 9.999999965024018e-05, "loss": 2.9788, "step": 758 }, { "epoch": 0.020387880090254648, "grad_norm": 1.4069761037826538, "learning_rate": 9.999999964931057e-05, "loss": 3.0046, "step": 759 }, { "epoch": 0.020414741592349843, "grad_norm": 1.5650560855865479, "learning_rate": 9.999999964837976e-05, "loss": 3.0619, "step": 760 }, { "epoch": 0.02044160309444504, "grad_norm": 1.410232663154602, "learning_rate": 9.999999964744768e-05, "loss": 2.9326, "step": 761 }, { "epoch": 0.02046846459654024, "grad_norm": 1.459146499633789, "learning_rate": 9.99999996465144e-05, "loss": 3.0465, "step": 762 }, { "epoch": 0.020495326098635434, "grad_norm": 1.385705590248108, "learning_rate": 9.999999964557987e-05, "loss": 3.0244, "step": 763 }, { "epoch": 0.020522187600730633, "grad_norm": 1.380672812461853, "learning_rate": 9.999999964464411e-05, "loss": 2.7153, "step": 764 }, { "epoch": 0.02054904910282583, "grad_norm": 1.549020528793335, "learning_rate": 9.999999964370712e-05, "loss": 3.2815, "step": 765 }, { "epoch": 0.020575910604921026, "grad_norm": 1.4888899326324463, "learning_rate": 9.999999964276887e-05, "loss": 3.1887, "step": 766 }, { "epoch": 0.020602772107016224, "grad_norm": 1.613270878791809, "learning_rate": 9.999999964182943e-05, "loss": 3.2248, "step": 767 }, { "epoch": 0.020629633609111422, "grad_norm": 1.4555995464324951, "learning_rate": 9.999999964088872e-05, "loss": 3.1396, "step": 768 }, { "epoch": 0.020656495111206617, "grad_norm": 1.5034013986587524, "learning_rate": 9.999999963994679e-05, "loss": 3.0802, "step": 769 }, { "epoch": 0.020683356613301816, "grad_norm": 1.5299456119537354, "learning_rate": 9.999999963900362e-05, "loss": 3.0, "step": 770 }, { "epoch": 0.020710218115397014, "grad_norm": 1.523677110671997, "learning_rate": 9.999999963805922e-05, "loss": 2.9568, "step": 771 }, { "epoch": 0.02073707961749221, "grad_norm": 1.3994543552398682, "learning_rate": 9.99999996371136e-05, "loss": 2.8977, "step": 772 }, { "epoch": 0.020763941119587407, "grad_norm": 1.4499335289001465, "learning_rate": 9.999999963616673e-05, "loss": 3.0254, "step": 773 }, { "epoch": 0.020790802621682605, "grad_norm": 1.4630128145217896, "learning_rate": 9.999999963521863e-05, "loss": 3.2021, "step": 774 }, { "epoch": 0.0208176641237778, "grad_norm": 1.4782243967056274, "learning_rate": 9.99999996342693e-05, "loss": 3.0726, "step": 775 }, { "epoch": 0.020844525625873, "grad_norm": 1.5138225555419922, "learning_rate": 9.999999963331872e-05, "loss": 3.1348, "step": 776 }, { "epoch": 0.020871387127968197, "grad_norm": 1.437690019607544, "learning_rate": 9.999999963236693e-05, "loss": 2.9476, "step": 777 }, { "epoch": 0.02089824863006339, "grad_norm": 1.5484408140182495, "learning_rate": 9.999999963141389e-05, "loss": 3.2604, "step": 778 }, { "epoch": 0.02092511013215859, "grad_norm": 1.4620468616485596, "learning_rate": 9.999999963045963e-05, "loss": 2.8977, "step": 779 }, { "epoch": 0.020951971634253788, "grad_norm": 1.5116389989852905, "learning_rate": 9.999999962950412e-05, "loss": 3.4297, "step": 780 }, { "epoch": 0.020978833136348983, "grad_norm": 1.4593900442123413, "learning_rate": 9.999999962854738e-05, "loss": 2.9795, "step": 781 }, { "epoch": 0.02100569463844418, "grad_norm": 1.7128779888153076, "learning_rate": 9.999999962758942e-05, "loss": 3.16, "step": 782 }, { "epoch": 0.02103255614053938, "grad_norm": 1.6186964511871338, "learning_rate": 9.999999962663021e-05, "loss": 3.3634, "step": 783 }, { "epoch": 0.021059417642634574, "grad_norm": 1.6430516242980957, "learning_rate": 9.999999962566979e-05, "loss": 3.1656, "step": 784 }, { "epoch": 0.021086279144729773, "grad_norm": 1.5856648683547974, "learning_rate": 9.999999962470811e-05, "loss": 3.1166, "step": 785 }, { "epoch": 0.02111314064682497, "grad_norm": 1.5856009721755981, "learning_rate": 9.99999996237452e-05, "loss": 2.9903, "step": 786 }, { "epoch": 0.02114000214892017, "grad_norm": 1.6610063314437866, "learning_rate": 9.999999962278106e-05, "loss": 2.9762, "step": 787 }, { "epoch": 0.021166863651015364, "grad_norm": 1.5724165439605713, "learning_rate": 9.99999996218157e-05, "loss": 3.2502, "step": 788 }, { "epoch": 0.021193725153110562, "grad_norm": 1.6868854761123657, "learning_rate": 9.99999996208491e-05, "loss": 3.6198, "step": 789 }, { "epoch": 0.02122058665520576, "grad_norm": 1.633146047592163, "learning_rate": 9.999999961988126e-05, "loss": 3.4037, "step": 790 }, { "epoch": 0.021247448157300956, "grad_norm": 1.6730390787124634, "learning_rate": 9.999999961891217e-05, "loss": 3.2181, "step": 791 }, { "epoch": 0.021274309659396154, "grad_norm": 1.7232856750488281, "learning_rate": 9.999999961794188e-05, "loss": 3.3526, "step": 792 }, { "epoch": 0.021301171161491352, "grad_norm": 1.7128450870513916, "learning_rate": 9.999999961697032e-05, "loss": 3.4663, "step": 793 }, { "epoch": 0.021328032663586547, "grad_norm": 1.6938360929489136, "learning_rate": 9.999999961599756e-05, "loss": 3.4175, "step": 794 }, { "epoch": 0.021354894165681745, "grad_norm": 1.680937647819519, "learning_rate": 9.999999961502356e-05, "loss": 3.4235, "step": 795 }, { "epoch": 0.021381755667776944, "grad_norm": 1.6139475107192993, "learning_rate": 9.999999961404832e-05, "loss": 3.0056, "step": 796 }, { "epoch": 0.02140861716987214, "grad_norm": 2.0173404216766357, "learning_rate": 9.999999961307184e-05, "loss": 3.5415, "step": 797 }, { "epoch": 0.021435478671967337, "grad_norm": 1.920420527458191, "learning_rate": 9.999999961209412e-05, "loss": 3.5341, "step": 798 }, { "epoch": 0.021462340174062535, "grad_norm": 2.095407247543335, "learning_rate": 9.99999996111152e-05, "loss": 3.5424, "step": 799 }, { "epoch": 0.02148920167615773, "grad_norm": 1.6919872760772705, "learning_rate": 9.999999961013503e-05, "loss": 2.8392, "step": 800 }, { "epoch": 0.021516063178252928, "grad_norm": 1.4953211545944214, "learning_rate": 9.99999996091536e-05, "loss": 3.282, "step": 801 }, { "epoch": 0.021542924680348127, "grad_norm": 1.3652949333190918, "learning_rate": 9.999999960817097e-05, "loss": 2.9762, "step": 802 }, { "epoch": 0.02156978618244332, "grad_norm": 1.412086009979248, "learning_rate": 9.999999960718709e-05, "loss": 2.949, "step": 803 }, { "epoch": 0.02159664768453852, "grad_norm": 1.33501398563385, "learning_rate": 9.999999960620197e-05, "loss": 2.753, "step": 804 }, { "epoch": 0.021623509186633718, "grad_norm": 1.3840041160583496, "learning_rate": 9.999999960521563e-05, "loss": 2.8277, "step": 805 }, { "epoch": 0.021650370688728913, "grad_norm": 1.3337507247924805, "learning_rate": 9.999999960422806e-05, "loss": 2.618, "step": 806 }, { "epoch": 0.02167723219082411, "grad_norm": 1.3044178485870361, "learning_rate": 9.999999960323924e-05, "loss": 2.8496, "step": 807 }, { "epoch": 0.02170409369291931, "grad_norm": 1.5015467405319214, "learning_rate": 9.999999960224919e-05, "loss": 3.1227, "step": 808 }, { "epoch": 0.021730955195014504, "grad_norm": 1.524622917175293, "learning_rate": 9.999999960125792e-05, "loss": 2.9861, "step": 809 }, { "epoch": 0.021757816697109703, "grad_norm": 1.4448390007019043, "learning_rate": 9.99999996002654e-05, "loss": 2.9704, "step": 810 }, { "epoch": 0.0217846781992049, "grad_norm": 1.434091567993164, "learning_rate": 9.999999959927167e-05, "loss": 2.9119, "step": 811 }, { "epoch": 0.021811539701300096, "grad_norm": 1.6284412145614624, "learning_rate": 9.999999959827669e-05, "loss": 3.1126, "step": 812 }, { "epoch": 0.021838401203395294, "grad_norm": 1.4972847700119019, "learning_rate": 9.999999959728045e-05, "loss": 3.0218, "step": 813 }, { "epoch": 0.021865262705490492, "grad_norm": 1.459935188293457, "learning_rate": 9.999999959628301e-05, "loss": 3.062, "step": 814 }, { "epoch": 0.021892124207585687, "grad_norm": 1.628962755203247, "learning_rate": 9.999999959528434e-05, "loss": 3.327, "step": 815 }, { "epoch": 0.021918985709680885, "grad_norm": 1.4703344106674194, "learning_rate": 9.999999959428442e-05, "loss": 3.0919, "step": 816 }, { "epoch": 0.021945847211776084, "grad_norm": 1.5622847080230713, "learning_rate": 9.999999959328328e-05, "loss": 3.1916, "step": 817 }, { "epoch": 0.02197270871387128, "grad_norm": 1.4799901247024536, "learning_rate": 9.999999959228089e-05, "loss": 3.0399, "step": 818 }, { "epoch": 0.021999570215966477, "grad_norm": 1.5466630458831787, "learning_rate": 9.999999959127728e-05, "loss": 3.006, "step": 819 }, { "epoch": 0.022026431718061675, "grad_norm": 1.5267852544784546, "learning_rate": 9.999999959027241e-05, "loss": 3.1746, "step": 820 }, { "epoch": 0.02205329322015687, "grad_norm": 1.6994048357009888, "learning_rate": 9.999999958926633e-05, "loss": 3.2067, "step": 821 }, { "epoch": 0.02208015472225207, "grad_norm": 1.4320420026779175, "learning_rate": 9.999999958825902e-05, "loss": 2.9041, "step": 822 }, { "epoch": 0.022107016224347267, "grad_norm": 1.6223822832107544, "learning_rate": 9.999999958725049e-05, "loss": 3.3279, "step": 823 }, { "epoch": 0.02213387772644246, "grad_norm": 1.6538143157958984, "learning_rate": 9.999999958624069e-05, "loss": 3.2987, "step": 824 }, { "epoch": 0.02216073922853766, "grad_norm": 1.4128156900405884, "learning_rate": 9.999999958522968e-05, "loss": 2.9321, "step": 825 }, { "epoch": 0.022187600730632858, "grad_norm": 1.4784235954284668, "learning_rate": 9.999999958421742e-05, "loss": 2.7364, "step": 826 }, { "epoch": 0.022214462232728053, "grad_norm": 1.5601043701171875, "learning_rate": 9.999999958320394e-05, "loss": 3.172, "step": 827 }, { "epoch": 0.02224132373482325, "grad_norm": 1.6303744316101074, "learning_rate": 9.999999958218921e-05, "loss": 3.1838, "step": 828 }, { "epoch": 0.02226818523691845, "grad_norm": 1.524660587310791, "learning_rate": 9.999999958117326e-05, "loss": 3.0486, "step": 829 }, { "epoch": 0.022295046739013644, "grad_norm": 2.108213186264038, "learning_rate": 9.999999958015608e-05, "loss": 2.8753, "step": 830 }, { "epoch": 0.022321908241108843, "grad_norm": 1.5068790912628174, "learning_rate": 9.999999957913766e-05, "loss": 3.1045, "step": 831 }, { "epoch": 0.02234876974320404, "grad_norm": 1.752556324005127, "learning_rate": 9.9999999578118e-05, "loss": 3.3558, "step": 832 }, { "epoch": 0.022375631245299236, "grad_norm": 1.6734263896942139, "learning_rate": 9.999999957709711e-05, "loss": 3.4312, "step": 833 }, { "epoch": 0.022402492747394434, "grad_norm": 1.5546562671661377, "learning_rate": 9.9999999576075e-05, "loss": 2.9735, "step": 834 }, { "epoch": 0.022429354249489632, "grad_norm": 1.7671935558319092, "learning_rate": 9.999999957505164e-05, "loss": 3.2829, "step": 835 }, { "epoch": 0.022456215751584827, "grad_norm": 1.6296780109405518, "learning_rate": 9.999999957402705e-05, "loss": 3.337, "step": 836 }, { "epoch": 0.022483077253680026, "grad_norm": 1.602173924446106, "learning_rate": 9.999999957300124e-05, "loss": 3.2137, "step": 837 }, { "epoch": 0.022509938755775224, "grad_norm": 1.4791547060012817, "learning_rate": 9.999999957197417e-05, "loss": 3.0097, "step": 838 }, { "epoch": 0.02253680025787042, "grad_norm": 1.6333667039871216, "learning_rate": 9.999999957094587e-05, "loss": 3.162, "step": 839 }, { "epoch": 0.022563661759965617, "grad_norm": 1.7177056074142456, "learning_rate": 9.999999956991636e-05, "loss": 3.2876, "step": 840 }, { "epoch": 0.022590523262060815, "grad_norm": 1.5853782892227173, "learning_rate": 9.999999956888559e-05, "loss": 3.2918, "step": 841 }, { "epoch": 0.02261738476415601, "grad_norm": 1.6492536067962646, "learning_rate": 9.999999956785361e-05, "loss": 3.1207, "step": 842 }, { "epoch": 0.02264424626625121, "grad_norm": 1.7022151947021484, "learning_rate": 9.999999956682038e-05, "loss": 3.3891, "step": 843 }, { "epoch": 0.022671107768346407, "grad_norm": 1.8401191234588623, "learning_rate": 9.999999956578591e-05, "loss": 3.5495, "step": 844 }, { "epoch": 0.0226979692704416, "grad_norm": 2.1895368099212646, "learning_rate": 9.999999956475024e-05, "loss": 3.2325, "step": 845 }, { "epoch": 0.0227248307725368, "grad_norm": 1.718263030052185, "learning_rate": 9.999999956371331e-05, "loss": 3.3796, "step": 846 }, { "epoch": 0.022751692274631998, "grad_norm": 1.8245230913162231, "learning_rate": 9.999999956267515e-05, "loss": 3.2055, "step": 847 }, { "epoch": 0.022778553776727193, "grad_norm": 1.75575852394104, "learning_rate": 9.999999956163575e-05, "loss": 3.2328, "step": 848 }, { "epoch": 0.02280541527882239, "grad_norm": 2.0926082134246826, "learning_rate": 9.999999956059513e-05, "loss": 3.7155, "step": 849 }, { "epoch": 0.02283227678091759, "grad_norm": 2.1530840396881104, "learning_rate": 9.999999955955327e-05, "loss": 3.1956, "step": 850 }, { "epoch": 0.022859138283012784, "grad_norm": 1.3932898044586182, "learning_rate": 9.999999955851017e-05, "loss": 3.1861, "step": 851 }, { "epoch": 0.022885999785107983, "grad_norm": 1.542462944984436, "learning_rate": 9.999999955746584e-05, "loss": 2.8578, "step": 852 }, { "epoch": 0.02291286128720318, "grad_norm": 1.531349778175354, "learning_rate": 9.999999955642028e-05, "loss": 3.432, "step": 853 }, { "epoch": 0.022939722789298376, "grad_norm": 1.368952751159668, "learning_rate": 9.999999955537349e-05, "loss": 2.9651, "step": 854 }, { "epoch": 0.022966584291393574, "grad_norm": 1.5063704252243042, "learning_rate": 9.999999955432546e-05, "loss": 3.0688, "step": 855 }, { "epoch": 0.022993445793488772, "grad_norm": 1.3633675575256348, "learning_rate": 9.99999995532762e-05, "loss": 2.7157, "step": 856 }, { "epoch": 0.02302030729558397, "grad_norm": 1.3414454460144043, "learning_rate": 9.99999995522257e-05, "loss": 2.8958, "step": 857 }, { "epoch": 0.023047168797679166, "grad_norm": 1.3742226362228394, "learning_rate": 9.999999955117396e-05, "loss": 2.9802, "step": 858 }, { "epoch": 0.023074030299774364, "grad_norm": 1.314835548400879, "learning_rate": 9.9999999550121e-05, "loss": 2.9829, "step": 859 }, { "epoch": 0.023100891801869562, "grad_norm": 1.4580899477005005, "learning_rate": 9.999999954906681e-05, "loss": 2.8899, "step": 860 }, { "epoch": 0.023127753303964757, "grad_norm": 1.5694551467895508, "learning_rate": 9.999999954801138e-05, "loss": 2.7876, "step": 861 }, { "epoch": 0.023154614806059955, "grad_norm": 1.5796012878417969, "learning_rate": 9.999999954695472e-05, "loss": 2.9759, "step": 862 }, { "epoch": 0.023181476308155154, "grad_norm": 1.4369691610336304, "learning_rate": 9.99999995458968e-05, "loss": 2.892, "step": 863 }, { "epoch": 0.02320833781025035, "grad_norm": 1.4166815280914307, "learning_rate": 9.999999954483768e-05, "loss": 2.8314, "step": 864 }, { "epoch": 0.023235199312345547, "grad_norm": 1.4398976564407349, "learning_rate": 9.999999954377731e-05, "loss": 2.7829, "step": 865 }, { "epoch": 0.023262060814440745, "grad_norm": 1.561009168624878, "learning_rate": 9.999999954271572e-05, "loss": 3.2241, "step": 866 }, { "epoch": 0.02328892231653594, "grad_norm": 1.3493255376815796, "learning_rate": 9.999999954165287e-05, "loss": 2.895, "step": 867 }, { "epoch": 0.023315783818631138, "grad_norm": 1.5968419313430786, "learning_rate": 9.999999954058881e-05, "loss": 3.0864, "step": 868 }, { "epoch": 0.023342645320726337, "grad_norm": 1.4958103895187378, "learning_rate": 9.999999953952351e-05, "loss": 3.1792, "step": 869 }, { "epoch": 0.02336950682282153, "grad_norm": 1.609138011932373, "learning_rate": 9.999999953845698e-05, "loss": 3.1673, "step": 870 }, { "epoch": 0.02339636832491673, "grad_norm": 1.5247716903686523, "learning_rate": 9.99999995373892e-05, "loss": 3.1263, "step": 871 }, { "epoch": 0.023423229827011928, "grad_norm": 1.4824477434158325, "learning_rate": 9.99999995363202e-05, "loss": 2.8878, "step": 872 }, { "epoch": 0.023450091329107123, "grad_norm": 1.5343728065490723, "learning_rate": 9.999999953524997e-05, "loss": 3.1241, "step": 873 }, { "epoch": 0.02347695283120232, "grad_norm": 1.653027057647705, "learning_rate": 9.99999995341785e-05, "loss": 3.4166, "step": 874 }, { "epoch": 0.02350381433329752, "grad_norm": 1.5273804664611816, "learning_rate": 9.99999995331058e-05, "loss": 3.1788, "step": 875 }, { "epoch": 0.023530675835392714, "grad_norm": 1.5059871673583984, "learning_rate": 9.999999953203187e-05, "loss": 2.8871, "step": 876 }, { "epoch": 0.023557537337487913, "grad_norm": 1.7065390348434448, "learning_rate": 9.999999953095669e-05, "loss": 3.3388, "step": 877 }, { "epoch": 0.02358439883958311, "grad_norm": 1.5093010663986206, "learning_rate": 9.999999952988028e-05, "loss": 2.92, "step": 878 }, { "epoch": 0.023611260341678306, "grad_norm": 1.479245901107788, "learning_rate": 9.999999952880265e-05, "loss": 2.8458, "step": 879 }, { "epoch": 0.023638121843773504, "grad_norm": 1.303599238395691, "learning_rate": 9.999999952772377e-05, "loss": 2.6848, "step": 880 }, { "epoch": 0.023664983345868702, "grad_norm": 1.4369533061981201, "learning_rate": 9.999999952664368e-05, "loss": 2.9162, "step": 881 }, { "epoch": 0.023691844847963897, "grad_norm": 1.4322164058685303, "learning_rate": 9.999999952556234e-05, "loss": 2.807, "step": 882 }, { "epoch": 0.023718706350059095, "grad_norm": 1.5332303047180176, "learning_rate": 9.999999952447976e-05, "loss": 3.1427, "step": 883 }, { "epoch": 0.023745567852154294, "grad_norm": 1.5984649658203125, "learning_rate": 9.999999952339596e-05, "loss": 3.343, "step": 884 }, { "epoch": 0.02377242935424949, "grad_norm": 1.5730228424072266, "learning_rate": 9.999999952231092e-05, "loss": 3.3833, "step": 885 }, { "epoch": 0.023799290856344687, "grad_norm": 1.5562658309936523, "learning_rate": 9.999999952122465e-05, "loss": 3.1578, "step": 886 }, { "epoch": 0.023826152358439885, "grad_norm": 1.659433126449585, "learning_rate": 9.999999952013714e-05, "loss": 3.2602, "step": 887 }, { "epoch": 0.02385301386053508, "grad_norm": 1.6053160429000854, "learning_rate": 9.99999995190484e-05, "loss": 3.1533, "step": 888 }, { "epoch": 0.02387987536263028, "grad_norm": 1.509786605834961, "learning_rate": 9.999999951795842e-05, "loss": 3.0258, "step": 889 }, { "epoch": 0.023906736864725477, "grad_norm": 1.519740104675293, "learning_rate": 9.999999951686721e-05, "loss": 3.0771, "step": 890 }, { "epoch": 0.02393359836682067, "grad_norm": 1.705270767211914, "learning_rate": 9.999999951577477e-05, "loss": 3.0579, "step": 891 }, { "epoch": 0.02396045986891587, "grad_norm": 1.7090507745742798, "learning_rate": 9.999999951468108e-05, "loss": 3.5565, "step": 892 }, { "epoch": 0.023987321371011068, "grad_norm": 1.6336791515350342, "learning_rate": 9.999999951358618e-05, "loss": 3.1644, "step": 893 }, { "epoch": 0.024014182873106263, "grad_norm": 1.828244924545288, "learning_rate": 9.999999951249004e-05, "loss": 3.2391, "step": 894 }, { "epoch": 0.02404104437520146, "grad_norm": 1.870465874671936, "learning_rate": 9.999999951139265e-05, "loss": 3.3868, "step": 895 }, { "epoch": 0.02406790587729666, "grad_norm": 1.7341759204864502, "learning_rate": 9.999999951029405e-05, "loss": 3.0196, "step": 896 }, { "epoch": 0.024094767379391854, "grad_norm": 1.7239795923233032, "learning_rate": 9.99999995091942e-05, "loss": 3.5867, "step": 897 }, { "epoch": 0.024121628881487053, "grad_norm": 1.9196280241012573, "learning_rate": 9.999999950809313e-05, "loss": 3.5105, "step": 898 }, { "epoch": 0.02414849038358225, "grad_norm": 1.9287900924682617, "learning_rate": 9.999999950699082e-05, "loss": 3.3439, "step": 899 }, { "epoch": 0.024175351885677446, "grad_norm": 1.782228708267212, "learning_rate": 9.999999950588727e-05, "loss": 3.1187, "step": 900 }, { "epoch": 0.024202213387772644, "grad_norm": 1.4669206142425537, "learning_rate": 9.999999950478249e-05, "loss": 3.1636, "step": 901 }, { "epoch": 0.024229074889867842, "grad_norm": 1.4358468055725098, "learning_rate": 9.999999950367647e-05, "loss": 2.9277, "step": 902 }, { "epoch": 0.024255936391963037, "grad_norm": 1.6136616468429565, "learning_rate": 9.999999950256923e-05, "loss": 3.2609, "step": 903 }, { "epoch": 0.024282797894058235, "grad_norm": 1.5683788061141968, "learning_rate": 9.999999950146075e-05, "loss": 3.3504, "step": 904 }, { "epoch": 0.024309659396153434, "grad_norm": 1.3870428800582886, "learning_rate": 9.999999950035104e-05, "loss": 2.9261, "step": 905 }, { "epoch": 0.02433652089824863, "grad_norm": 1.3878294229507446, "learning_rate": 9.999999949924008e-05, "loss": 2.8212, "step": 906 }, { "epoch": 0.024363382400343827, "grad_norm": 1.377278208732605, "learning_rate": 9.99999994981279e-05, "loss": 3.0824, "step": 907 }, { "epoch": 0.024390243902439025, "grad_norm": 1.3985542058944702, "learning_rate": 9.999999949701448e-05, "loss": 2.8544, "step": 908 }, { "epoch": 0.02441710540453422, "grad_norm": 1.439070701599121, "learning_rate": 9.999999949589984e-05, "loss": 3.0956, "step": 909 }, { "epoch": 0.02444396690662942, "grad_norm": 1.6687121391296387, "learning_rate": 9.999999949478396e-05, "loss": 3.2903, "step": 910 }, { "epoch": 0.024470828408724617, "grad_norm": 1.4199494123458862, "learning_rate": 9.999999949366684e-05, "loss": 3.1253, "step": 911 }, { "epoch": 0.02449768991081981, "grad_norm": 1.4392849206924438, "learning_rate": 9.999999949254849e-05, "loss": 2.8105, "step": 912 }, { "epoch": 0.02452455141291501, "grad_norm": 1.5232583284378052, "learning_rate": 9.999999949142891e-05, "loss": 2.9298, "step": 913 }, { "epoch": 0.024551412915010208, "grad_norm": 1.428612470626831, "learning_rate": 9.999999949030809e-05, "loss": 3.15, "step": 914 }, { "epoch": 0.024578274417105403, "grad_norm": 1.459532380104065, "learning_rate": 9.999999948918604e-05, "loss": 2.9516, "step": 915 }, { "epoch": 0.0246051359192006, "grad_norm": 1.4741835594177246, "learning_rate": 9.999999948806275e-05, "loss": 2.9968, "step": 916 }, { "epoch": 0.0246319974212958, "grad_norm": 1.3959861993789673, "learning_rate": 9.999999948693823e-05, "loss": 2.7548, "step": 917 }, { "epoch": 0.024658858923390994, "grad_norm": 1.5814383029937744, "learning_rate": 9.999999948581248e-05, "loss": 3.071, "step": 918 }, { "epoch": 0.024685720425486193, "grad_norm": 1.3421992063522339, "learning_rate": 9.99999994846855e-05, "loss": 2.8023, "step": 919 }, { "epoch": 0.02471258192758139, "grad_norm": 1.5291359424591064, "learning_rate": 9.999999948355727e-05, "loss": 2.9152, "step": 920 }, { "epoch": 0.024739443429676586, "grad_norm": 1.5606157779693604, "learning_rate": 9.999999948242781e-05, "loss": 2.769, "step": 921 }, { "epoch": 0.024766304931771784, "grad_norm": 1.4144341945648193, "learning_rate": 9.999999948129714e-05, "loss": 2.9678, "step": 922 }, { "epoch": 0.024793166433866982, "grad_norm": 1.4727364778518677, "learning_rate": 9.99999994801652e-05, "loss": 2.9027, "step": 923 }, { "epoch": 0.02482002793596218, "grad_norm": 1.5003682374954224, "learning_rate": 9.999999947903205e-05, "loss": 2.8261, "step": 924 }, { "epoch": 0.024846889438057376, "grad_norm": 1.4803307056427002, "learning_rate": 9.999999947789766e-05, "loss": 2.959, "step": 925 }, { "epoch": 0.024873750940152574, "grad_norm": 1.6897430419921875, "learning_rate": 9.999999947676206e-05, "loss": 3.5019, "step": 926 }, { "epoch": 0.024900612442247772, "grad_norm": 1.4898759126663208, "learning_rate": 9.999999947562519e-05, "loss": 2.921, "step": 927 }, { "epoch": 0.024927473944342967, "grad_norm": 1.496799349784851, "learning_rate": 9.99999994744871e-05, "loss": 2.8998, "step": 928 }, { "epoch": 0.024954335446438165, "grad_norm": 1.5128743648529053, "learning_rate": 9.999999947334778e-05, "loss": 3.0422, "step": 929 }, { "epoch": 0.024981196948533364, "grad_norm": 1.6503703594207764, "learning_rate": 9.999999947220722e-05, "loss": 3.1786, "step": 930 }, { "epoch": 0.02500805845062856, "grad_norm": 1.4745047092437744, "learning_rate": 9.999999947106544e-05, "loss": 3.3131, "step": 931 }, { "epoch": 0.025034919952723757, "grad_norm": 1.529066801071167, "learning_rate": 9.999999946992241e-05, "loss": 2.9621, "step": 932 }, { "epoch": 0.025061781454818955, "grad_norm": 1.4728082418441772, "learning_rate": 9.999999946877816e-05, "loss": 2.8143, "step": 933 }, { "epoch": 0.02508864295691415, "grad_norm": 1.5155495405197144, "learning_rate": 9.999999946763266e-05, "loss": 3.0541, "step": 934 }, { "epoch": 0.025115504459009348, "grad_norm": 1.4486238956451416, "learning_rate": 9.999999946648594e-05, "loss": 3.2015, "step": 935 }, { "epoch": 0.025142365961104546, "grad_norm": 1.548398733139038, "learning_rate": 9.999999946533798e-05, "loss": 2.9229, "step": 936 }, { "epoch": 0.02516922746319974, "grad_norm": 1.5989586114883423, "learning_rate": 9.999999946418878e-05, "loss": 3.4691, "step": 937 }, { "epoch": 0.02519608896529494, "grad_norm": 1.5218377113342285, "learning_rate": 9.999999946303835e-05, "loss": 3.1763, "step": 938 }, { "epoch": 0.025222950467390138, "grad_norm": 1.7632472515106201, "learning_rate": 9.99999994618867e-05, "loss": 3.0938, "step": 939 }, { "epoch": 0.025249811969485333, "grad_norm": 1.6770436763763428, "learning_rate": 9.999999946073381e-05, "loss": 3.4123, "step": 940 }, { "epoch": 0.02527667347158053, "grad_norm": 1.524204969406128, "learning_rate": 9.999999945957968e-05, "loss": 3.1141, "step": 941 }, { "epoch": 0.02530353497367573, "grad_norm": 1.7641979455947876, "learning_rate": 9.999999945842431e-05, "loss": 3.4322, "step": 942 }, { "epoch": 0.025330396475770924, "grad_norm": 1.7929165363311768, "learning_rate": 9.999999945726772e-05, "loss": 3.2306, "step": 943 }, { "epoch": 0.025357257977866123, "grad_norm": 1.5453864336013794, "learning_rate": 9.999999945610989e-05, "loss": 3.1985, "step": 944 }, { "epoch": 0.02538411947996132, "grad_norm": 1.512291431427002, "learning_rate": 9.999999945495084e-05, "loss": 2.8978, "step": 945 }, { "epoch": 0.025410980982056516, "grad_norm": 1.6914894580841064, "learning_rate": 9.999999945379052e-05, "loss": 3.2245, "step": 946 }, { "epoch": 0.025437842484151714, "grad_norm": 1.6791399717330933, "learning_rate": 9.999999945262902e-05, "loss": 3.2164, "step": 947 }, { "epoch": 0.025464703986246912, "grad_norm": 1.818323016166687, "learning_rate": 9.999999945146624e-05, "loss": 3.3281, "step": 948 }, { "epoch": 0.025491565488342107, "grad_norm": 1.8089464902877808, "learning_rate": 9.999999945030225e-05, "loss": 3.618, "step": 949 }, { "epoch": 0.025518426990437305, "grad_norm": 2.034975290298462, "learning_rate": 9.999999944913702e-05, "loss": 3.4737, "step": 950 }, { "epoch": 0.025545288492532504, "grad_norm": 1.1866132020950317, "learning_rate": 9.999999944797055e-05, "loss": 3.2219, "step": 951 }, { "epoch": 0.0255721499946277, "grad_norm": 1.416660189628601, "learning_rate": 9.999999944680285e-05, "loss": 2.9487, "step": 952 }, { "epoch": 0.025599011496722897, "grad_norm": 1.3192981481552124, "learning_rate": 9.999999944563392e-05, "loss": 3.0613, "step": 953 }, { "epoch": 0.025625872998818095, "grad_norm": 1.4292821884155273, "learning_rate": 9.999999944446376e-05, "loss": 2.9264, "step": 954 }, { "epoch": 0.02565273450091329, "grad_norm": 1.3824403285980225, "learning_rate": 9.999999944329234e-05, "loss": 2.8121, "step": 955 }, { "epoch": 0.02567959600300849, "grad_norm": 1.3949388265609741, "learning_rate": 9.999999944211972e-05, "loss": 2.799, "step": 956 }, { "epoch": 0.025706457505103687, "grad_norm": 1.4141530990600586, "learning_rate": 9.999999944094586e-05, "loss": 3.0121, "step": 957 }, { "epoch": 0.02573331900719888, "grad_norm": 1.291210651397705, "learning_rate": 9.999999943977075e-05, "loss": 2.8466, "step": 958 }, { "epoch": 0.02576018050929408, "grad_norm": 1.3864771127700806, "learning_rate": 9.999999943859441e-05, "loss": 2.8567, "step": 959 }, { "epoch": 0.025787042011389278, "grad_norm": 1.444599986076355, "learning_rate": 9.999999943741685e-05, "loss": 3.0129, "step": 960 }, { "epoch": 0.025813903513484473, "grad_norm": 1.3651354312896729, "learning_rate": 9.999999943623805e-05, "loss": 2.8797, "step": 961 }, { "epoch": 0.02584076501557967, "grad_norm": 1.4720170497894287, "learning_rate": 9.999999943505802e-05, "loss": 3.1372, "step": 962 }, { "epoch": 0.02586762651767487, "grad_norm": 1.3237793445587158, "learning_rate": 9.999999943387674e-05, "loss": 2.8831, "step": 963 }, { "epoch": 0.025894488019770064, "grad_norm": 1.3659074306488037, "learning_rate": 9.999999943269424e-05, "loss": 2.9116, "step": 964 }, { "epoch": 0.025921349521865263, "grad_norm": 1.487601637840271, "learning_rate": 9.999999943151051e-05, "loss": 3.1031, "step": 965 }, { "epoch": 0.02594821102396046, "grad_norm": 1.4030911922454834, "learning_rate": 9.999999943032554e-05, "loss": 2.7534, "step": 966 }, { "epoch": 0.025975072526055656, "grad_norm": 1.4361507892608643, "learning_rate": 9.999999942913933e-05, "loss": 3.0331, "step": 967 }, { "epoch": 0.026001934028150854, "grad_norm": 1.2849395275115967, "learning_rate": 9.99999994279519e-05, "loss": 2.9815, "step": 968 }, { "epoch": 0.026028795530246052, "grad_norm": 1.471046805381775, "learning_rate": 9.999999942676322e-05, "loss": 3.2367, "step": 969 }, { "epoch": 0.026055657032341247, "grad_norm": 1.5335110425949097, "learning_rate": 9.999999942557332e-05, "loss": 3.1871, "step": 970 }, { "epoch": 0.026082518534436445, "grad_norm": 1.4752976894378662, "learning_rate": 9.999999942438219e-05, "loss": 2.9454, "step": 971 }, { "epoch": 0.026109380036531644, "grad_norm": 1.4635950326919556, "learning_rate": 9.99999994231898e-05, "loss": 2.9933, "step": 972 }, { "epoch": 0.02613624153862684, "grad_norm": 1.5313303470611572, "learning_rate": 9.99999994219962e-05, "loss": 3.1916, "step": 973 }, { "epoch": 0.026163103040722037, "grad_norm": 1.5451520681381226, "learning_rate": 9.999999942080136e-05, "loss": 3.0727, "step": 974 }, { "epoch": 0.026189964542817235, "grad_norm": 1.551936149597168, "learning_rate": 9.999999941960529e-05, "loss": 3.007, "step": 975 }, { "epoch": 0.02621682604491243, "grad_norm": 1.4487463235855103, "learning_rate": 9.999999941840798e-05, "loss": 3.1173, "step": 976 }, { "epoch": 0.02624368754700763, "grad_norm": 1.5345468521118164, "learning_rate": 9.999999941720945e-05, "loss": 3.3884, "step": 977 }, { "epoch": 0.026270549049102827, "grad_norm": 1.5212229490280151, "learning_rate": 9.999999941600967e-05, "loss": 3.0135, "step": 978 }, { "epoch": 0.02629741055119802, "grad_norm": 1.5727816820144653, "learning_rate": 9.999999941480866e-05, "loss": 3.2597, "step": 979 }, { "epoch": 0.02632427205329322, "grad_norm": 1.4715198278427124, "learning_rate": 9.999999941360643e-05, "loss": 3.0167, "step": 980 }, { "epoch": 0.026351133555388418, "grad_norm": 1.5815037488937378, "learning_rate": 9.999999941240295e-05, "loss": 3.3436, "step": 981 }, { "epoch": 0.026377995057483613, "grad_norm": 1.573350191116333, "learning_rate": 9.999999941119824e-05, "loss": 2.8999, "step": 982 }, { "epoch": 0.02640485655957881, "grad_norm": 1.422698736190796, "learning_rate": 9.99999994099923e-05, "loss": 2.8865, "step": 983 }, { "epoch": 0.02643171806167401, "grad_norm": 1.5705293416976929, "learning_rate": 9.999999940878512e-05, "loss": 3.0761, "step": 984 }, { "epoch": 0.026458579563769204, "grad_norm": 1.4294575452804565, "learning_rate": 9.99999994075767e-05, "loss": 2.8289, "step": 985 }, { "epoch": 0.026485441065864403, "grad_norm": 1.5163748264312744, "learning_rate": 9.999999940636706e-05, "loss": 2.8431, "step": 986 }, { "epoch": 0.0265123025679596, "grad_norm": 1.6206839084625244, "learning_rate": 9.999999940515618e-05, "loss": 3.3664, "step": 987 }, { "epoch": 0.026539164070054796, "grad_norm": 1.7007125616073608, "learning_rate": 9.999999940394407e-05, "loss": 3.2286, "step": 988 }, { "epoch": 0.026566025572149994, "grad_norm": 1.630654215812683, "learning_rate": 9.999999940273073e-05, "loss": 3.2184, "step": 989 }, { "epoch": 0.026592887074245192, "grad_norm": 1.6554126739501953, "learning_rate": 9.999999940151614e-05, "loss": 3.322, "step": 990 }, { "epoch": 0.026619748576340387, "grad_norm": 1.6082969903945923, "learning_rate": 9.999999940030034e-05, "loss": 3.2394, "step": 991 }, { "epoch": 0.026646610078435586, "grad_norm": 1.7029058933258057, "learning_rate": 9.999999939908328e-05, "loss": 3.2544, "step": 992 }, { "epoch": 0.026673471580530784, "grad_norm": 1.6000038385391235, "learning_rate": 9.9999999397865e-05, "loss": 3.1424, "step": 993 }, { "epoch": 0.026700333082625982, "grad_norm": 1.7077949047088623, "learning_rate": 9.99999993966455e-05, "loss": 3.2201, "step": 994 }, { "epoch": 0.026727194584721177, "grad_norm": 1.7065461874008179, "learning_rate": 9.999999939542475e-05, "loss": 3.336, "step": 995 }, { "epoch": 0.026754056086816375, "grad_norm": 1.8289830684661865, "learning_rate": 9.999999939420277e-05, "loss": 3.5807, "step": 996 }, { "epoch": 0.026780917588911574, "grad_norm": 1.6227738857269287, "learning_rate": 9.999999939297955e-05, "loss": 3.0879, "step": 997 }, { "epoch": 0.02680777909100677, "grad_norm": 1.790627121925354, "learning_rate": 9.99999993917551e-05, "loss": 3.353, "step": 998 }, { "epoch": 0.026834640593101967, "grad_norm": 1.885088562965393, "learning_rate": 9.999999939052941e-05, "loss": 3.4586, "step": 999 }, { "epoch": 0.026861502095197165, "grad_norm": 1.8984928131103516, "learning_rate": 9.999999938930252e-05, "loss": 3.5962, "step": 1000 }, { "epoch": 0.02688836359729236, "grad_norm": 1.2081084251403809, "learning_rate": 9.999999938807436e-05, "loss": 3.2575, "step": 1001 }, { "epoch": 0.026915225099387558, "grad_norm": 1.3805323839187622, "learning_rate": 9.999999938684497e-05, "loss": 3.2441, "step": 1002 }, { "epoch": 0.026942086601482756, "grad_norm": 1.7433286905288696, "learning_rate": 9.999999938561435e-05, "loss": 2.8775, "step": 1003 }, { "epoch": 0.02696894810357795, "grad_norm": 1.3275337219238281, "learning_rate": 9.999999938438252e-05, "loss": 2.8907, "step": 1004 }, { "epoch": 0.02699580960567315, "grad_norm": 1.4613322019577026, "learning_rate": 9.999999938314943e-05, "loss": 2.7962, "step": 1005 }, { "epoch": 0.027022671107768348, "grad_norm": 1.485374927520752, "learning_rate": 9.999999938191512e-05, "loss": 2.8892, "step": 1006 }, { "epoch": 0.027049532609863543, "grad_norm": 1.3549524545669556, "learning_rate": 9.999999938067956e-05, "loss": 3.1675, "step": 1007 }, { "epoch": 0.02707639411195874, "grad_norm": 1.3601874113082886, "learning_rate": 9.999999937944277e-05, "loss": 2.8808, "step": 1008 }, { "epoch": 0.02710325561405394, "grad_norm": 1.3621183633804321, "learning_rate": 9.999999937820475e-05, "loss": 2.7581, "step": 1009 }, { "epoch": 0.027130117116149134, "grad_norm": 1.3292536735534668, "learning_rate": 9.999999937696549e-05, "loss": 2.9433, "step": 1010 }, { "epoch": 0.027156978618244332, "grad_norm": 1.4130446910858154, "learning_rate": 9.9999999375725e-05, "loss": 3.0202, "step": 1011 }, { "epoch": 0.02718384012033953, "grad_norm": 1.4119086265563965, "learning_rate": 9.999999937448329e-05, "loss": 2.8933, "step": 1012 }, { "epoch": 0.027210701622434726, "grad_norm": 1.286368727684021, "learning_rate": 9.999999937324033e-05, "loss": 2.7099, "step": 1013 }, { "epoch": 0.027237563124529924, "grad_norm": 1.292993426322937, "learning_rate": 9.999999937199614e-05, "loss": 2.7148, "step": 1014 }, { "epoch": 0.027264424626625122, "grad_norm": 1.2486885786056519, "learning_rate": 9.999999937075073e-05, "loss": 2.8137, "step": 1015 }, { "epoch": 0.027291286128720317, "grad_norm": 1.3911099433898926, "learning_rate": 9.999999936950408e-05, "loss": 2.7745, "step": 1016 }, { "epoch": 0.027318147630815515, "grad_norm": 1.5100406408309937, "learning_rate": 9.999999936825617e-05, "loss": 2.9737, "step": 1017 }, { "epoch": 0.027345009132910714, "grad_norm": 1.372920274734497, "learning_rate": 9.999999936700705e-05, "loss": 2.7351, "step": 1018 }, { "epoch": 0.02737187063500591, "grad_norm": 1.4184085130691528, "learning_rate": 9.99999993657567e-05, "loss": 2.9937, "step": 1019 }, { "epoch": 0.027398732137101107, "grad_norm": 1.5382386445999146, "learning_rate": 9.999999936450512e-05, "loss": 2.7445, "step": 1020 }, { "epoch": 0.027425593639196305, "grad_norm": 1.5459246635437012, "learning_rate": 9.999999936325228e-05, "loss": 3.0237, "step": 1021 }, { "epoch": 0.0274524551412915, "grad_norm": 1.3898468017578125, "learning_rate": 9.999999936199823e-05, "loss": 2.7733, "step": 1022 }, { "epoch": 0.027479316643386698, "grad_norm": 1.4596545696258545, "learning_rate": 9.999999936074294e-05, "loss": 3.051, "step": 1023 }, { "epoch": 0.027506178145481897, "grad_norm": 1.5145318508148193, "learning_rate": 9.999999935948642e-05, "loss": 3.1378, "step": 1024 }, { "epoch": 0.02753303964757709, "grad_norm": 1.5298538208007812, "learning_rate": 9.999999935822866e-05, "loss": 3.0653, "step": 1025 }, { "epoch": 0.02755990114967229, "grad_norm": 1.523829698562622, "learning_rate": 9.999999935696967e-05, "loss": 3.0143, "step": 1026 }, { "epoch": 0.027586762651767488, "grad_norm": 1.5662864446640015, "learning_rate": 9.999999935570944e-05, "loss": 2.9266, "step": 1027 }, { "epoch": 0.027613624153862683, "grad_norm": 1.5643287897109985, "learning_rate": 9.999999935444798e-05, "loss": 3.0891, "step": 1028 }, { "epoch": 0.02764048565595788, "grad_norm": 1.552836537361145, "learning_rate": 9.999999935318529e-05, "loss": 3.0627, "step": 1029 }, { "epoch": 0.02766734715805308, "grad_norm": 1.5275644063949585, "learning_rate": 9.999999935192137e-05, "loss": 3.2854, "step": 1030 }, { "epoch": 0.027694208660148274, "grad_norm": 1.5676196813583374, "learning_rate": 9.99999993506562e-05, "loss": 3.3264, "step": 1031 }, { "epoch": 0.027721070162243473, "grad_norm": 1.6966971158981323, "learning_rate": 9.999999934938981e-05, "loss": 3.1521, "step": 1032 }, { "epoch": 0.02774793166433867, "grad_norm": 1.573509931564331, "learning_rate": 9.999999934812219e-05, "loss": 3.2024, "step": 1033 }, { "epoch": 0.027774793166433866, "grad_norm": 1.5276821851730347, "learning_rate": 9.99999993468533e-05, "loss": 3.3231, "step": 1034 }, { "epoch": 0.027801654668529064, "grad_norm": 1.4477492570877075, "learning_rate": 9.999999934558322e-05, "loss": 2.9656, "step": 1035 }, { "epoch": 0.027828516170624262, "grad_norm": 1.5181984901428223, "learning_rate": 9.99999993443119e-05, "loss": 3.2101, "step": 1036 }, { "epoch": 0.027855377672719457, "grad_norm": 1.5033587217330933, "learning_rate": 9.999999934303933e-05, "loss": 2.9541, "step": 1037 }, { "epoch": 0.027882239174814655, "grad_norm": 1.6101270914077759, "learning_rate": 9.999999934176554e-05, "loss": 3.1678, "step": 1038 }, { "epoch": 0.027909100676909854, "grad_norm": 1.7118161916732788, "learning_rate": 9.99999993404905e-05, "loss": 3.107, "step": 1039 }, { "epoch": 0.02793596217900505, "grad_norm": 1.597589373588562, "learning_rate": 9.999999933921425e-05, "loss": 3.1001, "step": 1040 }, { "epoch": 0.027962823681100247, "grad_norm": 1.608934760093689, "learning_rate": 9.999999933793675e-05, "loss": 2.9767, "step": 1041 }, { "epoch": 0.027989685183195445, "grad_norm": 1.5927395820617676, "learning_rate": 9.999999933665801e-05, "loss": 3.231, "step": 1042 }, { "epoch": 0.02801654668529064, "grad_norm": 1.8898730278015137, "learning_rate": 9.999999933537804e-05, "loss": 3.8677, "step": 1043 }, { "epoch": 0.02804340818738584, "grad_norm": 1.6011654138565063, "learning_rate": 9.999999933409685e-05, "loss": 3.1195, "step": 1044 }, { "epoch": 0.028070269689481037, "grad_norm": 1.7558504343032837, "learning_rate": 9.99999993328144e-05, "loss": 3.2243, "step": 1045 }, { "epoch": 0.02809713119157623, "grad_norm": 1.6238560676574707, "learning_rate": 9.999999933153075e-05, "loss": 3.3974, "step": 1046 }, { "epoch": 0.02812399269367143, "grad_norm": 1.8154619932174683, "learning_rate": 9.999999933024584e-05, "loss": 3.2551, "step": 1047 }, { "epoch": 0.028150854195766628, "grad_norm": 1.7730555534362793, "learning_rate": 9.999999932895971e-05, "loss": 3.384, "step": 1048 }, { "epoch": 0.028177715697861823, "grad_norm": 1.717028260231018, "learning_rate": 9.999999932767235e-05, "loss": 3.1164, "step": 1049 }, { "epoch": 0.02820457719995702, "grad_norm": 2.2091801166534424, "learning_rate": 9.999999932638376e-05, "loss": 3.6432, "step": 1050 }, { "epoch": 0.02823143870205222, "grad_norm": 1.4108836650848389, "learning_rate": 9.999999932509392e-05, "loss": 2.8067, "step": 1051 }, { "epoch": 0.028258300204147414, "grad_norm": 1.5172497034072876, "learning_rate": 9.999999932380284e-05, "loss": 2.8339, "step": 1052 }, { "epoch": 0.028285161706242613, "grad_norm": 1.5175940990447998, "learning_rate": 9.999999932251054e-05, "loss": 3.0708, "step": 1053 }, { "epoch": 0.02831202320833781, "grad_norm": 1.5778652429580688, "learning_rate": 9.9999999321217e-05, "loss": 2.9358, "step": 1054 }, { "epoch": 0.028338884710433006, "grad_norm": 1.3586126565933228, "learning_rate": 9.999999931992224e-05, "loss": 2.7746, "step": 1055 }, { "epoch": 0.028365746212528204, "grad_norm": 1.4578770399093628, "learning_rate": 9.999999931862624e-05, "loss": 3.1832, "step": 1056 }, { "epoch": 0.028392607714623402, "grad_norm": 1.4358021020889282, "learning_rate": 9.999999931732899e-05, "loss": 2.9011, "step": 1057 }, { "epoch": 0.028419469216718597, "grad_norm": 1.2991894483566284, "learning_rate": 9.999999931603052e-05, "loss": 2.7933, "step": 1058 }, { "epoch": 0.028446330718813796, "grad_norm": 1.4651480913162231, "learning_rate": 9.999999931473083e-05, "loss": 2.9399, "step": 1059 }, { "epoch": 0.028473192220908994, "grad_norm": 1.494518756866455, "learning_rate": 9.999999931342989e-05, "loss": 2.9017, "step": 1060 }, { "epoch": 0.028500053723004192, "grad_norm": 1.462116003036499, "learning_rate": 9.99999993121277e-05, "loss": 2.8853, "step": 1061 }, { "epoch": 0.028526915225099387, "grad_norm": 1.2462494373321533, "learning_rate": 9.99999993108243e-05, "loss": 2.5481, "step": 1062 }, { "epoch": 0.028553776727194585, "grad_norm": 1.4719215631484985, "learning_rate": 9.999999930951967e-05, "loss": 3.06, "step": 1063 }, { "epoch": 0.028580638229289784, "grad_norm": 1.359512209892273, "learning_rate": 9.99999993082138e-05, "loss": 2.851, "step": 1064 }, { "epoch": 0.02860749973138498, "grad_norm": 1.4545495510101318, "learning_rate": 9.999999930690669e-05, "loss": 2.9734, "step": 1065 }, { "epoch": 0.028634361233480177, "grad_norm": 1.3730028867721558, "learning_rate": 9.999999930559836e-05, "loss": 2.8539, "step": 1066 }, { "epoch": 0.028661222735575375, "grad_norm": 1.6204618215560913, "learning_rate": 9.999999930428879e-05, "loss": 3.3271, "step": 1067 }, { "epoch": 0.02868808423767057, "grad_norm": 1.4270602464675903, "learning_rate": 9.999999930297796e-05, "loss": 3.167, "step": 1068 }, { "epoch": 0.028714945739765768, "grad_norm": 1.3054600954055786, "learning_rate": 9.999999930166593e-05, "loss": 2.8688, "step": 1069 }, { "epoch": 0.028741807241860966, "grad_norm": 1.5023961067199707, "learning_rate": 9.999999930035265e-05, "loss": 3.0314, "step": 1070 }, { "epoch": 0.02876866874395616, "grad_norm": 1.6203995943069458, "learning_rate": 9.999999929903813e-05, "loss": 2.913, "step": 1071 }, { "epoch": 0.02879553024605136, "grad_norm": 1.333970546722412, "learning_rate": 9.99999992977224e-05, "loss": 2.9784, "step": 1072 }, { "epoch": 0.028822391748146558, "grad_norm": 1.5159754753112793, "learning_rate": 9.999999929640542e-05, "loss": 2.9283, "step": 1073 }, { "epoch": 0.028849253250241753, "grad_norm": 1.4754468202590942, "learning_rate": 9.99999992950872e-05, "loss": 3.3527, "step": 1074 }, { "epoch": 0.02887611475233695, "grad_norm": 1.4421638250350952, "learning_rate": 9.999999929376777e-05, "loss": 2.8453, "step": 1075 }, { "epoch": 0.02890297625443215, "grad_norm": 1.4105534553527832, "learning_rate": 9.99999992924471e-05, "loss": 2.9349, "step": 1076 }, { "epoch": 0.028929837756527344, "grad_norm": 1.4494372606277466, "learning_rate": 9.999999929112518e-05, "loss": 2.9975, "step": 1077 }, { "epoch": 0.028956699258622542, "grad_norm": 1.558030605316162, "learning_rate": 9.999999928980203e-05, "loss": 3.5427, "step": 1078 }, { "epoch": 0.02898356076071774, "grad_norm": 1.4982047080993652, "learning_rate": 9.999999928847767e-05, "loss": 3.0858, "step": 1079 }, { "epoch": 0.029010422262812936, "grad_norm": 1.428868293762207, "learning_rate": 9.999999928715205e-05, "loss": 2.9926, "step": 1080 }, { "epoch": 0.029037283764908134, "grad_norm": 1.652311086654663, "learning_rate": 9.99999992858252e-05, "loss": 3.0683, "step": 1081 }, { "epoch": 0.029064145267003332, "grad_norm": 1.5730986595153809, "learning_rate": 9.999999928449713e-05, "loss": 3.17, "step": 1082 }, { "epoch": 0.029091006769098527, "grad_norm": 1.486251950263977, "learning_rate": 9.999999928316781e-05, "loss": 2.8794, "step": 1083 }, { "epoch": 0.029117868271193725, "grad_norm": 1.6299165487289429, "learning_rate": 9.999999928183727e-05, "loss": 3.1198, "step": 1084 }, { "epoch": 0.029144729773288924, "grad_norm": 1.542945146560669, "learning_rate": 9.999999928050549e-05, "loss": 3.2898, "step": 1085 }, { "epoch": 0.02917159127538412, "grad_norm": 1.529078722000122, "learning_rate": 9.999999927917247e-05, "loss": 3.0489, "step": 1086 }, { "epoch": 0.029198452777479317, "grad_norm": 1.5292929410934448, "learning_rate": 9.999999927783823e-05, "loss": 3.3094, "step": 1087 }, { "epoch": 0.029225314279574515, "grad_norm": 1.4960463047027588, "learning_rate": 9.999999927650275e-05, "loss": 3.3103, "step": 1088 }, { "epoch": 0.02925217578166971, "grad_norm": 1.602150797843933, "learning_rate": 9.999999927516603e-05, "loss": 3.2901, "step": 1089 }, { "epoch": 0.029279037283764908, "grad_norm": 1.5850414037704468, "learning_rate": 9.999999927382809e-05, "loss": 3.1176, "step": 1090 }, { "epoch": 0.029305898785860107, "grad_norm": 1.6117327213287354, "learning_rate": 9.99999992724889e-05, "loss": 3.1988, "step": 1091 }, { "epoch": 0.0293327602879553, "grad_norm": 1.6982961893081665, "learning_rate": 9.999999927114848e-05, "loss": 3.0294, "step": 1092 }, { "epoch": 0.0293596217900505, "grad_norm": 1.521851658821106, "learning_rate": 9.999999926980684e-05, "loss": 3.2821, "step": 1093 }, { "epoch": 0.029386483292145698, "grad_norm": 1.780248761177063, "learning_rate": 9.999999926846396e-05, "loss": 3.6807, "step": 1094 }, { "epoch": 0.029413344794240893, "grad_norm": 1.5586732625961304, "learning_rate": 9.999999926711983e-05, "loss": 3.1557, "step": 1095 }, { "epoch": 0.02944020629633609, "grad_norm": 1.6442537307739258, "learning_rate": 9.999999926577449e-05, "loss": 3.1615, "step": 1096 }, { "epoch": 0.02946706779843129, "grad_norm": 1.8581600189208984, "learning_rate": 9.999999926442789e-05, "loss": 3.6989, "step": 1097 }, { "epoch": 0.029493929300526484, "grad_norm": 2.0552151203155518, "learning_rate": 9.999999926308008e-05, "loss": 3.532, "step": 1098 }, { "epoch": 0.029520790802621683, "grad_norm": 1.9782418012619019, "learning_rate": 9.999999926173102e-05, "loss": 3.4415, "step": 1099 }, { "epoch": 0.02954765230471688, "grad_norm": 1.8358986377716064, "learning_rate": 9.999999926038073e-05, "loss": 3.2857, "step": 1100 }, { "epoch": 0.029574513806812076, "grad_norm": 1.302351951599121, "learning_rate": 9.99999992590292e-05, "loss": 3.3119, "step": 1101 }, { "epoch": 0.029601375308907274, "grad_norm": 1.38497793674469, "learning_rate": 9.999999925767647e-05, "loss": 3.249, "step": 1102 }, { "epoch": 0.029628236811002472, "grad_norm": 1.4507490396499634, "learning_rate": 9.999999925632248e-05, "loss": 2.9888, "step": 1103 }, { "epoch": 0.029655098313097667, "grad_norm": 1.4330673217773438, "learning_rate": 9.999999925496725e-05, "loss": 2.9996, "step": 1104 }, { "epoch": 0.029681959815192865, "grad_norm": 1.317047357559204, "learning_rate": 9.99999992536108e-05, "loss": 2.8336, "step": 1105 }, { "epoch": 0.029708821317288064, "grad_norm": 1.5266220569610596, "learning_rate": 9.999999925225312e-05, "loss": 2.9996, "step": 1106 }, { "epoch": 0.02973568281938326, "grad_norm": 1.4145293235778809, "learning_rate": 9.999999925089418e-05, "loss": 3.0004, "step": 1107 }, { "epoch": 0.029762544321478457, "grad_norm": 1.4431809186935425, "learning_rate": 9.999999924953403e-05, "loss": 2.85, "step": 1108 }, { "epoch": 0.029789405823573655, "grad_norm": 1.3787111043930054, "learning_rate": 9.999999924817265e-05, "loss": 2.9795, "step": 1109 }, { "epoch": 0.02981626732566885, "grad_norm": 1.3092586994171143, "learning_rate": 9.999999924681003e-05, "loss": 3.0196, "step": 1110 }, { "epoch": 0.02984312882776405, "grad_norm": 1.555515170097351, "learning_rate": 9.999999924544617e-05, "loss": 2.9242, "step": 1111 }, { "epoch": 0.029869990329859247, "grad_norm": 1.4019018411636353, "learning_rate": 9.999999924408108e-05, "loss": 3.1464, "step": 1112 }, { "epoch": 0.02989685183195444, "grad_norm": 1.4033538103103638, "learning_rate": 9.999999924271475e-05, "loss": 3.0895, "step": 1113 }, { "epoch": 0.02992371333404964, "grad_norm": 1.3757456541061401, "learning_rate": 9.99999992413472e-05, "loss": 2.8177, "step": 1114 }, { "epoch": 0.029950574836144838, "grad_norm": 1.557436227798462, "learning_rate": 9.99999992399784e-05, "loss": 3.0616, "step": 1115 }, { "epoch": 0.029977436338240033, "grad_norm": 1.4687376022338867, "learning_rate": 9.999999923860838e-05, "loss": 3.2204, "step": 1116 }, { "epoch": 0.03000429784033523, "grad_norm": 1.4528030157089233, "learning_rate": 9.999999923723712e-05, "loss": 3.104, "step": 1117 }, { "epoch": 0.03003115934243043, "grad_norm": 1.4383186101913452, "learning_rate": 9.999999923586463e-05, "loss": 3.0886, "step": 1118 }, { "epoch": 0.030058020844525624, "grad_norm": 1.5394346714019775, "learning_rate": 9.99999992344909e-05, "loss": 3.123, "step": 1119 }, { "epoch": 0.030084882346620823, "grad_norm": 1.4198474884033203, "learning_rate": 9.999999923311594e-05, "loss": 2.8897, "step": 1120 }, { "epoch": 0.03011174384871602, "grad_norm": 1.4363526105880737, "learning_rate": 9.999999923173975e-05, "loss": 2.726, "step": 1121 }, { "epoch": 0.030138605350811216, "grad_norm": 1.4953033924102783, "learning_rate": 9.999999923036231e-05, "loss": 2.732, "step": 1122 }, { "epoch": 0.030165466852906414, "grad_norm": 1.4169050455093384, "learning_rate": 9.999999922898367e-05, "loss": 3.1504, "step": 1123 }, { "epoch": 0.030192328355001612, "grad_norm": 1.4073117971420288, "learning_rate": 9.999999922760377e-05, "loss": 2.8981, "step": 1124 }, { "epoch": 0.030219189857096807, "grad_norm": 1.4770647287368774, "learning_rate": 9.999999922622263e-05, "loss": 3.1128, "step": 1125 }, { "epoch": 0.030246051359192005, "grad_norm": 1.5720592737197876, "learning_rate": 9.999999922484027e-05, "loss": 3.1258, "step": 1126 }, { "epoch": 0.030272912861287204, "grad_norm": 1.4397382736206055, "learning_rate": 9.999999922345668e-05, "loss": 2.8535, "step": 1127 }, { "epoch": 0.0302997743633824, "grad_norm": 1.7102172374725342, "learning_rate": 9.999999922207185e-05, "loss": 3.1901, "step": 1128 }, { "epoch": 0.030326635865477597, "grad_norm": 1.5595030784606934, "learning_rate": 9.99999992206858e-05, "loss": 3.0476, "step": 1129 }, { "epoch": 0.030353497367572795, "grad_norm": 1.5575852394104004, "learning_rate": 9.999999921929848e-05, "loss": 3.0828, "step": 1130 }, { "epoch": 0.030380358869667994, "grad_norm": 1.4322677850723267, "learning_rate": 9.999999921790995e-05, "loss": 3.2307, "step": 1131 }, { "epoch": 0.03040722037176319, "grad_norm": 1.57375967502594, "learning_rate": 9.99999992165202e-05, "loss": 2.9122, "step": 1132 }, { "epoch": 0.030434081873858387, "grad_norm": 1.4671905040740967, "learning_rate": 9.999999921512919e-05, "loss": 2.8622, "step": 1133 }, { "epoch": 0.030460943375953585, "grad_norm": 1.4324138164520264, "learning_rate": 9.999999921373696e-05, "loss": 2.8266, "step": 1134 }, { "epoch": 0.03048780487804878, "grad_norm": 1.4229416847229004, "learning_rate": 9.99999992123435e-05, "loss": 2.8596, "step": 1135 }, { "epoch": 0.030514666380143978, "grad_norm": 1.6876296997070312, "learning_rate": 9.999999921094881e-05, "loss": 3.2123, "step": 1136 }, { "epoch": 0.030541527882239176, "grad_norm": 1.5105502605438232, "learning_rate": 9.999999920955287e-05, "loss": 3.0074, "step": 1137 }, { "epoch": 0.03056838938433437, "grad_norm": 1.645257830619812, "learning_rate": 9.99999992081557e-05, "loss": 3.1036, "step": 1138 }, { "epoch": 0.03059525088642957, "grad_norm": 1.5362334251403809, "learning_rate": 9.99999992067573e-05, "loss": 3.2537, "step": 1139 }, { "epoch": 0.030622112388524768, "grad_norm": 1.575076699256897, "learning_rate": 9.999999920535766e-05, "loss": 3.1689, "step": 1140 }, { "epoch": 0.030648973890619963, "grad_norm": 1.5552635192871094, "learning_rate": 9.99999992039568e-05, "loss": 3.0966, "step": 1141 }, { "epoch": 0.03067583539271516, "grad_norm": 1.720339059829712, "learning_rate": 9.99999992025547e-05, "loss": 3.1505, "step": 1142 }, { "epoch": 0.03070269689481036, "grad_norm": 1.7185444831848145, "learning_rate": 9.999999920115136e-05, "loss": 3.1868, "step": 1143 }, { "epoch": 0.030729558396905554, "grad_norm": 1.6548991203308105, "learning_rate": 9.999999919974679e-05, "loss": 3.1969, "step": 1144 }, { "epoch": 0.030756419899000752, "grad_norm": 1.5805326700210571, "learning_rate": 9.999999919834099e-05, "loss": 3.292, "step": 1145 }, { "epoch": 0.03078328140109595, "grad_norm": 1.8295358419418335, "learning_rate": 9.999999919693395e-05, "loss": 3.3623, "step": 1146 }, { "epoch": 0.030810142903191146, "grad_norm": 1.737511157989502, "learning_rate": 9.999999919552568e-05, "loss": 2.9244, "step": 1147 }, { "epoch": 0.030837004405286344, "grad_norm": 1.8514723777770996, "learning_rate": 9.999999919411618e-05, "loss": 3.5062, "step": 1148 }, { "epoch": 0.030863865907381542, "grad_norm": 1.9149837493896484, "learning_rate": 9.999999919270544e-05, "loss": 3.6376, "step": 1149 }, { "epoch": 0.030890727409476737, "grad_norm": 1.8508201837539673, "learning_rate": 9.999999919129347e-05, "loss": 3.1467, "step": 1150 }, { "epoch": 0.030917588911571935, "grad_norm": 1.4321824312210083, "learning_rate": 9.999999918988027e-05, "loss": 3.0066, "step": 1151 }, { "epoch": 0.030944450413667134, "grad_norm": 1.5123839378356934, "learning_rate": 9.999999918846583e-05, "loss": 2.9124, "step": 1152 }, { "epoch": 0.03097131191576233, "grad_norm": 1.4272949695587158, "learning_rate": 9.999999918705016e-05, "loss": 2.8225, "step": 1153 }, { "epoch": 0.030998173417857527, "grad_norm": 1.405633568763733, "learning_rate": 9.999999918563326e-05, "loss": 2.7074, "step": 1154 }, { "epoch": 0.031025034919952725, "grad_norm": 1.487655520439148, "learning_rate": 9.99999991842151e-05, "loss": 3.0166, "step": 1155 }, { "epoch": 0.03105189642204792, "grad_norm": 1.4216954708099365, "learning_rate": 9.999999918279575e-05, "loss": 3.0085, "step": 1156 }, { "epoch": 0.031078757924143118, "grad_norm": 1.403308629989624, "learning_rate": 9.999999918137512e-05, "loss": 3.0478, "step": 1157 }, { "epoch": 0.031105619426238316, "grad_norm": 1.4875620603561401, "learning_rate": 9.999999917995328e-05, "loss": 2.9702, "step": 1158 }, { "epoch": 0.03113248092833351, "grad_norm": 1.4649858474731445, "learning_rate": 9.999999917853023e-05, "loss": 3.2984, "step": 1159 }, { "epoch": 0.03115934243042871, "grad_norm": 1.4901381731033325, "learning_rate": 9.999999917710591e-05, "loss": 3.0888, "step": 1160 }, { "epoch": 0.031186203932523908, "grad_norm": 1.4207940101623535, "learning_rate": 9.999999917568037e-05, "loss": 2.7057, "step": 1161 }, { "epoch": 0.031213065434619103, "grad_norm": 1.3899128437042236, "learning_rate": 9.999999917425359e-05, "loss": 2.9937, "step": 1162 }, { "epoch": 0.0312399269367143, "grad_norm": 1.36077082157135, "learning_rate": 9.999999917282559e-05, "loss": 2.7426, "step": 1163 }, { "epoch": 0.0312667884388095, "grad_norm": 1.2691547870635986, "learning_rate": 9.999999917139633e-05, "loss": 2.6888, "step": 1164 }, { "epoch": 0.031293649940904694, "grad_norm": 1.309258222579956, "learning_rate": 9.999999916996586e-05, "loss": 2.7449, "step": 1165 }, { "epoch": 0.031320511442999896, "grad_norm": 1.3002082109451294, "learning_rate": 9.999999916853415e-05, "loss": 2.8566, "step": 1166 }, { "epoch": 0.03134737294509509, "grad_norm": 1.4026422500610352, "learning_rate": 9.99999991671012e-05, "loss": 3.1578, "step": 1167 }, { "epoch": 0.031374234447190286, "grad_norm": 1.3621559143066406, "learning_rate": 9.999999916566702e-05, "loss": 2.9677, "step": 1168 }, { "epoch": 0.03140109594928549, "grad_norm": 1.408431887626648, "learning_rate": 9.999999916423162e-05, "loss": 2.8851, "step": 1169 }, { "epoch": 0.03142795745138068, "grad_norm": 1.5579081773757935, "learning_rate": 9.999999916279497e-05, "loss": 3.2076, "step": 1170 }, { "epoch": 0.03145481895347588, "grad_norm": 1.422640085220337, "learning_rate": 9.999999916135709e-05, "loss": 2.9792, "step": 1171 }, { "epoch": 0.03148168045557108, "grad_norm": 1.508641004562378, "learning_rate": 9.999999915991797e-05, "loss": 3.1418, "step": 1172 }, { "epoch": 0.031508541957666274, "grad_norm": 1.4941315650939941, "learning_rate": 9.999999915847764e-05, "loss": 2.9193, "step": 1173 }, { "epoch": 0.03153540345976147, "grad_norm": 1.4279118776321411, "learning_rate": 9.999999915703607e-05, "loss": 2.7299, "step": 1174 }, { "epoch": 0.03156226496185667, "grad_norm": 1.6342811584472656, "learning_rate": 9.999999915559324e-05, "loss": 3.1708, "step": 1175 }, { "epoch": 0.031589126463951865, "grad_norm": 1.4493217468261719, "learning_rate": 9.999999915414919e-05, "loss": 2.9587, "step": 1176 }, { "epoch": 0.03161598796604706, "grad_norm": 1.4111332893371582, "learning_rate": 9.999999915270392e-05, "loss": 2.8108, "step": 1177 }, { "epoch": 0.03164284946814226, "grad_norm": 1.5913037061691284, "learning_rate": 9.999999915125739e-05, "loss": 2.9306, "step": 1178 }, { "epoch": 0.03166971097023746, "grad_norm": 1.4138201475143433, "learning_rate": 9.999999914980966e-05, "loss": 2.9746, "step": 1179 }, { "epoch": 0.03169657247233265, "grad_norm": 1.542961835861206, "learning_rate": 9.999999914836066e-05, "loss": 3.1946, "step": 1180 }, { "epoch": 0.03172343397442785, "grad_norm": 1.4823840856552124, "learning_rate": 9.999999914691045e-05, "loss": 2.7606, "step": 1181 }, { "epoch": 0.03175029547652305, "grad_norm": 1.477311611175537, "learning_rate": 9.999999914545901e-05, "loss": 3.1212, "step": 1182 }, { "epoch": 0.03177715697861824, "grad_norm": 1.6379215717315674, "learning_rate": 9.999999914400633e-05, "loss": 2.98, "step": 1183 }, { "epoch": 0.031804018480713445, "grad_norm": 1.4688607454299927, "learning_rate": 9.99999991425524e-05, "loss": 2.9571, "step": 1184 }, { "epoch": 0.03183087998280864, "grad_norm": 1.4939223527908325, "learning_rate": 9.999999914109725e-05, "loss": 3.0431, "step": 1185 }, { "epoch": 0.031857741484903834, "grad_norm": 1.5596034526824951, "learning_rate": 9.999999913964087e-05, "loss": 3.1911, "step": 1186 }, { "epoch": 0.031884602986999036, "grad_norm": 1.693818211555481, "learning_rate": 9.999999913818326e-05, "loss": 3.0475, "step": 1187 }, { "epoch": 0.03191146448909423, "grad_norm": 1.6540929079055786, "learning_rate": 9.99999991367244e-05, "loss": 3.1208, "step": 1188 }, { "epoch": 0.031938325991189426, "grad_norm": 1.4901649951934814, "learning_rate": 9.999999913526433e-05, "loss": 2.9145, "step": 1189 }, { "epoch": 0.03196518749328463, "grad_norm": 1.585123896598816, "learning_rate": 9.9999999133803e-05, "loss": 3.1728, "step": 1190 }, { "epoch": 0.03199204899537982, "grad_norm": 1.743322730064392, "learning_rate": 9.999999913234044e-05, "loss": 3.4658, "step": 1191 }, { "epoch": 0.03201891049747502, "grad_norm": 1.6741468906402588, "learning_rate": 9.999999913087666e-05, "loss": 3.4217, "step": 1192 }, { "epoch": 0.03204577199957022, "grad_norm": 1.5321619510650635, "learning_rate": 9.999999912941164e-05, "loss": 3.0894, "step": 1193 }, { "epoch": 0.032072633501665414, "grad_norm": 1.6068675518035889, "learning_rate": 9.999999912794538e-05, "loss": 2.8289, "step": 1194 }, { "epoch": 0.03209949500376061, "grad_norm": 1.7000651359558105, "learning_rate": 9.99999991264779e-05, "loss": 3.2203, "step": 1195 }, { "epoch": 0.03212635650585581, "grad_norm": 1.652759313583374, "learning_rate": 9.999999912500917e-05, "loss": 3.2407, "step": 1196 }, { "epoch": 0.032153218007951005, "grad_norm": 1.8210004568099976, "learning_rate": 9.999999912353921e-05, "loss": 3.2567, "step": 1197 }, { "epoch": 0.0321800795100462, "grad_norm": 1.9404109716415405, "learning_rate": 9.999999912206803e-05, "loss": 3.3876, "step": 1198 }, { "epoch": 0.0322069410121414, "grad_norm": 1.9805941581726074, "learning_rate": 9.999999912059561e-05, "loss": 3.4857, "step": 1199 }, { "epoch": 0.0322338025142366, "grad_norm": 1.789170742034912, "learning_rate": 9.999999911912195e-05, "loss": 3.3542, "step": 1200 }, { "epoch": 0.03226066401633179, "grad_norm": 1.601011872291565, "learning_rate": 9.999999911764706e-05, "loss": 3.0094, "step": 1201 }, { "epoch": 0.03228752551842699, "grad_norm": 1.5233467817306519, "learning_rate": 9.999999911617094e-05, "loss": 2.9386, "step": 1202 }, { "epoch": 0.03231438702052219, "grad_norm": 1.4733026027679443, "learning_rate": 9.999999911469358e-05, "loss": 2.6907, "step": 1203 }, { "epoch": 0.03234124852261738, "grad_norm": 1.5229771137237549, "learning_rate": 9.999999911321499e-05, "loss": 2.9349, "step": 1204 }, { "epoch": 0.032368110024712585, "grad_norm": 1.4411981105804443, "learning_rate": 9.999999911173516e-05, "loss": 2.7616, "step": 1205 }, { "epoch": 0.03239497152680778, "grad_norm": 1.3132145404815674, "learning_rate": 9.99999991102541e-05, "loss": 2.7224, "step": 1206 }, { "epoch": 0.032421833028902974, "grad_norm": 1.500889778137207, "learning_rate": 9.999999910877182e-05, "loss": 3.0192, "step": 1207 }, { "epoch": 0.032448694530998176, "grad_norm": 1.2993311882019043, "learning_rate": 9.999999910728829e-05, "loss": 2.7571, "step": 1208 }, { "epoch": 0.03247555603309337, "grad_norm": 1.5036752223968506, "learning_rate": 9.999999910580353e-05, "loss": 3.0489, "step": 1209 }, { "epoch": 0.032502417535188566, "grad_norm": 1.3930540084838867, "learning_rate": 9.999999910431754e-05, "loss": 2.6477, "step": 1210 }, { "epoch": 0.03252927903728377, "grad_norm": 1.3620692491531372, "learning_rate": 9.999999910283031e-05, "loss": 2.7436, "step": 1211 }, { "epoch": 0.03255614053937896, "grad_norm": 1.5991593599319458, "learning_rate": 9.999999910134186e-05, "loss": 2.979, "step": 1212 }, { "epoch": 0.03258300204147416, "grad_norm": 1.58819580078125, "learning_rate": 9.999999909985216e-05, "loss": 3.0439, "step": 1213 }, { "epoch": 0.03260986354356936, "grad_norm": 1.4783276319503784, "learning_rate": 9.999999909836123e-05, "loss": 2.8303, "step": 1214 }, { "epoch": 0.032636725045664554, "grad_norm": 1.3597859144210815, "learning_rate": 9.999999909686907e-05, "loss": 2.6549, "step": 1215 }, { "epoch": 0.03266358654775975, "grad_norm": 1.3881914615631104, "learning_rate": 9.999999909537567e-05, "loss": 2.9567, "step": 1216 }, { "epoch": 0.03269044804985495, "grad_norm": 1.3996658325195312, "learning_rate": 9.999999909388105e-05, "loss": 2.7215, "step": 1217 }, { "epoch": 0.032717309551950145, "grad_norm": 1.3187663555145264, "learning_rate": 9.999999909238517e-05, "loss": 2.8384, "step": 1218 }, { "epoch": 0.03274417105404534, "grad_norm": 1.509340763092041, "learning_rate": 9.999999909088808e-05, "loss": 2.9029, "step": 1219 }, { "epoch": 0.03277103255614054, "grad_norm": 1.4753671884536743, "learning_rate": 9.999999908938975e-05, "loss": 3.2016, "step": 1220 }, { "epoch": 0.03279789405823574, "grad_norm": 1.4223262071609497, "learning_rate": 9.99999990878902e-05, "loss": 2.8779, "step": 1221 }, { "epoch": 0.03282475556033093, "grad_norm": 1.3905526399612427, "learning_rate": 9.99999990863894e-05, "loss": 2.8502, "step": 1222 }, { "epoch": 0.03285161706242613, "grad_norm": 1.6468901634216309, "learning_rate": 9.999999908488736e-05, "loss": 3.287, "step": 1223 }, { "epoch": 0.03287847856452133, "grad_norm": 1.385778546333313, "learning_rate": 9.99999990833841e-05, "loss": 2.8047, "step": 1224 }, { "epoch": 0.03290534006661652, "grad_norm": 1.3317996263504028, "learning_rate": 9.999999908187961e-05, "loss": 2.9318, "step": 1225 }, { "epoch": 0.032932201568711725, "grad_norm": 1.4120289087295532, "learning_rate": 9.999999908037387e-05, "loss": 3.1235, "step": 1226 }, { "epoch": 0.03295906307080692, "grad_norm": 1.570051670074463, "learning_rate": 9.999999907886691e-05, "loss": 3.1125, "step": 1227 }, { "epoch": 0.032985924572902114, "grad_norm": 1.5419656038284302, "learning_rate": 9.999999907735871e-05, "loss": 2.9827, "step": 1228 }, { "epoch": 0.033012786074997316, "grad_norm": 1.5014476776123047, "learning_rate": 9.999999907584927e-05, "loss": 3.1199, "step": 1229 }, { "epoch": 0.03303964757709251, "grad_norm": 1.653408169746399, "learning_rate": 9.99999990743386e-05, "loss": 3.2555, "step": 1230 }, { "epoch": 0.033066509079187706, "grad_norm": 1.3667315244674683, "learning_rate": 9.99999990728267e-05, "loss": 2.9395, "step": 1231 }, { "epoch": 0.03309337058128291, "grad_norm": 1.576900839805603, "learning_rate": 9.999999907131357e-05, "loss": 3.0171, "step": 1232 }, { "epoch": 0.0331202320833781, "grad_norm": 1.6688435077667236, "learning_rate": 9.99999990697992e-05, "loss": 3.283, "step": 1233 }, { "epoch": 0.0331470935854733, "grad_norm": 1.5353676080703735, "learning_rate": 9.999999906828359e-05, "loss": 3.0857, "step": 1234 }, { "epoch": 0.0331739550875685, "grad_norm": 1.4716721773147583, "learning_rate": 9.999999906676676e-05, "loss": 2.8933, "step": 1235 }, { "epoch": 0.033200816589663694, "grad_norm": 1.624703288078308, "learning_rate": 9.999999906524868e-05, "loss": 3.1667, "step": 1236 }, { "epoch": 0.03322767809175889, "grad_norm": 1.4201050996780396, "learning_rate": 9.999999906372939e-05, "loss": 3.0671, "step": 1237 }, { "epoch": 0.03325453959385409, "grad_norm": 1.5696899890899658, "learning_rate": 9.999999906220885e-05, "loss": 2.8465, "step": 1238 }, { "epoch": 0.033281401095949285, "grad_norm": 1.5536811351776123, "learning_rate": 9.999999906068709e-05, "loss": 3.101, "step": 1239 }, { "epoch": 0.03330826259804448, "grad_norm": 1.5983662605285645, "learning_rate": 9.999999905916408e-05, "loss": 3.286, "step": 1240 }, { "epoch": 0.03333512410013968, "grad_norm": 1.7818171977996826, "learning_rate": 9.999999905763984e-05, "loss": 3.3457, "step": 1241 }, { "epoch": 0.03336198560223488, "grad_norm": 1.4938325881958008, "learning_rate": 9.999999905611437e-05, "loss": 3.1784, "step": 1242 }, { "epoch": 0.03338884710433007, "grad_norm": 1.4970808029174805, "learning_rate": 9.999999905458765e-05, "loss": 3.1908, "step": 1243 }, { "epoch": 0.03341570860642527, "grad_norm": 1.6312114000320435, "learning_rate": 9.999999905305972e-05, "loss": 3.2914, "step": 1244 }, { "epoch": 0.03344257010852047, "grad_norm": 1.6739710569381714, "learning_rate": 9.999999905153055e-05, "loss": 3.2133, "step": 1245 }, { "epoch": 0.03346943161061566, "grad_norm": 1.586802363395691, "learning_rate": 9.999999905000015e-05, "loss": 3.1242, "step": 1246 }, { "epoch": 0.033496293112710865, "grad_norm": 1.670663595199585, "learning_rate": 9.999999904846851e-05, "loss": 3.2219, "step": 1247 }, { "epoch": 0.03352315461480606, "grad_norm": 1.7760072946548462, "learning_rate": 9.999999904693564e-05, "loss": 3.0606, "step": 1248 }, { "epoch": 0.033550016116901255, "grad_norm": 2.1186087131500244, "learning_rate": 9.999999904540152e-05, "loss": 3.6282, "step": 1249 }, { "epoch": 0.033576877618996456, "grad_norm": 1.7062263488769531, "learning_rate": 9.999999904386619e-05, "loss": 3.3226, "step": 1250 }, { "epoch": 0.03360373912109165, "grad_norm": 1.3062018156051636, "learning_rate": 9.99999990423296e-05, "loss": 2.91, "step": 1251 }, { "epoch": 0.033630600623186846, "grad_norm": 1.5175307989120483, "learning_rate": 9.99999990407918e-05, "loss": 2.8442, "step": 1252 }, { "epoch": 0.03365746212528205, "grad_norm": 1.4948550462722778, "learning_rate": 9.999999903925276e-05, "loss": 3.1676, "step": 1253 }, { "epoch": 0.03368432362737724, "grad_norm": 1.4918733835220337, "learning_rate": 9.999999903771249e-05, "loss": 2.6735, "step": 1254 }, { "epoch": 0.03371118512947244, "grad_norm": 1.4677417278289795, "learning_rate": 9.999999903617097e-05, "loss": 3.114, "step": 1255 }, { "epoch": 0.03373804663156764, "grad_norm": 1.4189213514328003, "learning_rate": 9.999999903462822e-05, "loss": 3.101, "step": 1256 }, { "epoch": 0.033764908133662834, "grad_norm": 1.2749756574630737, "learning_rate": 9.999999903308426e-05, "loss": 2.7508, "step": 1257 }, { "epoch": 0.03379176963575803, "grad_norm": 1.621442437171936, "learning_rate": 9.999999903153904e-05, "loss": 3.0446, "step": 1258 }, { "epoch": 0.03381863113785323, "grad_norm": 1.3713841438293457, "learning_rate": 9.999999902999259e-05, "loss": 2.8511, "step": 1259 }, { "epoch": 0.033845492639948425, "grad_norm": 1.3999048471450806, "learning_rate": 9.999999902844492e-05, "loss": 3.0518, "step": 1260 }, { "epoch": 0.03387235414204362, "grad_norm": 1.4194591045379639, "learning_rate": 9.9999999026896e-05, "loss": 3.0724, "step": 1261 }, { "epoch": 0.03389921564413882, "grad_norm": 1.4514492750167847, "learning_rate": 9.999999902534586e-05, "loss": 3.3577, "step": 1262 }, { "epoch": 0.03392607714623402, "grad_norm": 1.4304295778274536, "learning_rate": 9.999999902379449e-05, "loss": 2.897, "step": 1263 }, { "epoch": 0.03395293864832921, "grad_norm": 1.552476406097412, "learning_rate": 9.999999902224186e-05, "loss": 3.0367, "step": 1264 }, { "epoch": 0.033979800150424413, "grad_norm": 1.3050061464309692, "learning_rate": 9.999999902068802e-05, "loss": 2.7819, "step": 1265 }, { "epoch": 0.03400666165251961, "grad_norm": 1.317513346672058, "learning_rate": 9.999999901913294e-05, "loss": 2.7208, "step": 1266 }, { "epoch": 0.0340335231546148, "grad_norm": 1.4023017883300781, "learning_rate": 9.999999901757663e-05, "loss": 2.8803, "step": 1267 }, { "epoch": 0.034060384656710005, "grad_norm": 1.3437764644622803, "learning_rate": 9.999999901601908e-05, "loss": 2.7362, "step": 1268 }, { "epoch": 0.0340872461588052, "grad_norm": 1.3950257301330566, "learning_rate": 9.99999990144603e-05, "loss": 2.9147, "step": 1269 }, { "epoch": 0.034114107660900395, "grad_norm": 1.3344415426254272, "learning_rate": 9.999999901290028e-05, "loss": 2.9881, "step": 1270 }, { "epoch": 0.034140969162995596, "grad_norm": 1.3639448881149292, "learning_rate": 9.999999901133903e-05, "loss": 3.0179, "step": 1271 }, { "epoch": 0.03416783066509079, "grad_norm": 1.5759127140045166, "learning_rate": 9.999999900977654e-05, "loss": 3.1648, "step": 1272 }, { "epoch": 0.034194692167185986, "grad_norm": 1.438621163368225, "learning_rate": 9.999999900821284e-05, "loss": 3.3035, "step": 1273 }, { "epoch": 0.03422155366928119, "grad_norm": 1.6033320426940918, "learning_rate": 9.999999900664789e-05, "loss": 3.0717, "step": 1274 }, { "epoch": 0.03424841517137638, "grad_norm": 1.4944597482681274, "learning_rate": 9.99999990050817e-05, "loss": 3.1131, "step": 1275 }, { "epoch": 0.03427527667347158, "grad_norm": 1.5210902690887451, "learning_rate": 9.999999900351429e-05, "loss": 2.9868, "step": 1276 }, { "epoch": 0.03430213817556678, "grad_norm": 1.703231930732727, "learning_rate": 9.999999900194564e-05, "loss": 3.0296, "step": 1277 }, { "epoch": 0.034328999677661974, "grad_norm": 1.466875433921814, "learning_rate": 9.999999900037575e-05, "loss": 2.7917, "step": 1278 }, { "epoch": 0.03435586117975717, "grad_norm": 1.3921716213226318, "learning_rate": 9.999999899880464e-05, "loss": 2.8462, "step": 1279 }, { "epoch": 0.03438272268185237, "grad_norm": 1.496484637260437, "learning_rate": 9.999999899723228e-05, "loss": 2.8478, "step": 1280 }, { "epoch": 0.034409584183947566, "grad_norm": 1.4175546169281006, "learning_rate": 9.99999989956587e-05, "loss": 3.0167, "step": 1281 }, { "epoch": 0.03443644568604276, "grad_norm": 1.5685076713562012, "learning_rate": 9.999999899408387e-05, "loss": 2.8429, "step": 1282 }, { "epoch": 0.03446330718813796, "grad_norm": 1.6189390420913696, "learning_rate": 9.999999899250783e-05, "loss": 3.1261, "step": 1283 }, { "epoch": 0.03449016869023316, "grad_norm": 1.6355414390563965, "learning_rate": 9.999999899093054e-05, "loss": 3.1554, "step": 1284 }, { "epoch": 0.03451703019232835, "grad_norm": 1.6216799020767212, "learning_rate": 9.999999898935202e-05, "loss": 3.1108, "step": 1285 }, { "epoch": 0.034543891694423554, "grad_norm": 1.700219988822937, "learning_rate": 9.999999898777225e-05, "loss": 3.1269, "step": 1286 }, { "epoch": 0.03457075319651875, "grad_norm": 1.6669987440109253, "learning_rate": 9.999999898619127e-05, "loss": 3.1803, "step": 1287 }, { "epoch": 0.03459761469861394, "grad_norm": 1.504661202430725, "learning_rate": 9.999999898460905e-05, "loss": 3.1455, "step": 1288 }, { "epoch": 0.034624476200709145, "grad_norm": 1.5908552408218384, "learning_rate": 9.999999898302559e-05, "loss": 3.1618, "step": 1289 }, { "epoch": 0.03465133770280434, "grad_norm": 1.505043864250183, "learning_rate": 9.999999898144091e-05, "loss": 3.1347, "step": 1290 }, { "epoch": 0.034678199204899535, "grad_norm": 1.5669896602630615, "learning_rate": 9.999999897985498e-05, "loss": 3.2465, "step": 1291 }, { "epoch": 0.034705060706994736, "grad_norm": 1.8743952512741089, "learning_rate": 9.999999897826781e-05, "loss": 3.258, "step": 1292 }, { "epoch": 0.03473192220908993, "grad_norm": 1.6266064643859863, "learning_rate": 9.999999897667943e-05, "loss": 3.3005, "step": 1293 }, { "epoch": 0.034758783711185126, "grad_norm": 1.530107021331787, "learning_rate": 9.99999989750898e-05, "loss": 3.2074, "step": 1294 }, { "epoch": 0.03478564521328033, "grad_norm": 1.5905359983444214, "learning_rate": 9.999999897349895e-05, "loss": 3.1991, "step": 1295 }, { "epoch": 0.03481250671537552, "grad_norm": 1.6169246435165405, "learning_rate": 9.999999897190687e-05, "loss": 3.0985, "step": 1296 }, { "epoch": 0.03483936821747072, "grad_norm": 1.626260757446289, "learning_rate": 9.999999897031354e-05, "loss": 3.258, "step": 1297 }, { "epoch": 0.03486622971956592, "grad_norm": 1.6810840368270874, "learning_rate": 9.999999896871897e-05, "loss": 3.2958, "step": 1298 }, { "epoch": 0.034893091221661114, "grad_norm": 1.7200833559036255, "learning_rate": 9.999999896712319e-05, "loss": 3.2765, "step": 1299 }, { "epoch": 0.03491995272375631, "grad_norm": 1.8822802305221558, "learning_rate": 9.999999896552616e-05, "loss": 3.6315, "step": 1300 }, { "epoch": 0.03494681422585151, "grad_norm": 1.3117274045944214, "learning_rate": 9.99999989639279e-05, "loss": 3.141, "step": 1301 }, { "epoch": 0.034973675727946706, "grad_norm": 1.3925999402999878, "learning_rate": 9.999999896232841e-05, "loss": 2.7824, "step": 1302 }, { "epoch": 0.03500053723004191, "grad_norm": 1.2862520217895508, "learning_rate": 9.999999896072767e-05, "loss": 2.5686, "step": 1303 }, { "epoch": 0.0350273987321371, "grad_norm": 1.3466994762420654, "learning_rate": 9.999999895912572e-05, "loss": 3.005, "step": 1304 }, { "epoch": 0.0350542602342323, "grad_norm": 1.3251762390136719, "learning_rate": 9.999999895752253e-05, "loss": 3.1006, "step": 1305 }, { "epoch": 0.0350811217363275, "grad_norm": 1.561327338218689, "learning_rate": 9.99999989559181e-05, "loss": 2.9555, "step": 1306 }, { "epoch": 0.035107983238422694, "grad_norm": 1.4351792335510254, "learning_rate": 9.999999895431242e-05, "loss": 3.1577, "step": 1307 }, { "epoch": 0.03513484474051789, "grad_norm": 1.4175413846969604, "learning_rate": 9.999999895270554e-05, "loss": 2.973, "step": 1308 }, { "epoch": 0.03516170624261309, "grad_norm": 1.4891241788864136, "learning_rate": 9.999999895109741e-05, "loss": 3.0839, "step": 1309 }, { "epoch": 0.035188567744708285, "grad_norm": 1.434227705001831, "learning_rate": 9.999999894948803e-05, "loss": 3.0451, "step": 1310 }, { "epoch": 0.03521542924680348, "grad_norm": 1.4105538129806519, "learning_rate": 9.999999894787745e-05, "loss": 2.9684, "step": 1311 }, { "epoch": 0.03524229074889868, "grad_norm": 1.4166136980056763, "learning_rate": 9.999999894626562e-05, "loss": 2.9175, "step": 1312 }, { "epoch": 0.035269152250993877, "grad_norm": 1.3854029178619385, "learning_rate": 9.999999894465254e-05, "loss": 2.9375, "step": 1313 }, { "epoch": 0.03529601375308907, "grad_norm": 1.3970662355422974, "learning_rate": 9.999999894303825e-05, "loss": 2.8387, "step": 1314 }, { "epoch": 0.03532287525518427, "grad_norm": 1.337108850479126, "learning_rate": 9.999999894142272e-05, "loss": 2.8672, "step": 1315 }, { "epoch": 0.03534973675727947, "grad_norm": 1.4422601461410522, "learning_rate": 9.999999893980596e-05, "loss": 2.8305, "step": 1316 }, { "epoch": 0.03537659825937466, "grad_norm": 1.3969252109527588, "learning_rate": 9.999999893818795e-05, "loss": 2.903, "step": 1317 }, { "epoch": 0.035403459761469865, "grad_norm": 1.3909648656845093, "learning_rate": 9.999999893656873e-05, "loss": 2.7991, "step": 1318 }, { "epoch": 0.03543032126356506, "grad_norm": 1.3882468938827515, "learning_rate": 9.999999893494826e-05, "loss": 2.9512, "step": 1319 }, { "epoch": 0.035457182765660254, "grad_norm": 1.487926721572876, "learning_rate": 9.999999893332656e-05, "loss": 2.8579, "step": 1320 }, { "epoch": 0.035484044267755456, "grad_norm": 1.5172538757324219, "learning_rate": 9.999999893170362e-05, "loss": 3.0704, "step": 1321 }, { "epoch": 0.03551090576985065, "grad_norm": 1.4259088039398193, "learning_rate": 9.999999893007946e-05, "loss": 2.6893, "step": 1322 }, { "epoch": 0.035537767271945846, "grad_norm": 1.4706926345825195, "learning_rate": 9.999999892845406e-05, "loss": 3.0833, "step": 1323 }, { "epoch": 0.03556462877404105, "grad_norm": 1.39505934715271, "learning_rate": 9.999999892682742e-05, "loss": 2.9461, "step": 1324 }, { "epoch": 0.03559149027613624, "grad_norm": 1.4205573797225952, "learning_rate": 9.999999892519956e-05, "loss": 3.0997, "step": 1325 }, { "epoch": 0.03561835177823144, "grad_norm": 1.408349871635437, "learning_rate": 9.999999892357045e-05, "loss": 3.0815, "step": 1326 }, { "epoch": 0.03564521328032664, "grad_norm": 1.4647812843322754, "learning_rate": 9.999999892194012e-05, "loss": 2.9036, "step": 1327 }, { "epoch": 0.035672074782421834, "grad_norm": 1.5610274076461792, "learning_rate": 9.999999892030855e-05, "loss": 3.0536, "step": 1328 }, { "epoch": 0.03569893628451703, "grad_norm": 1.4923373460769653, "learning_rate": 9.999999891867574e-05, "loss": 3.1588, "step": 1329 }, { "epoch": 0.03572579778661223, "grad_norm": 1.4299325942993164, "learning_rate": 9.999999891704171e-05, "loss": 2.7535, "step": 1330 }, { "epoch": 0.035752659288707425, "grad_norm": 1.51185142993927, "learning_rate": 9.999999891540644e-05, "loss": 3.0361, "step": 1331 }, { "epoch": 0.03577952079080262, "grad_norm": 1.6059998273849487, "learning_rate": 9.999999891376993e-05, "loss": 2.9251, "step": 1332 }, { "epoch": 0.03580638229289782, "grad_norm": 1.5953091382980347, "learning_rate": 9.999999891213219e-05, "loss": 3.2808, "step": 1333 }, { "epoch": 0.03583324379499302, "grad_norm": 1.534511685371399, "learning_rate": 9.999999891049322e-05, "loss": 3.1435, "step": 1334 }, { "epoch": 0.03586010529708821, "grad_norm": 1.5457303524017334, "learning_rate": 9.999999890885302e-05, "loss": 3.3197, "step": 1335 }, { "epoch": 0.03588696679918341, "grad_norm": 1.5344340801239014, "learning_rate": 9.999999890721156e-05, "loss": 3.2265, "step": 1336 }, { "epoch": 0.03591382830127861, "grad_norm": 1.5263392925262451, "learning_rate": 9.99999989055689e-05, "loss": 3.2468, "step": 1337 }, { "epoch": 0.0359406898033738, "grad_norm": 1.5331720113754272, "learning_rate": 9.999999890392499e-05, "loss": 2.9191, "step": 1338 }, { "epoch": 0.035967551305469005, "grad_norm": 1.6845088005065918, "learning_rate": 9.999999890227985e-05, "loss": 3.4195, "step": 1339 }, { "epoch": 0.0359944128075642, "grad_norm": 1.6606717109680176, "learning_rate": 9.999999890063348e-05, "loss": 3.254, "step": 1340 }, { "epoch": 0.036021274309659394, "grad_norm": 1.5407532453536987, "learning_rate": 9.999999889898587e-05, "loss": 3.1018, "step": 1341 }, { "epoch": 0.036048135811754596, "grad_norm": 1.6971338987350464, "learning_rate": 9.999999889733703e-05, "loss": 3.1919, "step": 1342 }, { "epoch": 0.03607499731384979, "grad_norm": 1.7509889602661133, "learning_rate": 9.999999889568695e-05, "loss": 3.4225, "step": 1343 }, { "epoch": 0.036101858815944986, "grad_norm": 1.6628385782241821, "learning_rate": 9.999999889403564e-05, "loss": 3.1497, "step": 1344 }, { "epoch": 0.03612872031804019, "grad_norm": 1.778279185295105, "learning_rate": 9.999999889238309e-05, "loss": 3.6656, "step": 1345 }, { "epoch": 0.03615558182013538, "grad_norm": 1.6677271127700806, "learning_rate": 9.999999889072933e-05, "loss": 3.2719, "step": 1346 }, { "epoch": 0.03618244332223058, "grad_norm": 1.6962394714355469, "learning_rate": 9.999999888907432e-05, "loss": 3.3428, "step": 1347 }, { "epoch": 0.03620930482432578, "grad_norm": 1.7792705297470093, "learning_rate": 9.999999888741807e-05, "loss": 3.5057, "step": 1348 }, { "epoch": 0.036236166326420974, "grad_norm": 1.8109219074249268, "learning_rate": 9.99999988857606e-05, "loss": 3.4714, "step": 1349 }, { "epoch": 0.03626302782851617, "grad_norm": 2.049417018890381, "learning_rate": 9.999999888410187e-05, "loss": 3.4827, "step": 1350 }, { "epoch": 0.03628988933061137, "grad_norm": 1.3721641302108765, "learning_rate": 9.999999888244193e-05, "loss": 3.1125, "step": 1351 }, { "epoch": 0.036316750832706565, "grad_norm": 1.4423147439956665, "learning_rate": 9.999999888078076e-05, "loss": 2.8924, "step": 1352 }, { "epoch": 0.03634361233480176, "grad_norm": 1.4216868877410889, "learning_rate": 9.999999887911835e-05, "loss": 2.8453, "step": 1353 }, { "epoch": 0.03637047383689696, "grad_norm": 1.444206953048706, "learning_rate": 9.99999988774547e-05, "loss": 2.8776, "step": 1354 }, { "epoch": 0.03639733533899216, "grad_norm": 1.450270175933838, "learning_rate": 9.999999887578981e-05, "loss": 2.997, "step": 1355 }, { "epoch": 0.03642419684108735, "grad_norm": 1.467962384223938, "learning_rate": 9.99999988741237e-05, "loss": 3.2055, "step": 1356 }, { "epoch": 0.03645105834318255, "grad_norm": 1.3029931783676147, "learning_rate": 9.999999887245636e-05, "loss": 2.7665, "step": 1357 }, { "epoch": 0.03647791984527775, "grad_norm": 1.3137893676757812, "learning_rate": 9.999999887078778e-05, "loss": 2.974, "step": 1358 }, { "epoch": 0.03650478134737294, "grad_norm": 1.6335506439208984, "learning_rate": 9.999999886911797e-05, "loss": 3.4446, "step": 1359 }, { "epoch": 0.036531642849468145, "grad_norm": 1.386962652206421, "learning_rate": 9.999999886744691e-05, "loss": 3.1126, "step": 1360 }, { "epoch": 0.03655850435156334, "grad_norm": 1.363294005393982, "learning_rate": 9.999999886577464e-05, "loss": 2.7373, "step": 1361 }, { "epoch": 0.036585365853658534, "grad_norm": 1.4247697591781616, "learning_rate": 9.999999886410112e-05, "loss": 2.8468, "step": 1362 }, { "epoch": 0.036612227355753736, "grad_norm": 1.3875802755355835, "learning_rate": 9.999999886242637e-05, "loss": 2.8448, "step": 1363 }, { "epoch": 0.03663908885784893, "grad_norm": 1.4311139583587646, "learning_rate": 9.99999988607504e-05, "loss": 2.8377, "step": 1364 }, { "epoch": 0.036665950359944126, "grad_norm": 1.4337817430496216, "learning_rate": 9.999999885907316e-05, "loss": 2.791, "step": 1365 }, { "epoch": 0.03669281186203933, "grad_norm": 1.4047051668167114, "learning_rate": 9.999999885739473e-05, "loss": 3.131, "step": 1366 }, { "epoch": 0.03671967336413452, "grad_norm": 1.412001132965088, "learning_rate": 9.999999885571503e-05, "loss": 3.0659, "step": 1367 }, { "epoch": 0.03674653486622972, "grad_norm": 1.4448264837265015, "learning_rate": 9.999999885403411e-05, "loss": 2.7289, "step": 1368 }, { "epoch": 0.03677339636832492, "grad_norm": 1.4868664741516113, "learning_rate": 9.999999885235196e-05, "loss": 3.0849, "step": 1369 }, { "epoch": 0.036800257870420114, "grad_norm": 1.4030438661575317, "learning_rate": 9.999999885066857e-05, "loss": 3.0095, "step": 1370 }, { "epoch": 0.03682711937251531, "grad_norm": 1.3381463289260864, "learning_rate": 9.999999884898397e-05, "loss": 2.9221, "step": 1371 }, { "epoch": 0.03685398087461051, "grad_norm": 1.3347033262252808, "learning_rate": 9.999999884729812e-05, "loss": 3.023, "step": 1372 }, { "epoch": 0.036880842376705705, "grad_norm": 1.4255144596099854, "learning_rate": 9.999999884561102e-05, "loss": 3.0732, "step": 1373 }, { "epoch": 0.0369077038788009, "grad_norm": 1.49003267288208, "learning_rate": 9.999999884392271e-05, "loss": 2.9899, "step": 1374 }, { "epoch": 0.0369345653808961, "grad_norm": 1.465358853340149, "learning_rate": 9.999999884223314e-05, "loss": 3.0834, "step": 1375 }, { "epoch": 0.0369614268829913, "grad_norm": 1.5213181972503662, "learning_rate": 9.999999884054237e-05, "loss": 2.9631, "step": 1376 }, { "epoch": 0.03698828838508649, "grad_norm": 1.55282723903656, "learning_rate": 9.999999883885034e-05, "loss": 3.1508, "step": 1377 }, { "epoch": 0.03701514988718169, "grad_norm": 1.5790045261383057, "learning_rate": 9.99999988371571e-05, "loss": 3.2373, "step": 1378 }, { "epoch": 0.03704201138927689, "grad_norm": 1.5668582916259766, "learning_rate": 9.99999988354626e-05, "loss": 3.0675, "step": 1379 }, { "epoch": 0.03706887289137208, "grad_norm": 1.5427089929580688, "learning_rate": 9.999999883376688e-05, "loss": 3.102, "step": 1380 }, { "epoch": 0.037095734393467285, "grad_norm": 1.5573800802230835, "learning_rate": 9.999999883206991e-05, "loss": 3.1938, "step": 1381 }, { "epoch": 0.03712259589556248, "grad_norm": 1.4574451446533203, "learning_rate": 9.999999883037173e-05, "loss": 2.9663, "step": 1382 }, { "epoch": 0.037149457397657674, "grad_norm": 1.497205138206482, "learning_rate": 9.999999882867231e-05, "loss": 3.0097, "step": 1383 }, { "epoch": 0.037176318899752876, "grad_norm": 1.5007003545761108, "learning_rate": 9.999999882697164e-05, "loss": 3.1039, "step": 1384 }, { "epoch": 0.03720318040184807, "grad_norm": 1.5595407485961914, "learning_rate": 9.999999882526976e-05, "loss": 3.3815, "step": 1385 }, { "epoch": 0.037230041903943266, "grad_norm": 1.456282615661621, "learning_rate": 9.999999882356664e-05, "loss": 2.9351, "step": 1386 }, { "epoch": 0.03725690340603847, "grad_norm": 1.6644130945205688, "learning_rate": 9.999999882186227e-05, "loss": 3.3192, "step": 1387 }, { "epoch": 0.03728376490813366, "grad_norm": 1.5125242471694946, "learning_rate": 9.999999882015669e-05, "loss": 3.0214, "step": 1388 }, { "epoch": 0.03731062641022886, "grad_norm": 1.468849539756775, "learning_rate": 9.999999881844985e-05, "loss": 3.1451, "step": 1389 }, { "epoch": 0.03733748791232406, "grad_norm": 1.6174612045288086, "learning_rate": 9.99999988167418e-05, "loss": 3.2808, "step": 1390 }, { "epoch": 0.037364349414419254, "grad_norm": 1.575363039970398, "learning_rate": 9.99999988150325e-05, "loss": 3.225, "step": 1391 }, { "epoch": 0.03739121091651445, "grad_norm": 1.5213159322738647, "learning_rate": 9.999999881332198e-05, "loss": 3.1011, "step": 1392 }, { "epoch": 0.03741807241860965, "grad_norm": 1.6819603443145752, "learning_rate": 9.999999881161022e-05, "loss": 3.1736, "step": 1393 }, { "epoch": 0.037444933920704845, "grad_norm": 1.672792673110962, "learning_rate": 9.999999880989723e-05, "loss": 3.4121, "step": 1394 }, { "epoch": 0.03747179542280004, "grad_norm": 1.7510223388671875, "learning_rate": 9.9999998808183e-05, "loss": 2.8893, "step": 1395 }, { "epoch": 0.03749865692489524, "grad_norm": 1.7332818508148193, "learning_rate": 9.999999880646754e-05, "loss": 3.2137, "step": 1396 }, { "epoch": 0.03752551842699044, "grad_norm": 1.836311936378479, "learning_rate": 9.999999880475084e-05, "loss": 3.605, "step": 1397 }, { "epoch": 0.03755237992908563, "grad_norm": 1.770053505897522, "learning_rate": 9.999999880303292e-05, "loss": 2.998, "step": 1398 }, { "epoch": 0.03757924143118083, "grad_norm": 1.8517436981201172, "learning_rate": 9.999999880131374e-05, "loss": 3.3581, "step": 1399 }, { "epoch": 0.03760610293327603, "grad_norm": 1.7875633239746094, "learning_rate": 9.999999879959336e-05, "loss": 3.0633, "step": 1400 }, { "epoch": 0.03763296443537122, "grad_norm": 1.2558006048202515, "learning_rate": 9.999999879787172e-05, "loss": 3.1465, "step": 1401 }, { "epoch": 0.037659825937466425, "grad_norm": 1.3543556928634644, "learning_rate": 9.999999879614886e-05, "loss": 2.6867, "step": 1402 }, { "epoch": 0.03768668743956162, "grad_norm": 1.4992352724075317, "learning_rate": 9.999999879442476e-05, "loss": 3.1387, "step": 1403 }, { "epoch": 0.037713548941656815, "grad_norm": 1.4522989988327026, "learning_rate": 9.999999879269943e-05, "loss": 2.9011, "step": 1404 }, { "epoch": 0.037740410443752016, "grad_norm": 1.5051820278167725, "learning_rate": 9.999999879097286e-05, "loss": 3.0651, "step": 1405 }, { "epoch": 0.03776727194584721, "grad_norm": 1.3663911819458008, "learning_rate": 9.999999878924507e-05, "loss": 2.7669, "step": 1406 }, { "epoch": 0.037794133447942406, "grad_norm": 1.3655157089233398, "learning_rate": 9.999999878751603e-05, "loss": 3.2849, "step": 1407 }, { "epoch": 0.03782099495003761, "grad_norm": 1.4261425733566284, "learning_rate": 9.999999878578577e-05, "loss": 2.9059, "step": 1408 }, { "epoch": 0.0378478564521328, "grad_norm": 1.3434581756591797, "learning_rate": 9.999999878405427e-05, "loss": 2.6944, "step": 1409 }, { "epoch": 0.037874717954228, "grad_norm": 1.4687180519104004, "learning_rate": 9.999999878232154e-05, "loss": 3.0075, "step": 1410 }, { "epoch": 0.0379015794563232, "grad_norm": 1.4739652872085571, "learning_rate": 9.999999878058756e-05, "loss": 2.8742, "step": 1411 }, { "epoch": 0.037928440958418394, "grad_norm": 1.3373268842697144, "learning_rate": 9.999999877885236e-05, "loss": 2.8139, "step": 1412 }, { "epoch": 0.03795530246051359, "grad_norm": 1.6320712566375732, "learning_rate": 9.999999877711593e-05, "loss": 2.9475, "step": 1413 }, { "epoch": 0.03798216396260879, "grad_norm": 1.3583111763000488, "learning_rate": 9.999999877537827e-05, "loss": 2.838, "step": 1414 }, { "epoch": 0.038009025464703985, "grad_norm": 1.3993921279907227, "learning_rate": 9.999999877363937e-05, "loss": 2.8855, "step": 1415 }, { "epoch": 0.03803588696679918, "grad_norm": 1.529668927192688, "learning_rate": 9.999999877189923e-05, "loss": 3.0322, "step": 1416 }, { "epoch": 0.03806274846889438, "grad_norm": 1.4245907068252563, "learning_rate": 9.999999877015786e-05, "loss": 2.981, "step": 1417 }, { "epoch": 0.03808960997098958, "grad_norm": 1.3823530673980713, "learning_rate": 9.999999876841527e-05, "loss": 3.0361, "step": 1418 }, { "epoch": 0.03811647147308477, "grad_norm": 1.3851081132888794, "learning_rate": 9.999999876667142e-05, "loss": 2.6808, "step": 1419 }, { "epoch": 0.038143332975179974, "grad_norm": 1.4888771772384644, "learning_rate": 9.999999876492636e-05, "loss": 3.1491, "step": 1420 }, { "epoch": 0.03817019447727517, "grad_norm": 1.4121801853179932, "learning_rate": 9.999999876318005e-05, "loss": 2.6652, "step": 1421 }, { "epoch": 0.03819705597937036, "grad_norm": 1.4982889890670776, "learning_rate": 9.999999876143252e-05, "loss": 2.9528, "step": 1422 }, { "epoch": 0.038223917481465565, "grad_norm": 1.4127792119979858, "learning_rate": 9.999999875968374e-05, "loss": 2.9695, "step": 1423 }, { "epoch": 0.03825077898356076, "grad_norm": 1.475388765335083, "learning_rate": 9.999999875793373e-05, "loss": 2.7763, "step": 1424 }, { "epoch": 0.038277640485655955, "grad_norm": 1.503794550895691, "learning_rate": 9.999999875618248e-05, "loss": 3.0564, "step": 1425 }, { "epoch": 0.038304501987751156, "grad_norm": 1.58021080493927, "learning_rate": 9.999999875443002e-05, "loss": 2.9922, "step": 1426 }, { "epoch": 0.03833136348984635, "grad_norm": 1.6000361442565918, "learning_rate": 9.999999875267632e-05, "loss": 3.1504, "step": 1427 }, { "epoch": 0.038358224991941546, "grad_norm": 1.5512723922729492, "learning_rate": 9.999999875092138e-05, "loss": 3.1249, "step": 1428 }, { "epoch": 0.03838508649403675, "grad_norm": 1.6090893745422363, "learning_rate": 9.99999987491652e-05, "loss": 3.1778, "step": 1429 }, { "epoch": 0.03841194799613194, "grad_norm": 1.527324914932251, "learning_rate": 9.999999874740779e-05, "loss": 3.1531, "step": 1430 }, { "epoch": 0.03843880949822714, "grad_norm": 1.4883356094360352, "learning_rate": 9.999999874564916e-05, "loss": 2.9801, "step": 1431 }, { "epoch": 0.03846567100032234, "grad_norm": 1.5239859819412231, "learning_rate": 9.999999874388928e-05, "loss": 3.1369, "step": 1432 }, { "epoch": 0.038492532502417534, "grad_norm": 1.6074607372283936, "learning_rate": 9.999999874212817e-05, "loss": 3.0369, "step": 1433 }, { "epoch": 0.03851939400451273, "grad_norm": 1.3886668682098389, "learning_rate": 9.999999874036583e-05, "loss": 3.0488, "step": 1434 }, { "epoch": 0.03854625550660793, "grad_norm": 1.579745888710022, "learning_rate": 9.999999873860225e-05, "loss": 3.2854, "step": 1435 }, { "epoch": 0.038573117008703126, "grad_norm": 1.5234684944152832, "learning_rate": 9.999999873683744e-05, "loss": 3.0604, "step": 1436 }, { "epoch": 0.03859997851079832, "grad_norm": 1.6631017923355103, "learning_rate": 9.99999987350714e-05, "loss": 3.286, "step": 1437 }, { "epoch": 0.03862684001289352, "grad_norm": 1.71566641330719, "learning_rate": 9.999999873330412e-05, "loss": 3.2429, "step": 1438 }, { "epoch": 0.03865370151498872, "grad_norm": 1.5708800554275513, "learning_rate": 9.99999987315356e-05, "loss": 3.2407, "step": 1439 }, { "epoch": 0.03868056301708392, "grad_norm": 1.631901741027832, "learning_rate": 9.999999872976587e-05, "loss": 3.1672, "step": 1440 }, { "epoch": 0.038707424519179114, "grad_norm": 1.5446592569351196, "learning_rate": 9.999999872799488e-05, "loss": 3.2726, "step": 1441 }, { "epoch": 0.03873428602127431, "grad_norm": 1.555907130241394, "learning_rate": 9.999999872622267e-05, "loss": 3.2899, "step": 1442 }, { "epoch": 0.03876114752336951, "grad_norm": 1.5833879709243774, "learning_rate": 9.999999872444923e-05, "loss": 3.207, "step": 1443 }, { "epoch": 0.038788009025464705, "grad_norm": 1.6748602390289307, "learning_rate": 9.999999872267456e-05, "loss": 3.1702, "step": 1444 }, { "epoch": 0.0388148705275599, "grad_norm": 1.6045279502868652, "learning_rate": 9.999999872089864e-05, "loss": 2.9434, "step": 1445 }, { "epoch": 0.0388417320296551, "grad_norm": 1.7981611490249634, "learning_rate": 9.999999871912148e-05, "loss": 3.1452, "step": 1446 }, { "epoch": 0.038868593531750296, "grad_norm": 1.9610273838043213, "learning_rate": 9.999999871734311e-05, "loss": 3.3071, "step": 1447 }, { "epoch": 0.03889545503384549, "grad_norm": 1.8365384340286255, "learning_rate": 9.999999871556349e-05, "loss": 3.3733, "step": 1448 }, { "epoch": 0.03892231653594069, "grad_norm": 1.834254503250122, "learning_rate": 9.999999871378266e-05, "loss": 3.5926, "step": 1449 }, { "epoch": 0.03894917803803589, "grad_norm": 1.7254598140716553, "learning_rate": 9.999999871200057e-05, "loss": 3.3867, "step": 1450 }, { "epoch": 0.03897603954013108, "grad_norm": 1.444594144821167, "learning_rate": 9.999999871021726e-05, "loss": 3.006, "step": 1451 }, { "epoch": 0.039002901042226285, "grad_norm": 1.5244362354278564, "learning_rate": 9.99999987084327e-05, "loss": 3.0045, "step": 1452 }, { "epoch": 0.03902976254432148, "grad_norm": 1.3606665134429932, "learning_rate": 9.999999870664693e-05, "loss": 2.9012, "step": 1453 }, { "epoch": 0.039056624046416674, "grad_norm": 1.3562020063400269, "learning_rate": 9.99999987048599e-05, "loss": 2.7546, "step": 1454 }, { "epoch": 0.039083485548511876, "grad_norm": 1.3499842882156372, "learning_rate": 9.999999870307166e-05, "loss": 2.7078, "step": 1455 }, { "epoch": 0.03911034705060707, "grad_norm": 1.4267544746398926, "learning_rate": 9.999999870128218e-05, "loss": 2.9684, "step": 1456 }, { "epoch": 0.039137208552702266, "grad_norm": 1.4771356582641602, "learning_rate": 9.999999869949146e-05, "loss": 3.0608, "step": 1457 }, { "epoch": 0.03916407005479747, "grad_norm": 1.292306661605835, "learning_rate": 9.999999869769952e-05, "loss": 2.6641, "step": 1458 }, { "epoch": 0.03919093155689266, "grad_norm": 1.3434932231903076, "learning_rate": 9.999999869590633e-05, "loss": 2.9786, "step": 1459 }, { "epoch": 0.03921779305898786, "grad_norm": 1.3802857398986816, "learning_rate": 9.99999986941119e-05, "loss": 3.0925, "step": 1460 }, { "epoch": 0.03924465456108306, "grad_norm": 1.3967256546020508, "learning_rate": 9.999999869231625e-05, "loss": 2.964, "step": 1461 }, { "epoch": 0.039271516063178254, "grad_norm": 1.4296356439590454, "learning_rate": 9.999999869051936e-05, "loss": 2.7799, "step": 1462 }, { "epoch": 0.03929837756527345, "grad_norm": 1.3817452192306519, "learning_rate": 9.999999868872126e-05, "loss": 2.8511, "step": 1463 }, { "epoch": 0.03932523906736865, "grad_norm": 1.372567892074585, "learning_rate": 9.999999868692189e-05, "loss": 3.0817, "step": 1464 }, { "epoch": 0.039352100569463845, "grad_norm": 1.3290953636169434, "learning_rate": 9.999999868512131e-05, "loss": 2.8261, "step": 1465 }, { "epoch": 0.03937896207155904, "grad_norm": 1.4655951261520386, "learning_rate": 9.999999868331949e-05, "loss": 2.7318, "step": 1466 }, { "epoch": 0.03940582357365424, "grad_norm": 1.3358664512634277, "learning_rate": 9.999999868151643e-05, "loss": 2.5184, "step": 1467 }, { "epoch": 0.03943268507574944, "grad_norm": 1.4155128002166748, "learning_rate": 9.999999867971215e-05, "loss": 2.8334, "step": 1468 }, { "epoch": 0.03945954657784463, "grad_norm": 1.5112532377243042, "learning_rate": 9.999999867790661e-05, "loss": 3.0017, "step": 1469 }, { "epoch": 0.03948640807993983, "grad_norm": 1.4111354351043701, "learning_rate": 9.999999867609985e-05, "loss": 2.99, "step": 1470 }, { "epoch": 0.03951326958203503, "grad_norm": 1.3667641878128052, "learning_rate": 9.999999867429188e-05, "loss": 2.8692, "step": 1471 }, { "epoch": 0.03954013108413022, "grad_norm": 1.4051331281661987, "learning_rate": 9.999999867248266e-05, "loss": 2.9006, "step": 1472 }, { "epoch": 0.039566992586225425, "grad_norm": 1.4388536214828491, "learning_rate": 9.99999986706722e-05, "loss": 2.7888, "step": 1473 }, { "epoch": 0.03959385408832062, "grad_norm": 1.400656819343567, "learning_rate": 9.99999986688605e-05, "loss": 2.7677, "step": 1474 }, { "epoch": 0.039620715590415814, "grad_norm": 1.5121233463287354, "learning_rate": 9.999999866704759e-05, "loss": 3.0488, "step": 1475 }, { "epoch": 0.039647577092511016, "grad_norm": 1.4835797548294067, "learning_rate": 9.999999866523342e-05, "loss": 2.8857, "step": 1476 }, { "epoch": 0.03967443859460621, "grad_norm": 1.390968680381775, "learning_rate": 9.999999866341804e-05, "loss": 3.0377, "step": 1477 }, { "epoch": 0.039701300096701406, "grad_norm": 1.4880067110061646, "learning_rate": 9.999999866160142e-05, "loss": 3.2275, "step": 1478 }, { "epoch": 0.03972816159879661, "grad_norm": 1.5887243747711182, "learning_rate": 9.999999865978356e-05, "loss": 3.3484, "step": 1479 }, { "epoch": 0.0397550231008918, "grad_norm": 1.509476900100708, "learning_rate": 9.999999865796446e-05, "loss": 3.1582, "step": 1480 }, { "epoch": 0.039781884602987, "grad_norm": 1.5989946126937866, "learning_rate": 9.999999865614414e-05, "loss": 3.1388, "step": 1481 }, { "epoch": 0.0398087461050822, "grad_norm": 1.590970516204834, "learning_rate": 9.999999865432257e-05, "loss": 3.1798, "step": 1482 }, { "epoch": 0.039835607607177394, "grad_norm": 1.383935809135437, "learning_rate": 9.999999865249978e-05, "loss": 2.8013, "step": 1483 }, { "epoch": 0.03986246910927259, "grad_norm": 1.5269430875778198, "learning_rate": 9.999999865067575e-05, "loss": 2.7182, "step": 1484 }, { "epoch": 0.03988933061136779, "grad_norm": 1.5870354175567627, "learning_rate": 9.999999864885049e-05, "loss": 3.0641, "step": 1485 }, { "epoch": 0.039916192113462985, "grad_norm": 1.4706482887268066, "learning_rate": 9.999999864702399e-05, "loss": 2.7107, "step": 1486 }, { "epoch": 0.03994305361555818, "grad_norm": 1.4456549882888794, "learning_rate": 9.999999864519627e-05, "loss": 2.8107, "step": 1487 }, { "epoch": 0.03996991511765338, "grad_norm": 1.6855895519256592, "learning_rate": 9.99999986433673e-05, "loss": 3.1461, "step": 1488 }, { "epoch": 0.03999677661974858, "grad_norm": 1.5434343814849854, "learning_rate": 9.99999986415371e-05, "loss": 3.4355, "step": 1489 }, { "epoch": 0.04002363812184377, "grad_norm": 1.6378633975982666, "learning_rate": 9.999999863970568e-05, "loss": 3.1938, "step": 1490 }, { "epoch": 0.04005049962393897, "grad_norm": 1.5038949251174927, "learning_rate": 9.999999863787301e-05, "loss": 3.134, "step": 1491 }, { "epoch": 0.04007736112603417, "grad_norm": 1.6363450288772583, "learning_rate": 9.999999863603913e-05, "loss": 3.3421, "step": 1492 }, { "epoch": 0.04010422262812936, "grad_norm": 1.6093322038650513, "learning_rate": 9.999999863420399e-05, "loss": 3.0592, "step": 1493 }, { "epoch": 0.040131084130224565, "grad_norm": 1.5667686462402344, "learning_rate": 9.999999863236762e-05, "loss": 3.1278, "step": 1494 }, { "epoch": 0.04015794563231976, "grad_norm": 1.4725762605667114, "learning_rate": 9.999999863053002e-05, "loss": 3.1241, "step": 1495 }, { "epoch": 0.040184807134414954, "grad_norm": 1.6674635410308838, "learning_rate": 9.99999986286912e-05, "loss": 3.1671, "step": 1496 }, { "epoch": 0.040211668636510156, "grad_norm": 1.7384889125823975, "learning_rate": 9.999999862685114e-05, "loss": 3.2027, "step": 1497 }, { "epoch": 0.04023853013860535, "grad_norm": 1.6738016605377197, "learning_rate": 9.999999862500984e-05, "loss": 3.3651, "step": 1498 }, { "epoch": 0.040265391640700546, "grad_norm": 1.8409940004348755, "learning_rate": 9.99999986231673e-05, "loss": 3.4753, "step": 1499 }, { "epoch": 0.04029225314279575, "grad_norm": 2.1469180583953857, "learning_rate": 9.999999862132355e-05, "loss": 3.5186, "step": 1500 }, { "epoch": 0.04031911464489094, "grad_norm": 1.4282715320587158, "learning_rate": 9.999999861947853e-05, "loss": 3.2386, "step": 1501 }, { "epoch": 0.04034597614698614, "grad_norm": 1.378355860710144, "learning_rate": 9.99999986176323e-05, "loss": 2.8773, "step": 1502 }, { "epoch": 0.04037283764908134, "grad_norm": 1.383313536643982, "learning_rate": 9.999999861578483e-05, "loss": 2.8719, "step": 1503 }, { "epoch": 0.040399699151176534, "grad_norm": 1.3885672092437744, "learning_rate": 9.999999861393615e-05, "loss": 3.0667, "step": 1504 }, { "epoch": 0.04042656065327173, "grad_norm": 1.460296869277954, "learning_rate": 9.99999986120862e-05, "loss": 3.2173, "step": 1505 }, { "epoch": 0.04045342215536693, "grad_norm": 1.3763436079025269, "learning_rate": 9.999999861023503e-05, "loss": 2.9576, "step": 1506 }, { "epoch": 0.040480283657462125, "grad_norm": 1.3316450119018555, "learning_rate": 9.999999860838263e-05, "loss": 2.8878, "step": 1507 }, { "epoch": 0.04050714515955732, "grad_norm": 1.3709266185760498, "learning_rate": 9.9999998606529e-05, "loss": 2.7754, "step": 1508 }, { "epoch": 0.04053400666165252, "grad_norm": 1.2321733236312866, "learning_rate": 9.999999860467413e-05, "loss": 2.7415, "step": 1509 }, { "epoch": 0.04056086816374772, "grad_norm": 1.3904420137405396, "learning_rate": 9.9999998602818e-05, "loss": 2.9703, "step": 1510 }, { "epoch": 0.04058772966584291, "grad_norm": 1.352800965309143, "learning_rate": 9.999999860096067e-05, "loss": 2.6487, "step": 1511 }, { "epoch": 0.04061459116793811, "grad_norm": 1.2946110963821411, "learning_rate": 9.999999859910212e-05, "loss": 2.8044, "step": 1512 }, { "epoch": 0.04064145267003331, "grad_norm": 1.3256781101226807, "learning_rate": 9.999999859724231e-05, "loss": 2.7241, "step": 1513 }, { "epoch": 0.0406683141721285, "grad_norm": 1.4185431003570557, "learning_rate": 9.999999859538127e-05, "loss": 3.0663, "step": 1514 }, { "epoch": 0.040695175674223705, "grad_norm": 1.3325541019439697, "learning_rate": 9.9999998593519e-05, "loss": 2.8112, "step": 1515 }, { "epoch": 0.0407220371763189, "grad_norm": 1.5682042837142944, "learning_rate": 9.99999985916555e-05, "loss": 2.8277, "step": 1516 }, { "epoch": 0.040748898678414094, "grad_norm": 1.333392858505249, "learning_rate": 9.999999858979075e-05, "loss": 2.6806, "step": 1517 }, { "epoch": 0.040775760180509296, "grad_norm": 1.3029935359954834, "learning_rate": 9.999999858792477e-05, "loss": 2.556, "step": 1518 }, { "epoch": 0.04080262168260449, "grad_norm": 1.3836957216262817, "learning_rate": 9.999999858605757e-05, "loss": 3.1112, "step": 1519 }, { "epoch": 0.040829483184699686, "grad_norm": 1.5574411153793335, "learning_rate": 9.999999858418914e-05, "loss": 2.9932, "step": 1520 }, { "epoch": 0.04085634468679489, "grad_norm": 1.3623356819152832, "learning_rate": 9.999999858231946e-05, "loss": 2.7071, "step": 1521 }, { "epoch": 0.04088320618889008, "grad_norm": 1.3509150743484497, "learning_rate": 9.999999858044856e-05, "loss": 2.7619, "step": 1522 }, { "epoch": 0.04091006769098528, "grad_norm": 1.4030135869979858, "learning_rate": 9.99999985785764e-05, "loss": 2.9066, "step": 1523 }, { "epoch": 0.04093692919308048, "grad_norm": 1.4573057889938354, "learning_rate": 9.999999857670303e-05, "loss": 3.0447, "step": 1524 }, { "epoch": 0.040963790695175674, "grad_norm": 1.4426836967468262, "learning_rate": 9.999999857482842e-05, "loss": 2.8589, "step": 1525 }, { "epoch": 0.04099065219727087, "grad_norm": 1.4468977451324463, "learning_rate": 9.999999857295259e-05, "loss": 2.8131, "step": 1526 }, { "epoch": 0.04101751369936607, "grad_norm": 1.6241178512573242, "learning_rate": 9.999999857107551e-05, "loss": 3.0325, "step": 1527 }, { "epoch": 0.041044375201461265, "grad_norm": 1.6806358098983765, "learning_rate": 9.99999985691972e-05, "loss": 3.2774, "step": 1528 }, { "epoch": 0.04107123670355646, "grad_norm": 1.4917420148849487, "learning_rate": 9.999999856731765e-05, "loss": 2.9552, "step": 1529 }, { "epoch": 0.04109809820565166, "grad_norm": 1.6034181118011475, "learning_rate": 9.999999856543687e-05, "loss": 2.9498, "step": 1530 }, { "epoch": 0.04112495970774686, "grad_norm": 1.4520667791366577, "learning_rate": 9.999999856355486e-05, "loss": 2.9688, "step": 1531 }, { "epoch": 0.04115182120984205, "grad_norm": 1.5000613927841187, "learning_rate": 9.999999856167161e-05, "loss": 3.3126, "step": 1532 }, { "epoch": 0.04117868271193725, "grad_norm": 1.5931119918823242, "learning_rate": 9.999999855978715e-05, "loss": 3.2781, "step": 1533 }, { "epoch": 0.04120554421403245, "grad_norm": 1.6332576274871826, "learning_rate": 9.999999855790143e-05, "loss": 3.2623, "step": 1534 }, { "epoch": 0.04123240571612764, "grad_norm": 1.444083333015442, "learning_rate": 9.999999855601448e-05, "loss": 2.9727, "step": 1535 }, { "epoch": 0.041259267218222845, "grad_norm": 1.5641528367996216, "learning_rate": 9.99999985541263e-05, "loss": 3.105, "step": 1536 }, { "epoch": 0.04128612872031804, "grad_norm": 1.5843452215194702, "learning_rate": 9.99999985522369e-05, "loss": 2.935, "step": 1537 }, { "epoch": 0.041312990222413234, "grad_norm": 1.5150201320648193, "learning_rate": 9.999999855034624e-05, "loss": 2.8343, "step": 1538 }, { "epoch": 0.041339851724508436, "grad_norm": 1.5015759468078613, "learning_rate": 9.999999854845436e-05, "loss": 3.0577, "step": 1539 }, { "epoch": 0.04136671322660363, "grad_norm": 1.5382952690124512, "learning_rate": 9.999999854656125e-05, "loss": 3.1988, "step": 1540 }, { "epoch": 0.041393574728698826, "grad_norm": 1.6064475774765015, "learning_rate": 9.999999854466689e-05, "loss": 3.3073, "step": 1541 }, { "epoch": 0.04142043623079403, "grad_norm": 1.4302036762237549, "learning_rate": 9.999999854277131e-05, "loss": 2.9684, "step": 1542 }, { "epoch": 0.04144729773288922, "grad_norm": 1.4662116765975952, "learning_rate": 9.99999985408745e-05, "loss": 3.0528, "step": 1543 }, { "epoch": 0.04147415923498442, "grad_norm": 1.5845084190368652, "learning_rate": 9.999999853897646e-05, "loss": 3.0684, "step": 1544 }, { "epoch": 0.04150102073707962, "grad_norm": 1.603434681892395, "learning_rate": 9.999999853707716e-05, "loss": 2.9611, "step": 1545 }, { "epoch": 0.041527882239174814, "grad_norm": 1.66338312625885, "learning_rate": 9.999999853517665e-05, "loss": 3.2688, "step": 1546 }, { "epoch": 0.04155474374127001, "grad_norm": 1.900890827178955, "learning_rate": 9.999999853327491e-05, "loss": 3.4207, "step": 1547 }, { "epoch": 0.04158160524336521, "grad_norm": 1.735031247138977, "learning_rate": 9.999999853137192e-05, "loss": 3.5208, "step": 1548 }, { "epoch": 0.041608466745460405, "grad_norm": 1.8877718448638916, "learning_rate": 9.99999985294677e-05, "loss": 3.4457, "step": 1549 }, { "epoch": 0.0416353282475556, "grad_norm": 1.8484257459640503, "learning_rate": 9.999999852756226e-05, "loss": 3.4321, "step": 1550 }, { "epoch": 0.0416621897496508, "grad_norm": 1.1651768684387207, "learning_rate": 9.999999852565555e-05, "loss": 2.7929, "step": 1551 }, { "epoch": 0.041689051251746, "grad_norm": 1.3331292867660522, "learning_rate": 9.999999852374764e-05, "loss": 2.808, "step": 1552 }, { "epoch": 0.04171591275384119, "grad_norm": 1.459187388420105, "learning_rate": 9.999999852183849e-05, "loss": 3.018, "step": 1553 }, { "epoch": 0.04174277425593639, "grad_norm": 1.4368375539779663, "learning_rate": 9.999999851992811e-05, "loss": 3.0412, "step": 1554 }, { "epoch": 0.04176963575803159, "grad_norm": 1.465675711631775, "learning_rate": 9.99999985180165e-05, "loss": 2.9956, "step": 1555 }, { "epoch": 0.04179649726012678, "grad_norm": 1.3604998588562012, "learning_rate": 9.999999851610364e-05, "loss": 2.7662, "step": 1556 }, { "epoch": 0.041823358762221985, "grad_norm": 1.3840458393096924, "learning_rate": 9.999999851418955e-05, "loss": 2.6917, "step": 1557 }, { "epoch": 0.04185022026431718, "grad_norm": 1.2864190340042114, "learning_rate": 9.999999851227422e-05, "loss": 2.8664, "step": 1558 }, { "epoch": 0.041877081766412375, "grad_norm": 1.3945748805999756, "learning_rate": 9.999999851035767e-05, "loss": 2.7021, "step": 1559 }, { "epoch": 0.041903943268507576, "grad_norm": 1.3237626552581787, "learning_rate": 9.999999850843989e-05, "loss": 2.5358, "step": 1560 }, { "epoch": 0.04193080477060277, "grad_norm": 1.3199323415756226, "learning_rate": 9.999999850652086e-05, "loss": 2.83, "step": 1561 }, { "epoch": 0.041957666272697966, "grad_norm": 1.2470163106918335, "learning_rate": 9.99999985046006e-05, "loss": 2.7546, "step": 1562 }, { "epoch": 0.04198452777479317, "grad_norm": 1.3408187627792358, "learning_rate": 9.999999850267912e-05, "loss": 2.7943, "step": 1563 }, { "epoch": 0.04201138927688836, "grad_norm": 1.4136046171188354, "learning_rate": 9.999999850075639e-05, "loss": 3.1962, "step": 1564 }, { "epoch": 0.04203825077898356, "grad_norm": 1.4965014457702637, "learning_rate": 9.999999849883243e-05, "loss": 2.9666, "step": 1565 }, { "epoch": 0.04206511228107876, "grad_norm": 1.4411835670471191, "learning_rate": 9.999999849690724e-05, "loss": 2.8392, "step": 1566 }, { "epoch": 0.042091973783173954, "grad_norm": 1.4846420288085938, "learning_rate": 9.999999849498082e-05, "loss": 2.7586, "step": 1567 }, { "epoch": 0.04211883528526915, "grad_norm": 1.3521579504013062, "learning_rate": 9.999999849305316e-05, "loss": 3.1934, "step": 1568 }, { "epoch": 0.04214569678736435, "grad_norm": 1.4549918174743652, "learning_rate": 9.999999849112427e-05, "loss": 2.9655, "step": 1569 }, { "epoch": 0.042172558289459545, "grad_norm": 1.4649477005004883, "learning_rate": 9.999999848919415e-05, "loss": 2.8873, "step": 1570 }, { "epoch": 0.04219941979155474, "grad_norm": 1.4793885946273804, "learning_rate": 9.999999848726278e-05, "loss": 3.0737, "step": 1571 }, { "epoch": 0.04222628129364994, "grad_norm": 1.5236968994140625, "learning_rate": 9.99999984853302e-05, "loss": 3.1176, "step": 1572 }, { "epoch": 0.04225314279574514, "grad_norm": 1.5447379350662231, "learning_rate": 9.999999848339636e-05, "loss": 3.3341, "step": 1573 }, { "epoch": 0.04228000429784034, "grad_norm": 1.3562862873077393, "learning_rate": 9.99999984814613e-05, "loss": 2.6696, "step": 1574 }, { "epoch": 0.042306865799935534, "grad_norm": 1.4716746807098389, "learning_rate": 9.999999847952502e-05, "loss": 2.9011, "step": 1575 }, { "epoch": 0.04233372730203073, "grad_norm": 1.501473307609558, "learning_rate": 9.999999847758748e-05, "loss": 3.0257, "step": 1576 }, { "epoch": 0.04236058880412593, "grad_norm": 1.5300209522247314, "learning_rate": 9.999999847564872e-05, "loss": 3.2147, "step": 1577 }, { "epoch": 0.042387450306221125, "grad_norm": 1.4848058223724365, "learning_rate": 9.999999847370872e-05, "loss": 3.0189, "step": 1578 }, { "epoch": 0.04241431180831632, "grad_norm": 1.4604220390319824, "learning_rate": 9.99999984717675e-05, "loss": 3.0825, "step": 1579 }, { "epoch": 0.04244117331041152, "grad_norm": 1.4743536710739136, "learning_rate": 9.999999846982504e-05, "loss": 2.968, "step": 1580 }, { "epoch": 0.042468034812506716, "grad_norm": 1.4050565958023071, "learning_rate": 9.999999846788133e-05, "loss": 3.1072, "step": 1581 }, { "epoch": 0.04249489631460191, "grad_norm": 1.6336671113967896, "learning_rate": 9.999999846593641e-05, "loss": 3.4397, "step": 1582 }, { "epoch": 0.04252175781669711, "grad_norm": 1.4869568347930908, "learning_rate": 9.999999846399024e-05, "loss": 3.0334, "step": 1583 }, { "epoch": 0.04254861931879231, "grad_norm": 1.423067331314087, "learning_rate": 9.999999846204284e-05, "loss": 2.8322, "step": 1584 }, { "epoch": 0.0425754808208875, "grad_norm": 1.4375420808792114, "learning_rate": 9.999999846009423e-05, "loss": 3.0971, "step": 1585 }, { "epoch": 0.042602342322982704, "grad_norm": 1.5238673686981201, "learning_rate": 9.999999845814435e-05, "loss": 3.0143, "step": 1586 }, { "epoch": 0.0426292038250779, "grad_norm": 1.4856241941452026, "learning_rate": 9.999999845619326e-05, "loss": 2.8321, "step": 1587 }, { "epoch": 0.042656065327173094, "grad_norm": 1.6609350442886353, "learning_rate": 9.999999845424093e-05, "loss": 3.2155, "step": 1588 }, { "epoch": 0.042682926829268296, "grad_norm": 1.5115586519241333, "learning_rate": 9.999999845228736e-05, "loss": 3.2471, "step": 1589 }, { "epoch": 0.04270978833136349, "grad_norm": 1.7532490491867065, "learning_rate": 9.999999845033256e-05, "loss": 3.1795, "step": 1590 }, { "epoch": 0.042736649833458686, "grad_norm": 1.5894776582717896, "learning_rate": 9.999999844837652e-05, "loss": 3.123, "step": 1591 }, { "epoch": 0.04276351133555389, "grad_norm": 1.6659984588623047, "learning_rate": 9.999999844641925e-05, "loss": 3.3585, "step": 1592 }, { "epoch": 0.04279037283764908, "grad_norm": 2.417973756790161, "learning_rate": 9.999999844446075e-05, "loss": 3.098, "step": 1593 }, { "epoch": 0.04281723433974428, "grad_norm": 1.5774519443511963, "learning_rate": 9.999999844250102e-05, "loss": 3.1257, "step": 1594 }, { "epoch": 0.04284409584183948, "grad_norm": 1.5874313116073608, "learning_rate": 9.999999844054006e-05, "loss": 3.0155, "step": 1595 }, { "epoch": 0.042870957343934674, "grad_norm": 1.5999934673309326, "learning_rate": 9.999999843857786e-05, "loss": 3.1506, "step": 1596 }, { "epoch": 0.04289781884602987, "grad_norm": 1.7312874794006348, "learning_rate": 9.999999843661442e-05, "loss": 3.351, "step": 1597 }, { "epoch": 0.04292468034812507, "grad_norm": 1.7281516790390015, "learning_rate": 9.999999843464974e-05, "loss": 3.5093, "step": 1598 }, { "epoch": 0.042951541850220265, "grad_norm": 1.9014663696289062, "learning_rate": 9.999999843268384e-05, "loss": 3.5891, "step": 1599 }, { "epoch": 0.04297840335231546, "grad_norm": 2.007746458053589, "learning_rate": 9.99999984307167e-05, "loss": 3.5878, "step": 1600 }, { "epoch": 0.04300526485441066, "grad_norm": 1.3168871402740479, "learning_rate": 9.999999842874834e-05, "loss": 2.8627, "step": 1601 }, { "epoch": 0.043032126356505856, "grad_norm": 1.3844327926635742, "learning_rate": 9.999999842677874e-05, "loss": 2.8701, "step": 1602 }, { "epoch": 0.04305898785860105, "grad_norm": 1.3635506629943848, "learning_rate": 9.999999842480789e-05, "loss": 2.6733, "step": 1603 }, { "epoch": 0.04308584936069625, "grad_norm": 1.3588570356369019, "learning_rate": 9.999999842283582e-05, "loss": 2.8806, "step": 1604 }, { "epoch": 0.04311271086279145, "grad_norm": 1.3737713098526, "learning_rate": 9.999999842086252e-05, "loss": 2.9025, "step": 1605 }, { "epoch": 0.04313957236488664, "grad_norm": 1.40369713306427, "learning_rate": 9.999999841888799e-05, "loss": 2.8484, "step": 1606 }, { "epoch": 0.043166433866981845, "grad_norm": 1.2884548902511597, "learning_rate": 9.99999984169122e-05, "loss": 2.7603, "step": 1607 }, { "epoch": 0.04319329536907704, "grad_norm": 1.351904273033142, "learning_rate": 9.99999984149352e-05, "loss": 3.0526, "step": 1608 }, { "epoch": 0.043220156871172234, "grad_norm": 1.4556187391281128, "learning_rate": 9.999999841295697e-05, "loss": 3.0934, "step": 1609 }, { "epoch": 0.043247018373267436, "grad_norm": 1.3308874368667603, "learning_rate": 9.999999841097748e-05, "loss": 2.7632, "step": 1610 }, { "epoch": 0.04327387987536263, "grad_norm": 1.3838894367218018, "learning_rate": 9.999999840899678e-05, "loss": 3.0159, "step": 1611 }, { "epoch": 0.043300741377457826, "grad_norm": 1.504431962966919, "learning_rate": 9.999999840701483e-05, "loss": 3.0511, "step": 1612 }, { "epoch": 0.04332760287955303, "grad_norm": 1.2878671884536743, "learning_rate": 9.999999840503167e-05, "loss": 2.8596, "step": 1613 }, { "epoch": 0.04335446438164822, "grad_norm": 1.3833988904953003, "learning_rate": 9.999999840304725e-05, "loss": 2.7807, "step": 1614 }, { "epoch": 0.04338132588374342, "grad_norm": 1.3519421815872192, "learning_rate": 9.999999840106163e-05, "loss": 2.7777, "step": 1615 }, { "epoch": 0.04340818738583862, "grad_norm": 1.4081404209136963, "learning_rate": 9.999999839907475e-05, "loss": 2.7198, "step": 1616 }, { "epoch": 0.043435048887933814, "grad_norm": 1.3711954355239868, "learning_rate": 9.999999839708662e-05, "loss": 2.717, "step": 1617 }, { "epoch": 0.04346191039002901, "grad_norm": 1.3935779333114624, "learning_rate": 9.999999839509728e-05, "loss": 3.0255, "step": 1618 }, { "epoch": 0.04348877189212421, "grad_norm": 1.4566105604171753, "learning_rate": 9.99999983931067e-05, "loss": 3.1218, "step": 1619 }, { "epoch": 0.043515633394219405, "grad_norm": 1.4367201328277588, "learning_rate": 9.99999983911149e-05, "loss": 3.0931, "step": 1620 }, { "epoch": 0.0435424948963146, "grad_norm": 1.390453815460205, "learning_rate": 9.999999838912187e-05, "loss": 2.903, "step": 1621 }, { "epoch": 0.0435693563984098, "grad_norm": 1.2955039739608765, "learning_rate": 9.999999838712759e-05, "loss": 2.9835, "step": 1622 }, { "epoch": 0.043596217900505, "grad_norm": 1.5156861543655396, "learning_rate": 9.999999838513207e-05, "loss": 3.1388, "step": 1623 }, { "epoch": 0.04362307940260019, "grad_norm": 1.4825620651245117, "learning_rate": 9.999999838313532e-05, "loss": 3.1583, "step": 1624 }, { "epoch": 0.04364994090469539, "grad_norm": 1.369698166847229, "learning_rate": 9.999999838113734e-05, "loss": 2.9362, "step": 1625 }, { "epoch": 0.04367680240679059, "grad_norm": 1.3391295671463013, "learning_rate": 9.999999837913813e-05, "loss": 2.8266, "step": 1626 }, { "epoch": 0.04370366390888578, "grad_norm": 1.4127203226089478, "learning_rate": 9.999999837713768e-05, "loss": 2.827, "step": 1627 }, { "epoch": 0.043730525410980985, "grad_norm": 1.542043685913086, "learning_rate": 9.999999837513601e-05, "loss": 3.0887, "step": 1628 }, { "epoch": 0.04375738691307618, "grad_norm": 1.513839602470398, "learning_rate": 9.99999983731331e-05, "loss": 3.0853, "step": 1629 }, { "epoch": 0.043784248415171374, "grad_norm": 1.4729801416397095, "learning_rate": 9.999999837112895e-05, "loss": 2.98, "step": 1630 }, { "epoch": 0.043811109917266576, "grad_norm": 1.509283185005188, "learning_rate": 9.999999836912355e-05, "loss": 3.2404, "step": 1631 }, { "epoch": 0.04383797141936177, "grad_norm": 1.543927550315857, "learning_rate": 9.999999836711694e-05, "loss": 3.119, "step": 1632 }, { "epoch": 0.043864832921456966, "grad_norm": 1.5025025606155396, "learning_rate": 9.999999836510909e-05, "loss": 3.1039, "step": 1633 }, { "epoch": 0.04389169442355217, "grad_norm": 1.3757987022399902, "learning_rate": 9.999999836310001e-05, "loss": 2.8637, "step": 1634 }, { "epoch": 0.04391855592564736, "grad_norm": 1.5544795989990234, "learning_rate": 9.99999983610897e-05, "loss": 3.0113, "step": 1635 }, { "epoch": 0.04394541742774256, "grad_norm": 1.4375749826431274, "learning_rate": 9.999999835907815e-05, "loss": 2.9539, "step": 1636 }, { "epoch": 0.04397227892983776, "grad_norm": 1.557188868522644, "learning_rate": 9.999999835706537e-05, "loss": 3.1977, "step": 1637 }, { "epoch": 0.043999140431932954, "grad_norm": 1.6400185823440552, "learning_rate": 9.999999835505136e-05, "loss": 3.1995, "step": 1638 }, { "epoch": 0.04402600193402815, "grad_norm": 1.4655009508132935, "learning_rate": 9.99999983530361e-05, "loss": 2.9985, "step": 1639 }, { "epoch": 0.04405286343612335, "grad_norm": 1.4965097904205322, "learning_rate": 9.999999835101961e-05, "loss": 3.1393, "step": 1640 }, { "epoch": 0.044079724938218545, "grad_norm": 1.5797890424728394, "learning_rate": 9.999999834900189e-05, "loss": 3.0014, "step": 1641 }, { "epoch": 0.04410658644031374, "grad_norm": 1.567730188369751, "learning_rate": 9.999999834698295e-05, "loss": 2.9215, "step": 1642 }, { "epoch": 0.04413344794240894, "grad_norm": 1.6687631607055664, "learning_rate": 9.999999834496276e-05, "loss": 3.2531, "step": 1643 }, { "epoch": 0.04416030944450414, "grad_norm": 1.6272687911987305, "learning_rate": 9.999999834294133e-05, "loss": 3.3399, "step": 1644 }, { "epoch": 0.04418717094659933, "grad_norm": 1.6043704748153687, "learning_rate": 9.999999834091869e-05, "loss": 3.0772, "step": 1645 }, { "epoch": 0.04421403244869453, "grad_norm": 1.6957578659057617, "learning_rate": 9.999999833889479e-05, "loss": 3.3066, "step": 1646 }, { "epoch": 0.04424089395078973, "grad_norm": 1.7268688678741455, "learning_rate": 9.999999833686968e-05, "loss": 3.3125, "step": 1647 }, { "epoch": 0.04426775545288492, "grad_norm": 1.7406071424484253, "learning_rate": 9.999999833484333e-05, "loss": 3.2264, "step": 1648 }, { "epoch": 0.044294616954980125, "grad_norm": 1.8623542785644531, "learning_rate": 9.999999833281574e-05, "loss": 3.3522, "step": 1649 }, { "epoch": 0.04432147845707532, "grad_norm": 1.8407955169677734, "learning_rate": 9.999999833078691e-05, "loss": 3.3308, "step": 1650 }, { "epoch": 0.044348339959170514, "grad_norm": 1.4079015254974365, "learning_rate": 9.999999832875686e-05, "loss": 2.9442, "step": 1651 }, { "epoch": 0.044375201461265716, "grad_norm": 1.4412444829940796, "learning_rate": 9.999999832672557e-05, "loss": 3.038, "step": 1652 }, { "epoch": 0.04440206296336091, "grad_norm": 1.4149682521820068, "learning_rate": 9.999999832469304e-05, "loss": 2.7261, "step": 1653 }, { "epoch": 0.044428924465456106, "grad_norm": 1.411676287651062, "learning_rate": 9.99999983226593e-05, "loss": 2.9101, "step": 1654 }, { "epoch": 0.04445578596755131, "grad_norm": 1.470566987991333, "learning_rate": 9.99999983206243e-05, "loss": 2.77, "step": 1655 }, { "epoch": 0.0444826474696465, "grad_norm": 1.397863507270813, "learning_rate": 9.999999831858808e-05, "loss": 2.9582, "step": 1656 }, { "epoch": 0.0445095089717417, "grad_norm": 1.3953698873519897, "learning_rate": 9.999999831655063e-05, "loss": 3.0912, "step": 1657 }, { "epoch": 0.0445363704738369, "grad_norm": 1.3832037448883057, "learning_rate": 9.999999831451192e-05, "loss": 2.8913, "step": 1658 }, { "epoch": 0.044563231975932094, "grad_norm": 1.3912192583084106, "learning_rate": 9.9999998312472e-05, "loss": 2.6594, "step": 1659 }, { "epoch": 0.04459009347802729, "grad_norm": 1.3151278495788574, "learning_rate": 9.999999831043084e-05, "loss": 2.6635, "step": 1660 }, { "epoch": 0.04461695498012249, "grad_norm": 1.2868192195892334, "learning_rate": 9.999999830838846e-05, "loss": 2.7065, "step": 1661 }, { "epoch": 0.044643816482217685, "grad_norm": 1.387132167816162, "learning_rate": 9.999999830634483e-05, "loss": 2.6766, "step": 1662 }, { "epoch": 0.04467067798431288, "grad_norm": 1.4079433679580688, "learning_rate": 9.999999830429997e-05, "loss": 2.8444, "step": 1663 }, { "epoch": 0.04469753948640808, "grad_norm": 1.4303300380706787, "learning_rate": 9.999999830225387e-05, "loss": 2.9663, "step": 1664 }, { "epoch": 0.04472440098850328, "grad_norm": 1.39901864528656, "learning_rate": 9.999999830020654e-05, "loss": 2.9923, "step": 1665 }, { "epoch": 0.04475126249059847, "grad_norm": 1.4272572994232178, "learning_rate": 9.999999829815798e-05, "loss": 2.899, "step": 1666 }, { "epoch": 0.04477812399269367, "grad_norm": 1.492218017578125, "learning_rate": 9.99999982961082e-05, "loss": 3.0391, "step": 1667 }, { "epoch": 0.04480498549478887, "grad_norm": 1.450843095779419, "learning_rate": 9.999999829405716e-05, "loss": 2.9127, "step": 1668 }, { "epoch": 0.04483184699688406, "grad_norm": 1.3440508842468262, "learning_rate": 9.99999982920049e-05, "loss": 3.0001, "step": 1669 }, { "epoch": 0.044858708498979265, "grad_norm": 1.4402127265930176, "learning_rate": 9.999999828995141e-05, "loss": 2.819, "step": 1670 }, { "epoch": 0.04488557000107446, "grad_norm": 1.4429413080215454, "learning_rate": 9.999999828789667e-05, "loss": 3.0246, "step": 1671 }, { "epoch": 0.044912431503169654, "grad_norm": 2.6085400581359863, "learning_rate": 9.999999828584071e-05, "loss": 2.9273, "step": 1672 }, { "epoch": 0.044939293005264856, "grad_norm": 1.5993056297302246, "learning_rate": 9.999999828378352e-05, "loss": 3.2987, "step": 1673 }, { "epoch": 0.04496615450736005, "grad_norm": 1.5255417823791504, "learning_rate": 9.999999828172508e-05, "loss": 2.9552, "step": 1674 }, { "epoch": 0.044993016009455246, "grad_norm": 1.557896614074707, "learning_rate": 9.999999827966542e-05, "loss": 3.123, "step": 1675 }, { "epoch": 0.04501987751155045, "grad_norm": 1.558122992515564, "learning_rate": 9.999999827760452e-05, "loss": 2.9187, "step": 1676 }, { "epoch": 0.04504673901364564, "grad_norm": 1.425349473953247, "learning_rate": 9.999999827554239e-05, "loss": 2.899, "step": 1677 }, { "epoch": 0.04507360051574084, "grad_norm": 1.5926076173782349, "learning_rate": 9.999999827347903e-05, "loss": 3.2055, "step": 1678 }, { "epoch": 0.04510046201783604, "grad_norm": 1.6083916425704956, "learning_rate": 9.999999827141443e-05, "loss": 2.8889, "step": 1679 }, { "epoch": 0.045127323519931234, "grad_norm": 1.4613057374954224, "learning_rate": 9.999999826934859e-05, "loss": 2.968, "step": 1680 }, { "epoch": 0.04515418502202643, "grad_norm": 1.591672420501709, "learning_rate": 9.999999826728153e-05, "loss": 3.0252, "step": 1681 }, { "epoch": 0.04518104652412163, "grad_norm": 1.539560317993164, "learning_rate": 9.999999826521322e-05, "loss": 2.8637, "step": 1682 }, { "epoch": 0.045207908026216825, "grad_norm": 1.4949159622192383, "learning_rate": 9.99999982631437e-05, "loss": 3.05, "step": 1683 }, { "epoch": 0.04523476952831202, "grad_norm": 1.5287278890609741, "learning_rate": 9.999999826107294e-05, "loss": 3.0176, "step": 1684 }, { "epoch": 0.04526163103040722, "grad_norm": 1.4513976573944092, "learning_rate": 9.999999825900092e-05, "loss": 3.0043, "step": 1685 }, { "epoch": 0.04528849253250242, "grad_norm": 1.4938799142837524, "learning_rate": 9.99999982569277e-05, "loss": 2.998, "step": 1686 }, { "epoch": 0.04531535403459761, "grad_norm": 1.6789308786392212, "learning_rate": 9.999999825485323e-05, "loss": 3.108, "step": 1687 }, { "epoch": 0.04534221553669281, "grad_norm": 1.4213138818740845, "learning_rate": 9.999999825277752e-05, "loss": 3.0397, "step": 1688 }, { "epoch": 0.04536907703878801, "grad_norm": 1.5696649551391602, "learning_rate": 9.999999825070058e-05, "loss": 3.1145, "step": 1689 }, { "epoch": 0.0453959385408832, "grad_norm": 1.6566909551620483, "learning_rate": 9.999999824862241e-05, "loss": 3.3197, "step": 1690 }, { "epoch": 0.045422800042978405, "grad_norm": 1.591908574104309, "learning_rate": 9.999999824654302e-05, "loss": 2.9155, "step": 1691 }, { "epoch": 0.0454496615450736, "grad_norm": 1.6618692874908447, "learning_rate": 9.999999824446238e-05, "loss": 3.3152, "step": 1692 }, { "epoch": 0.045476523047168795, "grad_norm": 1.660921573638916, "learning_rate": 9.999999824238051e-05, "loss": 3.1987, "step": 1693 }, { "epoch": 0.045503384549263996, "grad_norm": 1.680649757385254, "learning_rate": 9.99999982402974e-05, "loss": 3.2139, "step": 1694 }, { "epoch": 0.04553024605135919, "grad_norm": 1.7884151935577393, "learning_rate": 9.999999823821307e-05, "loss": 3.2991, "step": 1695 }, { "epoch": 0.045557107553454386, "grad_norm": 1.6756354570388794, "learning_rate": 9.99999982361275e-05, "loss": 3.1816, "step": 1696 }, { "epoch": 0.04558396905554959, "grad_norm": 1.6229280233383179, "learning_rate": 9.999999823404069e-05, "loss": 3.2563, "step": 1697 }, { "epoch": 0.04561083055764478, "grad_norm": 1.625819444656372, "learning_rate": 9.999999823195265e-05, "loss": 3.1226, "step": 1698 }, { "epoch": 0.04563769205973998, "grad_norm": 1.7141685485839844, "learning_rate": 9.999999822986337e-05, "loss": 3.3355, "step": 1699 }, { "epoch": 0.04566455356183518, "grad_norm": 1.7747595310211182, "learning_rate": 9.999999822777286e-05, "loss": 3.2331, "step": 1700 } ], "logging_steps": 1, "max_steps": 20000000, "num_input_tokens_seen": 0, "num_train_epochs": 538, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.320207374503117e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }