{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 13863, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002164033758926639, "grad_norm": 1.1804725541377281, "learning_rate": 5e-06, "loss": 0.7931, "step": 10 }, { "epoch": 0.004328067517853278, "grad_norm": 1.0142507971826196, "learning_rate": 5e-06, "loss": 0.7085, "step": 20 }, { "epoch": 0.006492101276779918, "grad_norm": 1.0822486668830535, "learning_rate": 5e-06, "loss": 0.6971, "step": 30 }, { "epoch": 0.008656135035706556, "grad_norm": 0.9775955015066439, "learning_rate": 5e-06, "loss": 0.6931, "step": 40 }, { "epoch": 0.010820168794633196, "grad_norm": 1.0533969471863989, "learning_rate": 5e-06, "loss": 0.6748, "step": 50 }, { "epoch": 0.012984202553559835, "grad_norm": 0.9409532123493094, "learning_rate": 5e-06, "loss": 0.6582, "step": 60 }, { "epoch": 0.015148236312486475, "grad_norm": 0.8701409472453093, "learning_rate": 5e-06, "loss": 0.6612, "step": 70 }, { "epoch": 0.017312270071413113, "grad_norm": 0.8629522430209661, "learning_rate": 5e-06, "loss": 0.667, "step": 80 }, { "epoch": 0.019476303830339752, "grad_norm": 0.9005724399072088, "learning_rate": 5e-06, "loss": 0.6664, "step": 90 }, { "epoch": 0.02164033758926639, "grad_norm": 0.9031490400120155, "learning_rate": 5e-06, "loss": 0.6494, "step": 100 }, { "epoch": 0.02380437134819303, "grad_norm": 0.8491980446537303, "learning_rate": 5e-06, "loss": 0.6611, "step": 110 }, { "epoch": 0.02596840510711967, "grad_norm": 0.8544450914337819, "learning_rate": 5e-06, "loss": 0.6475, "step": 120 }, { "epoch": 0.02813243886604631, "grad_norm": 0.8940642349549588, "learning_rate": 5e-06, "loss": 0.6676, "step": 130 }, { "epoch": 0.03029647262497295, "grad_norm": 0.8624378682337847, "learning_rate": 5e-06, "loss": 0.6362, "step": 140 }, { "epoch": 0.032460506383899586, "grad_norm": 0.7937661473250804, "learning_rate": 5e-06, "loss": 0.6502, "step": 150 }, { "epoch": 0.034624540142826225, "grad_norm": 0.8511299785366766, "learning_rate": 5e-06, "loss": 0.6397, "step": 160 }, { "epoch": 0.036788573901752865, "grad_norm": 0.8247509378529784, "learning_rate": 5e-06, "loss": 0.6381, "step": 170 }, { "epoch": 0.038952607660679504, "grad_norm": 0.8606650790301921, "learning_rate": 5e-06, "loss": 0.6507, "step": 180 }, { "epoch": 0.041116641419606144, "grad_norm": 0.9000039308059945, "learning_rate": 5e-06, "loss": 0.6517, "step": 190 }, { "epoch": 0.04328067517853278, "grad_norm": 0.874046131263459, "learning_rate": 5e-06, "loss": 0.6372, "step": 200 }, { "epoch": 0.04544470893745942, "grad_norm": 0.826945338144314, "learning_rate": 5e-06, "loss": 0.6331, "step": 210 }, { "epoch": 0.04760874269638606, "grad_norm": 0.8647577576035517, "learning_rate": 5e-06, "loss": 0.6388, "step": 220 }, { "epoch": 0.0497727764553127, "grad_norm": 0.8357940501078124, "learning_rate": 5e-06, "loss": 0.6468, "step": 230 }, { "epoch": 0.05193681021423934, "grad_norm": 0.8205082815962632, "learning_rate": 5e-06, "loss": 0.6206, "step": 240 }, { "epoch": 0.05410084397316598, "grad_norm": 0.8029856499548153, "learning_rate": 5e-06, "loss": 0.6252, "step": 250 }, { "epoch": 0.05626487773209262, "grad_norm": 0.8266962137493432, "learning_rate": 5e-06, "loss": 0.6267, "step": 260 }, { "epoch": 0.05842891149101926, "grad_norm": 0.7760155385414025, "learning_rate": 5e-06, "loss": 0.6233, "step": 270 }, { "epoch": 0.0605929452499459, "grad_norm": 0.857439553904383, "learning_rate": 5e-06, "loss": 0.6247, "step": 280 }, { "epoch": 0.06275697900887253, "grad_norm": 0.8571645215071354, "learning_rate": 5e-06, "loss": 0.6291, "step": 290 }, { "epoch": 0.06492101276779917, "grad_norm": 0.8324700525072513, "learning_rate": 5e-06, "loss": 0.6313, "step": 300 }, { "epoch": 0.06708504652672581, "grad_norm": 0.7832243300229523, "learning_rate": 5e-06, "loss": 0.6329, "step": 310 }, { "epoch": 0.06924908028565245, "grad_norm": 0.7493686602077012, "learning_rate": 5e-06, "loss": 0.6198, "step": 320 }, { "epoch": 0.07141311404457909, "grad_norm": 0.8110396523299721, "learning_rate": 5e-06, "loss": 0.6268, "step": 330 }, { "epoch": 0.07357714780350573, "grad_norm": 0.7729143403225553, "learning_rate": 5e-06, "loss": 0.6131, "step": 340 }, { "epoch": 0.07574118156243237, "grad_norm": 0.844299608680804, "learning_rate": 5e-06, "loss": 0.628, "step": 350 }, { "epoch": 0.07790521532135901, "grad_norm": 0.8311658262532373, "learning_rate": 5e-06, "loss": 0.6284, "step": 360 }, { "epoch": 0.08006924908028565, "grad_norm": 0.7958716863199661, "learning_rate": 5e-06, "loss": 0.6166, "step": 370 }, { "epoch": 0.08223328283921229, "grad_norm": 0.8412185496400149, "learning_rate": 5e-06, "loss": 0.6458, "step": 380 }, { "epoch": 0.08439731659813893, "grad_norm": 0.7950152179064658, "learning_rate": 5e-06, "loss": 0.625, "step": 390 }, { "epoch": 0.08656135035706557, "grad_norm": 0.7728715222121862, "learning_rate": 5e-06, "loss": 0.6246, "step": 400 }, { "epoch": 0.0887253841159922, "grad_norm": 0.806073545694218, "learning_rate": 5e-06, "loss": 0.6315, "step": 410 }, { "epoch": 0.09088941787491885, "grad_norm": 0.8489148782720433, "learning_rate": 5e-06, "loss": 0.6145, "step": 420 }, { "epoch": 0.09305345163384549, "grad_norm": 0.7918376736911522, "learning_rate": 5e-06, "loss": 0.6192, "step": 430 }, { "epoch": 0.09521748539277212, "grad_norm": 0.8400111831435656, "learning_rate": 5e-06, "loss": 0.621, "step": 440 }, { "epoch": 0.09738151915169876, "grad_norm": 0.7614028740715489, "learning_rate": 5e-06, "loss": 0.605, "step": 450 }, { "epoch": 0.0995455529106254, "grad_norm": 0.8577505860556417, "learning_rate": 5e-06, "loss": 0.607, "step": 460 }, { "epoch": 0.10170958666955204, "grad_norm": 0.8729565243092611, "learning_rate": 5e-06, "loss": 0.6229, "step": 470 }, { "epoch": 0.10387362042847868, "grad_norm": 0.822547387381443, "learning_rate": 5e-06, "loss": 0.6213, "step": 480 }, { "epoch": 0.10603765418740532, "grad_norm": 0.7797820836510193, "learning_rate": 5e-06, "loss": 0.6055, "step": 490 }, { "epoch": 0.10820168794633196, "grad_norm": 0.8039389365692253, "learning_rate": 5e-06, "loss": 0.6158, "step": 500 }, { "epoch": 0.1103657217052586, "grad_norm": 0.8014536678260006, "learning_rate": 5e-06, "loss": 0.6078, "step": 510 }, { "epoch": 0.11252975546418524, "grad_norm": 0.8655111256073293, "learning_rate": 5e-06, "loss": 0.6279, "step": 520 }, { "epoch": 0.11469378922311188, "grad_norm": 0.872957811479719, "learning_rate": 5e-06, "loss": 0.6172, "step": 530 }, { "epoch": 0.11685782298203852, "grad_norm": 0.8819039368989797, "learning_rate": 5e-06, "loss": 0.6121, "step": 540 }, { "epoch": 0.11902185674096516, "grad_norm": 0.8640564878023476, "learning_rate": 5e-06, "loss": 0.5962, "step": 550 }, { "epoch": 0.1211858904998918, "grad_norm": 0.8207671083993288, "learning_rate": 5e-06, "loss": 0.6061, "step": 560 }, { "epoch": 0.12334992425881844, "grad_norm": 0.8639259231144906, "learning_rate": 5e-06, "loss": 0.6102, "step": 570 }, { "epoch": 0.12551395801774506, "grad_norm": 0.8252682175137697, "learning_rate": 5e-06, "loss": 0.5919, "step": 580 }, { "epoch": 0.12767799177667172, "grad_norm": 0.8178974092958998, "learning_rate": 5e-06, "loss": 0.6166, "step": 590 }, { "epoch": 0.12984202553559834, "grad_norm": 0.8895228285922248, "learning_rate": 5e-06, "loss": 0.6098, "step": 600 }, { "epoch": 0.132006059294525, "grad_norm": 0.7969967604154639, "learning_rate": 5e-06, "loss": 0.6077, "step": 610 }, { "epoch": 0.13417009305345162, "grad_norm": 0.8683433820973785, "learning_rate": 5e-06, "loss": 0.6099, "step": 620 }, { "epoch": 0.13633412681237828, "grad_norm": 0.8063824032551594, "learning_rate": 5e-06, "loss": 0.6127, "step": 630 }, { "epoch": 0.1384981605713049, "grad_norm": 0.8136662571064663, "learning_rate": 5e-06, "loss": 0.5968, "step": 640 }, { "epoch": 0.14066219433023155, "grad_norm": 0.822893712320517, "learning_rate": 5e-06, "loss": 0.6038, "step": 650 }, { "epoch": 0.14282622808915818, "grad_norm": 0.8485980406618856, "learning_rate": 5e-06, "loss": 0.6127, "step": 660 }, { "epoch": 0.14499026184808483, "grad_norm": 0.8396390754266708, "learning_rate": 5e-06, "loss": 0.6105, "step": 670 }, { "epoch": 0.14715429560701146, "grad_norm": 0.863650354540203, "learning_rate": 5e-06, "loss": 0.5947, "step": 680 }, { "epoch": 0.1493183293659381, "grad_norm": 0.8767137343250653, "learning_rate": 5e-06, "loss": 0.5944, "step": 690 }, { "epoch": 0.15148236312486474, "grad_norm": 0.7785697177833184, "learning_rate": 5e-06, "loss": 0.5957, "step": 700 }, { "epoch": 0.1536463968837914, "grad_norm": 0.7903385744648517, "learning_rate": 5e-06, "loss": 0.5887, "step": 710 }, { "epoch": 0.15581043064271802, "grad_norm": 0.8304334914378555, "learning_rate": 5e-06, "loss": 0.5972, "step": 720 }, { "epoch": 0.15797446440164467, "grad_norm": 0.8032146774897264, "learning_rate": 5e-06, "loss": 0.6055, "step": 730 }, { "epoch": 0.1601384981605713, "grad_norm": 0.818462114006868, "learning_rate": 5e-06, "loss": 0.6086, "step": 740 }, { "epoch": 0.16230253191949795, "grad_norm": 0.7999227700105283, "learning_rate": 5e-06, "loss": 0.6, "step": 750 }, { "epoch": 0.16446656567842458, "grad_norm": 0.779725395098173, "learning_rate": 5e-06, "loss": 0.581, "step": 760 }, { "epoch": 0.16663059943735123, "grad_norm": 0.8501680445359217, "learning_rate": 5e-06, "loss": 0.6034, "step": 770 }, { "epoch": 0.16879463319627785, "grad_norm": 0.7955262980950765, "learning_rate": 5e-06, "loss": 0.591, "step": 780 }, { "epoch": 0.1709586669552045, "grad_norm": 0.7969375284741476, "learning_rate": 5e-06, "loss": 0.5917, "step": 790 }, { "epoch": 0.17312270071413113, "grad_norm": 0.7651500787357158, "learning_rate": 5e-06, "loss": 0.5925, "step": 800 }, { "epoch": 0.1752867344730578, "grad_norm": 0.7869498336245857, "learning_rate": 5e-06, "loss": 0.5919, "step": 810 }, { "epoch": 0.1774507682319844, "grad_norm": 0.8171995660498751, "learning_rate": 5e-06, "loss": 0.5938, "step": 820 }, { "epoch": 0.17961480199091107, "grad_norm": 0.7836587740083709, "learning_rate": 5e-06, "loss": 0.5786, "step": 830 }, { "epoch": 0.1817788357498377, "grad_norm": 0.8784878650037878, "learning_rate": 5e-06, "loss": 0.6044, "step": 840 }, { "epoch": 0.18394286950876435, "grad_norm": 0.800243869063219, "learning_rate": 5e-06, "loss": 0.5889, "step": 850 }, { "epoch": 0.18610690326769097, "grad_norm": 0.8018420320157924, "learning_rate": 5e-06, "loss": 0.6087, "step": 860 }, { "epoch": 0.18827093702661762, "grad_norm": 0.8180751714275638, "learning_rate": 5e-06, "loss": 0.6037, "step": 870 }, { "epoch": 0.19043497078554425, "grad_norm": 0.785600856617542, "learning_rate": 5e-06, "loss": 0.5837, "step": 880 }, { "epoch": 0.1925990045444709, "grad_norm": 0.8412200127939977, "learning_rate": 5e-06, "loss": 0.5896, "step": 890 }, { "epoch": 0.19476303830339753, "grad_norm": 0.8025538669907342, "learning_rate": 5e-06, "loss": 0.5991, "step": 900 }, { "epoch": 0.19692707206232418, "grad_norm": 0.7720798100780025, "learning_rate": 5e-06, "loss": 0.5891, "step": 910 }, { "epoch": 0.1990911058212508, "grad_norm": 0.8661764118040983, "learning_rate": 5e-06, "loss": 0.5931, "step": 920 }, { "epoch": 0.20125513958017746, "grad_norm": 0.8120743515574185, "learning_rate": 5e-06, "loss": 0.5957, "step": 930 }, { "epoch": 0.2034191733391041, "grad_norm": 0.8345160783897795, "learning_rate": 5e-06, "loss": 0.5984, "step": 940 }, { "epoch": 0.20558320709803074, "grad_norm": 0.811502391003602, "learning_rate": 5e-06, "loss": 0.6081, "step": 950 }, { "epoch": 0.20774724085695737, "grad_norm": 0.7825461582026372, "learning_rate": 5e-06, "loss": 0.5948, "step": 960 }, { "epoch": 0.20991127461588402, "grad_norm": 0.8055902800373363, "learning_rate": 5e-06, "loss": 0.5982, "step": 970 }, { "epoch": 0.21207530837481064, "grad_norm": 0.8388084051375468, "learning_rate": 5e-06, "loss": 0.5893, "step": 980 }, { "epoch": 0.2142393421337373, "grad_norm": 0.7679133259037194, "learning_rate": 5e-06, "loss": 0.5907, "step": 990 }, { "epoch": 0.21640337589266392, "grad_norm": 0.8624733193059415, "learning_rate": 5e-06, "loss": 0.5814, "step": 1000 }, { "epoch": 0.21856740965159058, "grad_norm": 0.7984486425858306, "learning_rate": 5e-06, "loss": 0.5913, "step": 1010 }, { "epoch": 0.2207314434105172, "grad_norm": 0.8510028087137639, "learning_rate": 5e-06, "loss": 0.5949, "step": 1020 }, { "epoch": 0.22289547716944386, "grad_norm": 0.7996439613799075, "learning_rate": 5e-06, "loss": 0.601, "step": 1030 }, { "epoch": 0.22505951092837048, "grad_norm": 0.8088661553876958, "learning_rate": 5e-06, "loss": 0.6074, "step": 1040 }, { "epoch": 0.22722354468729714, "grad_norm": 0.8163892550046303, "learning_rate": 5e-06, "loss": 0.5863, "step": 1050 }, { "epoch": 0.22938757844622376, "grad_norm": 0.9418880337035802, "learning_rate": 5e-06, "loss": 0.6033, "step": 1060 }, { "epoch": 0.2315516122051504, "grad_norm": 0.8308760617654204, "learning_rate": 5e-06, "loss": 0.603, "step": 1070 }, { "epoch": 0.23371564596407704, "grad_norm": 0.8145198410135479, "learning_rate": 5e-06, "loss": 0.5947, "step": 1080 }, { "epoch": 0.23587967972300367, "grad_norm": 0.8623357006048706, "learning_rate": 5e-06, "loss": 0.6056, "step": 1090 }, { "epoch": 0.23804371348193032, "grad_norm": 0.7965551896548404, "learning_rate": 5e-06, "loss": 0.5923, "step": 1100 }, { "epoch": 0.24020774724085694, "grad_norm": 0.8052327006114539, "learning_rate": 5e-06, "loss": 0.6007, "step": 1110 }, { "epoch": 0.2423717809997836, "grad_norm": 0.8213993699298745, "learning_rate": 5e-06, "loss": 0.5797, "step": 1120 }, { "epoch": 0.24453581475871022, "grad_norm": 0.8149866187723401, "learning_rate": 5e-06, "loss": 0.5941, "step": 1130 }, { "epoch": 0.24669984851763688, "grad_norm": 0.7837216156150254, "learning_rate": 5e-06, "loss": 0.59, "step": 1140 }, { "epoch": 0.2488638822765635, "grad_norm": 0.8126197328739624, "learning_rate": 5e-06, "loss": 0.5952, "step": 1150 }, { "epoch": 0.25102791603549013, "grad_norm": 0.8221669781147581, "learning_rate": 5e-06, "loss": 0.6005, "step": 1160 }, { "epoch": 0.2531919497944168, "grad_norm": 0.7963300719869713, "learning_rate": 5e-06, "loss": 0.5877, "step": 1170 }, { "epoch": 0.25535598355334344, "grad_norm": 0.788058435618901, "learning_rate": 5e-06, "loss": 0.591, "step": 1180 }, { "epoch": 0.2575200173122701, "grad_norm": 0.7519308880120247, "learning_rate": 5e-06, "loss": 0.5928, "step": 1190 }, { "epoch": 0.2596840510711967, "grad_norm": 0.8266442618730233, "learning_rate": 5e-06, "loss": 0.5839, "step": 1200 }, { "epoch": 0.26184808483012334, "grad_norm": 0.8292557026765464, "learning_rate": 5e-06, "loss": 0.5865, "step": 1210 }, { "epoch": 0.26401211858905, "grad_norm": 0.8853558777310622, "learning_rate": 5e-06, "loss": 0.5983, "step": 1220 }, { "epoch": 0.26617615234797665, "grad_norm": 0.9109356382425066, "learning_rate": 5e-06, "loss": 0.5977, "step": 1230 }, { "epoch": 0.26834018610690324, "grad_norm": 0.89504318082267, "learning_rate": 5e-06, "loss": 0.592, "step": 1240 }, { "epoch": 0.2705042198658299, "grad_norm": 0.8160219829275853, "learning_rate": 5e-06, "loss": 0.593, "step": 1250 }, { "epoch": 0.27266825362475655, "grad_norm": 0.8076508336570862, "learning_rate": 5e-06, "loss": 0.5856, "step": 1260 }, { "epoch": 0.2748322873836832, "grad_norm": 0.8022139372756553, "learning_rate": 5e-06, "loss": 0.5747, "step": 1270 }, { "epoch": 0.2769963211426098, "grad_norm": 0.8145806416794947, "learning_rate": 5e-06, "loss": 0.5887, "step": 1280 }, { "epoch": 0.27916035490153646, "grad_norm": 0.7591150136319158, "learning_rate": 5e-06, "loss": 0.5732, "step": 1290 }, { "epoch": 0.2813243886604631, "grad_norm": 0.8140968921103322, "learning_rate": 5e-06, "loss": 0.5912, "step": 1300 }, { "epoch": 0.28348842241938976, "grad_norm": 0.7944048590141184, "learning_rate": 5e-06, "loss": 0.5895, "step": 1310 }, { "epoch": 0.28565245617831636, "grad_norm": 0.8789105404070925, "learning_rate": 5e-06, "loss": 0.5747, "step": 1320 }, { "epoch": 0.287816489937243, "grad_norm": 0.7796055521428117, "learning_rate": 5e-06, "loss": 0.5881, "step": 1330 }, { "epoch": 0.28998052369616967, "grad_norm": 0.817568265263925, "learning_rate": 5e-06, "loss": 0.5818, "step": 1340 }, { "epoch": 0.2921445574550963, "grad_norm": 0.7781293420250672, "learning_rate": 5e-06, "loss": 0.5912, "step": 1350 }, { "epoch": 0.2943085912140229, "grad_norm": 0.8504639244998208, "learning_rate": 5e-06, "loss": 0.5914, "step": 1360 }, { "epoch": 0.29647262497294957, "grad_norm": 0.848929905244684, "learning_rate": 5e-06, "loss": 0.5705, "step": 1370 }, { "epoch": 0.2986366587318762, "grad_norm": 0.8401835594137422, "learning_rate": 5e-06, "loss": 0.5859, "step": 1380 }, { "epoch": 0.3008006924908029, "grad_norm": 0.8430038720121172, "learning_rate": 5e-06, "loss": 0.5909, "step": 1390 }, { "epoch": 0.3029647262497295, "grad_norm": 0.7925617711932909, "learning_rate": 5e-06, "loss": 0.5822, "step": 1400 }, { "epoch": 0.30512876000865613, "grad_norm": 0.8081455173268478, "learning_rate": 5e-06, "loss": 0.5745, "step": 1410 }, { "epoch": 0.3072927937675828, "grad_norm": 0.7903132825985777, "learning_rate": 5e-06, "loss": 0.5759, "step": 1420 }, { "epoch": 0.30945682752650944, "grad_norm": 0.8243849826447428, "learning_rate": 5e-06, "loss": 0.5968, "step": 1430 }, { "epoch": 0.31162086128543603, "grad_norm": 0.7603966355823882, "learning_rate": 5e-06, "loss": 0.5873, "step": 1440 }, { "epoch": 0.3137848950443627, "grad_norm": 0.801455870233685, "learning_rate": 5e-06, "loss": 0.5786, "step": 1450 }, { "epoch": 0.31594892880328934, "grad_norm": 0.7686291740407882, "learning_rate": 5e-06, "loss": 0.5789, "step": 1460 }, { "epoch": 0.318112962562216, "grad_norm": 0.8554910506261316, "learning_rate": 5e-06, "loss": 0.5795, "step": 1470 }, { "epoch": 0.3202769963211426, "grad_norm": 0.820274584453091, "learning_rate": 5e-06, "loss": 0.5756, "step": 1480 }, { "epoch": 0.32244103008006925, "grad_norm": 0.835330743611247, "learning_rate": 5e-06, "loss": 0.5848, "step": 1490 }, { "epoch": 0.3246050638389959, "grad_norm": 0.8067787217591124, "learning_rate": 5e-06, "loss": 0.5651, "step": 1500 }, { "epoch": 0.32676909759792255, "grad_norm": 0.8055298656551628, "learning_rate": 5e-06, "loss": 0.5806, "step": 1510 }, { "epoch": 0.32893313135684915, "grad_norm": 0.7643337696325749, "learning_rate": 5e-06, "loss": 0.5836, "step": 1520 }, { "epoch": 0.3310971651157758, "grad_norm": 0.802754989395744, "learning_rate": 5e-06, "loss": 0.577, "step": 1530 }, { "epoch": 0.33326119887470246, "grad_norm": 0.8227460460988874, "learning_rate": 5e-06, "loss": 0.5781, "step": 1540 }, { "epoch": 0.3354252326336291, "grad_norm": 0.8157555450231291, "learning_rate": 5e-06, "loss": 0.5915, "step": 1550 }, { "epoch": 0.3375892663925557, "grad_norm": 0.8178511979744351, "learning_rate": 5e-06, "loss": 0.5844, "step": 1560 }, { "epoch": 0.33975330015148236, "grad_norm": 0.8640976250921749, "learning_rate": 5e-06, "loss": 0.5867, "step": 1570 }, { "epoch": 0.341917333910409, "grad_norm": 0.8225622251467227, "learning_rate": 5e-06, "loss": 0.5626, "step": 1580 }, { "epoch": 0.34408136766933567, "grad_norm": 0.809101202716206, "learning_rate": 5e-06, "loss": 0.5841, "step": 1590 }, { "epoch": 0.34624540142826227, "grad_norm": 0.8007366862509202, "learning_rate": 5e-06, "loss": 0.6002, "step": 1600 }, { "epoch": 0.3484094351871889, "grad_norm": 0.80921682472726, "learning_rate": 5e-06, "loss": 0.5756, "step": 1610 }, { "epoch": 0.3505734689461156, "grad_norm": 0.7919027933880338, "learning_rate": 5e-06, "loss": 0.5811, "step": 1620 }, { "epoch": 0.35273750270504217, "grad_norm": 0.8578709720525358, "learning_rate": 5e-06, "loss": 0.5844, "step": 1630 }, { "epoch": 0.3549015364639688, "grad_norm": 0.8919305010438147, "learning_rate": 5e-06, "loss": 0.5867, "step": 1640 }, { "epoch": 0.3570655702228955, "grad_norm": 0.8360856901044694, "learning_rate": 5e-06, "loss": 0.5959, "step": 1650 }, { "epoch": 0.35922960398182213, "grad_norm": 0.8675129695665484, "learning_rate": 5e-06, "loss": 0.5761, "step": 1660 }, { "epoch": 0.36139363774074873, "grad_norm": 0.7689087827995047, "learning_rate": 5e-06, "loss": 0.5861, "step": 1670 }, { "epoch": 0.3635576714996754, "grad_norm": 0.7720960735276896, "learning_rate": 5e-06, "loss": 0.5743, "step": 1680 }, { "epoch": 0.36572170525860204, "grad_norm": 0.7721866885573503, "learning_rate": 5e-06, "loss": 0.59, "step": 1690 }, { "epoch": 0.3678857390175287, "grad_norm": 0.7882290467361662, "learning_rate": 5e-06, "loss": 0.5755, "step": 1700 }, { "epoch": 0.3700497727764553, "grad_norm": 0.7556952201847303, "learning_rate": 5e-06, "loss": 0.5665, "step": 1710 }, { "epoch": 0.37221380653538194, "grad_norm": 0.791031843589153, "learning_rate": 5e-06, "loss": 0.5771, "step": 1720 }, { "epoch": 0.3743778402943086, "grad_norm": 0.7809696368341134, "learning_rate": 5e-06, "loss": 0.5653, "step": 1730 }, { "epoch": 0.37654187405323525, "grad_norm": 0.8629957045419663, "learning_rate": 5e-06, "loss": 0.5744, "step": 1740 }, { "epoch": 0.37870590781216185, "grad_norm": 0.735264733320356, "learning_rate": 5e-06, "loss": 0.568, "step": 1750 }, { "epoch": 0.3808699415710885, "grad_norm": 0.79820790046158, "learning_rate": 5e-06, "loss": 0.5756, "step": 1760 }, { "epoch": 0.38303397533001515, "grad_norm": 0.7990867854378455, "learning_rate": 5e-06, "loss": 0.5741, "step": 1770 }, { "epoch": 0.3851980090889418, "grad_norm": 0.8121981810970615, "learning_rate": 5e-06, "loss": 0.5784, "step": 1780 }, { "epoch": 0.3873620428478684, "grad_norm": 0.8133852852205034, "learning_rate": 5e-06, "loss": 0.5843, "step": 1790 }, { "epoch": 0.38952607660679506, "grad_norm": 0.8244168262600475, "learning_rate": 5e-06, "loss": 0.5711, "step": 1800 }, { "epoch": 0.3916901103657217, "grad_norm": 0.7650337026676514, "learning_rate": 5e-06, "loss": 0.567, "step": 1810 }, { "epoch": 0.39385414412464836, "grad_norm": 0.7884841014495525, "learning_rate": 5e-06, "loss": 0.587, "step": 1820 }, { "epoch": 0.39601817788357496, "grad_norm": 0.8218222575278361, "learning_rate": 5e-06, "loss": 0.5729, "step": 1830 }, { "epoch": 0.3981822116425016, "grad_norm": 0.7928094425107333, "learning_rate": 5e-06, "loss": 0.5677, "step": 1840 }, { "epoch": 0.40034624540142827, "grad_norm": 0.7768628011319236, "learning_rate": 5e-06, "loss": 0.5914, "step": 1850 }, { "epoch": 0.4025102791603549, "grad_norm": 0.7947010305390309, "learning_rate": 5e-06, "loss": 0.5748, "step": 1860 }, { "epoch": 0.4046743129192815, "grad_norm": 0.7697471130550405, "learning_rate": 5e-06, "loss": 0.5759, "step": 1870 }, { "epoch": 0.4068383466782082, "grad_norm": 0.796128902115479, "learning_rate": 5e-06, "loss": 0.5734, "step": 1880 }, { "epoch": 0.4090023804371348, "grad_norm": 0.7659679821804417, "learning_rate": 5e-06, "loss": 0.5631, "step": 1890 }, { "epoch": 0.4111664141960615, "grad_norm": 0.8580458852557561, "learning_rate": 5e-06, "loss": 0.5765, "step": 1900 }, { "epoch": 0.4133304479549881, "grad_norm": 0.7957186485957107, "learning_rate": 5e-06, "loss": 0.5658, "step": 1910 }, { "epoch": 0.41549448171391473, "grad_norm": 0.7853295564565586, "learning_rate": 5e-06, "loss": 0.5621, "step": 1920 }, { "epoch": 0.4176585154728414, "grad_norm": 0.720431628977774, "learning_rate": 5e-06, "loss": 0.565, "step": 1930 }, { "epoch": 0.41982254923176804, "grad_norm": 0.7826448016952756, "learning_rate": 5e-06, "loss": 0.5677, "step": 1940 }, { "epoch": 0.42198658299069464, "grad_norm": 0.7904824818785057, "learning_rate": 5e-06, "loss": 0.567, "step": 1950 }, { "epoch": 0.4241506167496213, "grad_norm": 0.7871987194350701, "learning_rate": 5e-06, "loss": 0.5812, "step": 1960 }, { "epoch": 0.42631465050854794, "grad_norm": 0.7967155088578817, "learning_rate": 5e-06, "loss": 0.559, "step": 1970 }, { "epoch": 0.4284786842674746, "grad_norm": 0.8505915926157366, "learning_rate": 5e-06, "loss": 0.5757, "step": 1980 }, { "epoch": 0.4306427180264012, "grad_norm": 0.7667510030206882, "learning_rate": 5e-06, "loss": 0.557, "step": 1990 }, { "epoch": 0.43280675178532785, "grad_norm": 0.8161411756414166, "learning_rate": 5e-06, "loss": 0.5886, "step": 2000 }, { "epoch": 0.4349707855442545, "grad_norm": 0.8557821729072761, "learning_rate": 5e-06, "loss": 0.5815, "step": 2010 }, { "epoch": 0.43713481930318115, "grad_norm": 0.7980272351612475, "learning_rate": 5e-06, "loss": 0.5782, "step": 2020 }, { "epoch": 0.43929885306210775, "grad_norm": 0.7855484364592483, "learning_rate": 5e-06, "loss": 0.565, "step": 2030 }, { "epoch": 0.4414628868210344, "grad_norm": 0.742503374275201, "learning_rate": 5e-06, "loss": 0.5726, "step": 2040 }, { "epoch": 0.44362692057996106, "grad_norm": 0.8265176138612144, "learning_rate": 5e-06, "loss": 0.565, "step": 2050 }, { "epoch": 0.4457909543388877, "grad_norm": 0.7910125817205834, "learning_rate": 5e-06, "loss": 0.5738, "step": 2060 }, { "epoch": 0.4479549880978143, "grad_norm": 0.7624732216492297, "learning_rate": 5e-06, "loss": 0.5539, "step": 2070 }, { "epoch": 0.45011902185674096, "grad_norm": 0.8024443330805618, "learning_rate": 5e-06, "loss": 0.5684, "step": 2080 }, { "epoch": 0.4522830556156676, "grad_norm": 0.8252242841372383, "learning_rate": 5e-06, "loss": 0.5638, "step": 2090 }, { "epoch": 0.45444708937459427, "grad_norm": 0.7918799931003818, "learning_rate": 5e-06, "loss": 0.5656, "step": 2100 }, { "epoch": 0.45661112313352087, "grad_norm": 0.8503297988500775, "learning_rate": 5e-06, "loss": 0.5878, "step": 2110 }, { "epoch": 0.4587751568924475, "grad_norm": 0.7881054994019235, "learning_rate": 5e-06, "loss": 0.5688, "step": 2120 }, { "epoch": 0.4609391906513742, "grad_norm": 0.8298283284928384, "learning_rate": 5e-06, "loss": 0.5662, "step": 2130 }, { "epoch": 0.4631032244103008, "grad_norm": 2.6984894013557374, "learning_rate": 5e-06, "loss": 0.5691, "step": 2140 }, { "epoch": 0.4652672581692274, "grad_norm": 0.8293615927885423, "learning_rate": 5e-06, "loss": 0.5624, "step": 2150 }, { "epoch": 0.4674312919281541, "grad_norm": 0.7908261626595905, "learning_rate": 5e-06, "loss": 0.5601, "step": 2160 }, { "epoch": 0.46959532568708073, "grad_norm": 0.8407395655282288, "learning_rate": 5e-06, "loss": 0.5789, "step": 2170 }, { "epoch": 0.47175935944600733, "grad_norm": 0.7855312267072224, "learning_rate": 5e-06, "loss": 0.5592, "step": 2180 }, { "epoch": 0.473923393204934, "grad_norm": 0.8177276022998019, "learning_rate": 5e-06, "loss": 0.5642, "step": 2190 }, { "epoch": 0.47608742696386064, "grad_norm": 0.7674903085772374, "learning_rate": 5e-06, "loss": 0.559, "step": 2200 }, { "epoch": 0.4782514607227873, "grad_norm": 0.8138473478647174, "learning_rate": 5e-06, "loss": 0.5744, "step": 2210 }, { "epoch": 0.4804154944817139, "grad_norm": 0.7985572678524506, "learning_rate": 5e-06, "loss": 0.5608, "step": 2220 }, { "epoch": 0.48257952824064054, "grad_norm": 0.7978238400519106, "learning_rate": 5e-06, "loss": 0.56, "step": 2230 }, { "epoch": 0.4847435619995672, "grad_norm": 0.786485961749514, "learning_rate": 5e-06, "loss": 0.5806, "step": 2240 }, { "epoch": 0.48690759575849385, "grad_norm": 0.7835696169083621, "learning_rate": 5e-06, "loss": 0.5813, "step": 2250 }, { "epoch": 0.48907162951742045, "grad_norm": 0.8038286577487808, "learning_rate": 5e-06, "loss": 0.5787, "step": 2260 }, { "epoch": 0.4912356632763471, "grad_norm": 0.7949579187720448, "learning_rate": 5e-06, "loss": 0.5603, "step": 2270 }, { "epoch": 0.49339969703527375, "grad_norm": 0.7996876857887395, "learning_rate": 5e-06, "loss": 0.564, "step": 2280 }, { "epoch": 0.4955637307942004, "grad_norm": 0.7523979963095976, "learning_rate": 5e-06, "loss": 0.572, "step": 2290 }, { "epoch": 0.497727764553127, "grad_norm": 0.7799060855112318, "learning_rate": 5e-06, "loss": 0.5512, "step": 2300 }, { "epoch": 0.49989179831205366, "grad_norm": 0.8354130726326037, "learning_rate": 5e-06, "loss": 0.5672, "step": 2310 }, { "epoch": 0.5020558320709803, "grad_norm": 0.8174289167975042, "learning_rate": 5e-06, "loss": 0.5745, "step": 2320 }, { "epoch": 0.504219865829907, "grad_norm": 0.7797704636452041, "learning_rate": 5e-06, "loss": 0.5785, "step": 2330 }, { "epoch": 0.5063838995888336, "grad_norm": 0.8049184278922541, "learning_rate": 5e-06, "loss": 0.5663, "step": 2340 }, { "epoch": 0.5085479333477603, "grad_norm": 0.79714737630827, "learning_rate": 5e-06, "loss": 0.5696, "step": 2350 }, { "epoch": 0.5107119671066869, "grad_norm": 0.8079273656180676, "learning_rate": 5e-06, "loss": 0.5611, "step": 2360 }, { "epoch": 0.5128760008656135, "grad_norm": 0.7996864982834264, "learning_rate": 5e-06, "loss": 0.5634, "step": 2370 }, { "epoch": 0.5150400346245402, "grad_norm": 0.7392249752105758, "learning_rate": 5e-06, "loss": 0.5441, "step": 2380 }, { "epoch": 0.5172040683834668, "grad_norm": 0.8235210489123114, "learning_rate": 5e-06, "loss": 0.5616, "step": 2390 }, { "epoch": 0.5193681021423934, "grad_norm": 0.7859990977815138, "learning_rate": 5e-06, "loss": 0.5738, "step": 2400 }, { "epoch": 0.5215321359013201, "grad_norm": 0.8006842059420661, "learning_rate": 5e-06, "loss": 0.5669, "step": 2410 }, { "epoch": 0.5236961696602467, "grad_norm": 0.8301226308008953, "learning_rate": 5e-06, "loss": 0.5585, "step": 2420 }, { "epoch": 0.5258602034191734, "grad_norm": 0.8116242656076512, "learning_rate": 5e-06, "loss": 0.5575, "step": 2430 }, { "epoch": 0.5280242371781, "grad_norm": 0.7951327769075789, "learning_rate": 5e-06, "loss": 0.5623, "step": 2440 }, { "epoch": 0.5301882709370266, "grad_norm": 0.7632924959396983, "learning_rate": 5e-06, "loss": 0.5557, "step": 2450 }, { "epoch": 0.5323523046959533, "grad_norm": 0.7703765895080564, "learning_rate": 5e-06, "loss": 0.5674, "step": 2460 }, { "epoch": 0.5345163384548799, "grad_norm": 0.7519955078038866, "learning_rate": 5e-06, "loss": 0.5517, "step": 2470 }, { "epoch": 0.5366803722138065, "grad_norm": 0.777945289031897, "learning_rate": 5e-06, "loss": 0.5503, "step": 2480 }, { "epoch": 0.5388444059727332, "grad_norm": 0.7925318160395382, "learning_rate": 5e-06, "loss": 0.5657, "step": 2490 }, { "epoch": 0.5410084397316598, "grad_norm": 0.8170912915049142, "learning_rate": 5e-06, "loss": 0.5666, "step": 2500 }, { "epoch": 0.5431724734905865, "grad_norm": 0.7848624682678103, "learning_rate": 5e-06, "loss": 0.5644, "step": 2510 }, { "epoch": 0.5453365072495131, "grad_norm": 0.8305872418987535, "learning_rate": 5e-06, "loss": 0.5579, "step": 2520 }, { "epoch": 0.5475005410084397, "grad_norm": 0.754778805221588, "learning_rate": 5e-06, "loss": 0.5447, "step": 2530 }, { "epoch": 0.5496645747673664, "grad_norm": 0.8433164922898455, "learning_rate": 5e-06, "loss": 0.577, "step": 2540 }, { "epoch": 0.551828608526293, "grad_norm": 0.7364207026132309, "learning_rate": 5e-06, "loss": 0.5502, "step": 2550 }, { "epoch": 0.5539926422852196, "grad_norm": 0.7703003076672548, "learning_rate": 5e-06, "loss": 0.5391, "step": 2560 }, { "epoch": 0.5561566760441463, "grad_norm": 0.7874453563557188, "learning_rate": 5e-06, "loss": 0.5579, "step": 2570 }, { "epoch": 0.5583207098030729, "grad_norm": 0.7946629512850162, "learning_rate": 5e-06, "loss": 0.5667, "step": 2580 }, { "epoch": 0.5604847435619996, "grad_norm": 0.7841342441931893, "learning_rate": 5e-06, "loss": 0.5552, "step": 2590 }, { "epoch": 0.5626487773209262, "grad_norm": 0.8770518493960907, "learning_rate": 5e-06, "loss": 0.5654, "step": 2600 }, { "epoch": 0.5648128110798528, "grad_norm": 0.7612803825837043, "learning_rate": 5e-06, "loss": 0.5481, "step": 2610 }, { "epoch": 0.5669768448387795, "grad_norm": 0.8112957896812497, "learning_rate": 5e-06, "loss": 0.5697, "step": 2620 }, { "epoch": 0.5691408785977061, "grad_norm": 0.8292780888969601, "learning_rate": 5e-06, "loss": 0.5631, "step": 2630 }, { "epoch": 0.5713049123566327, "grad_norm": 0.8534427911951067, "learning_rate": 5e-06, "loss": 0.5675, "step": 2640 }, { "epoch": 0.5734689461155594, "grad_norm": 0.7983159182220968, "learning_rate": 5e-06, "loss": 0.5651, "step": 2650 }, { "epoch": 0.575632979874486, "grad_norm": 0.8286911243296953, "learning_rate": 5e-06, "loss": 0.5661, "step": 2660 }, { "epoch": 0.5777970136334126, "grad_norm": 0.7873673354292886, "learning_rate": 5e-06, "loss": 0.5568, "step": 2670 }, { "epoch": 0.5799610473923393, "grad_norm": 0.8643973853008751, "learning_rate": 5e-06, "loss": 0.5529, "step": 2680 }, { "epoch": 0.5821250811512659, "grad_norm": 0.8297018704245965, "learning_rate": 5e-06, "loss": 0.5646, "step": 2690 }, { "epoch": 0.5842891149101926, "grad_norm": 0.8282391086423111, "learning_rate": 5e-06, "loss": 0.5704, "step": 2700 }, { "epoch": 0.5864531486691192, "grad_norm": 0.8185226749510477, "learning_rate": 5e-06, "loss": 0.5732, "step": 2710 }, { "epoch": 0.5886171824280458, "grad_norm": 0.7837221330454157, "learning_rate": 5e-06, "loss": 0.5668, "step": 2720 }, { "epoch": 0.5907812161869725, "grad_norm": 0.7830075768638615, "learning_rate": 5e-06, "loss": 0.5614, "step": 2730 }, { "epoch": 0.5929452499458991, "grad_norm": 0.7669203988650726, "learning_rate": 5e-06, "loss": 0.5632, "step": 2740 }, { "epoch": 0.5951092837048257, "grad_norm": 0.8064484665186933, "learning_rate": 5e-06, "loss": 0.5599, "step": 2750 }, { "epoch": 0.5972733174637525, "grad_norm": 0.8468294496439435, "learning_rate": 5e-06, "loss": 0.5451, "step": 2760 }, { "epoch": 0.599437351222679, "grad_norm": 0.9077279068821958, "learning_rate": 5e-06, "loss": 0.5724, "step": 2770 }, { "epoch": 0.6016013849816058, "grad_norm": 0.7929898760133751, "learning_rate": 5e-06, "loss": 0.5616, "step": 2780 }, { "epoch": 0.6037654187405324, "grad_norm": 0.7971762324043354, "learning_rate": 5e-06, "loss": 0.563, "step": 2790 }, { "epoch": 0.605929452499459, "grad_norm": 0.8288075182953192, "learning_rate": 5e-06, "loss": 0.5579, "step": 2800 }, { "epoch": 0.6080934862583857, "grad_norm": 0.8498140525184206, "learning_rate": 5e-06, "loss": 0.564, "step": 2810 }, { "epoch": 0.6102575200173123, "grad_norm": 0.8102245476480473, "learning_rate": 5e-06, "loss": 0.5609, "step": 2820 }, { "epoch": 0.6124215537762389, "grad_norm": 0.7890358297228374, "learning_rate": 5e-06, "loss": 0.5665, "step": 2830 }, { "epoch": 0.6145855875351656, "grad_norm": 0.7979304789718528, "learning_rate": 5e-06, "loss": 0.5485, "step": 2840 }, { "epoch": 0.6167496212940922, "grad_norm": 0.8003173415042142, "learning_rate": 5e-06, "loss": 0.5598, "step": 2850 }, { "epoch": 0.6189136550530189, "grad_norm": 0.7767086997147948, "learning_rate": 5e-06, "loss": 0.5516, "step": 2860 }, { "epoch": 0.6210776888119455, "grad_norm": 0.8045021838341775, "learning_rate": 5e-06, "loss": 0.5368, "step": 2870 }, { "epoch": 0.6232417225708721, "grad_norm": 0.7498417279355564, "learning_rate": 5e-06, "loss": 0.5518, "step": 2880 }, { "epoch": 0.6254057563297988, "grad_norm": 0.8253099324603884, "learning_rate": 5e-06, "loss": 0.5581, "step": 2890 }, { "epoch": 0.6275697900887254, "grad_norm": 0.8079450289566134, "learning_rate": 5e-06, "loss": 0.5488, "step": 2900 }, { "epoch": 0.629733823847652, "grad_norm": 0.7952980563003963, "learning_rate": 5e-06, "loss": 0.5685, "step": 2910 }, { "epoch": 0.6318978576065787, "grad_norm": 0.7835163645285544, "learning_rate": 5e-06, "loss": 0.5477, "step": 2920 }, { "epoch": 0.6340618913655053, "grad_norm": 0.8048903722703916, "learning_rate": 5e-06, "loss": 0.5596, "step": 2930 }, { "epoch": 0.636225925124432, "grad_norm": 0.81044303853561, "learning_rate": 5e-06, "loss": 0.5598, "step": 2940 }, { "epoch": 0.6383899588833586, "grad_norm": 0.8174138765074157, "learning_rate": 5e-06, "loss": 0.5505, "step": 2950 }, { "epoch": 0.6405539926422852, "grad_norm": 0.8475108197008772, "learning_rate": 5e-06, "loss": 0.5424, "step": 2960 }, { "epoch": 0.6427180264012119, "grad_norm": 0.8042022600558663, "learning_rate": 5e-06, "loss": 0.5539, "step": 2970 }, { "epoch": 0.6448820601601385, "grad_norm": 0.7700287206840692, "learning_rate": 5e-06, "loss": 0.5513, "step": 2980 }, { "epoch": 0.6470460939190651, "grad_norm": 0.8281997779232809, "learning_rate": 5e-06, "loss": 0.5467, "step": 2990 }, { "epoch": 0.6492101276779918, "grad_norm": 0.813584320341026, "learning_rate": 5e-06, "loss": 0.5603, "step": 3000 }, { "epoch": 0.6513741614369184, "grad_norm": 0.8585543401025539, "learning_rate": 5e-06, "loss": 0.5701, "step": 3010 }, { "epoch": 0.6535381951958451, "grad_norm": 0.836920539360064, "learning_rate": 5e-06, "loss": 0.5504, "step": 3020 }, { "epoch": 0.6557022289547717, "grad_norm": 1.0694200561637055, "learning_rate": 5e-06, "loss": 0.5536, "step": 3030 }, { "epoch": 0.6578662627136983, "grad_norm": 0.7556155594948578, "learning_rate": 5e-06, "loss": 0.5421, "step": 3040 }, { "epoch": 0.660030296472625, "grad_norm": 0.8263966978271399, "learning_rate": 5e-06, "loss": 0.542, "step": 3050 }, { "epoch": 0.6621943302315516, "grad_norm": 0.7959474918960274, "learning_rate": 5e-06, "loss": 0.5544, "step": 3060 }, { "epoch": 0.6643583639904782, "grad_norm": 0.782166640538943, "learning_rate": 5e-06, "loss": 0.5581, "step": 3070 }, { "epoch": 0.6665223977494049, "grad_norm": 0.8119704290024918, "learning_rate": 5e-06, "loss": 0.5652, "step": 3080 }, { "epoch": 0.6686864315083315, "grad_norm": 0.8281560870508221, "learning_rate": 5e-06, "loss": 0.5634, "step": 3090 }, { "epoch": 0.6708504652672582, "grad_norm": 0.8095783840417501, "learning_rate": 5e-06, "loss": 0.5601, "step": 3100 }, { "epoch": 0.6730144990261848, "grad_norm": 0.795068179288108, "learning_rate": 5e-06, "loss": 0.5678, "step": 3110 }, { "epoch": 0.6751785327851114, "grad_norm": 0.7792728010855305, "learning_rate": 5e-06, "loss": 0.5607, "step": 3120 }, { "epoch": 0.6773425665440381, "grad_norm": 0.8179451854658257, "learning_rate": 5e-06, "loss": 0.5435, "step": 3130 }, { "epoch": 0.6795066003029647, "grad_norm": 0.7667511572553716, "learning_rate": 5e-06, "loss": 0.5596, "step": 3140 }, { "epoch": 0.6816706340618913, "grad_norm": 0.8029813372366518, "learning_rate": 5e-06, "loss": 0.55, "step": 3150 }, { "epoch": 0.683834667820818, "grad_norm": 0.7987681487999668, "learning_rate": 5e-06, "loss": 0.5451, "step": 3160 }, { "epoch": 0.6859987015797446, "grad_norm": 0.779467033707979, "learning_rate": 5e-06, "loss": 0.543, "step": 3170 }, { "epoch": 0.6881627353386713, "grad_norm": 0.7628766043647324, "learning_rate": 5e-06, "loss": 0.549, "step": 3180 }, { "epoch": 0.6903267690975979, "grad_norm": 0.7850057643186282, "learning_rate": 5e-06, "loss": 0.5668, "step": 3190 }, { "epoch": 0.6924908028565245, "grad_norm": 0.8407107964724864, "learning_rate": 5e-06, "loss": 0.5501, "step": 3200 }, { "epoch": 0.6946548366154512, "grad_norm": 0.7767999916714133, "learning_rate": 5e-06, "loss": 0.543, "step": 3210 }, { "epoch": 0.6968188703743778, "grad_norm": 0.7996258238832009, "learning_rate": 5e-06, "loss": 0.5483, "step": 3220 }, { "epoch": 0.6989829041333044, "grad_norm": 0.8377643899045712, "learning_rate": 5e-06, "loss": 0.5487, "step": 3230 }, { "epoch": 0.7011469378922311, "grad_norm": 0.7514752154537118, "learning_rate": 5e-06, "loss": 0.5597, "step": 3240 }, { "epoch": 0.7033109716511577, "grad_norm": 0.8416197122709502, "learning_rate": 5e-06, "loss": 0.5565, "step": 3250 }, { "epoch": 0.7054750054100843, "grad_norm": 0.7931722916017413, "learning_rate": 5e-06, "loss": 0.5546, "step": 3260 }, { "epoch": 0.707639039169011, "grad_norm": 0.8217577869277688, "learning_rate": 5e-06, "loss": 0.5653, "step": 3270 }, { "epoch": 0.7098030729279377, "grad_norm": 0.7118453738807855, "learning_rate": 5e-06, "loss": 0.5402, "step": 3280 }, { "epoch": 0.7119671066868644, "grad_norm": 0.7697241307191445, "learning_rate": 5e-06, "loss": 0.5589, "step": 3290 }, { "epoch": 0.714131140445791, "grad_norm": 0.802274974087682, "learning_rate": 5e-06, "loss": 0.5551, "step": 3300 }, { "epoch": 0.7162951742047176, "grad_norm": 0.792023853084678, "learning_rate": 5e-06, "loss": 0.5493, "step": 3310 }, { "epoch": 0.7184592079636443, "grad_norm": 0.7544132296053949, "learning_rate": 5e-06, "loss": 0.5393, "step": 3320 }, { "epoch": 0.7206232417225709, "grad_norm": 0.7980914984239874, "learning_rate": 5e-06, "loss": 0.5433, "step": 3330 }, { "epoch": 0.7227872754814975, "grad_norm": 0.784437501269065, "learning_rate": 5e-06, "loss": 0.5568, "step": 3340 }, { "epoch": 0.7249513092404242, "grad_norm": 0.7614522809136514, "learning_rate": 5e-06, "loss": 0.5455, "step": 3350 }, { "epoch": 0.7271153429993508, "grad_norm": 0.7734683988885462, "learning_rate": 5e-06, "loss": 0.5501, "step": 3360 }, { "epoch": 0.7292793767582775, "grad_norm": 0.7866152633301271, "learning_rate": 5e-06, "loss": 0.556, "step": 3370 }, { "epoch": 0.7314434105172041, "grad_norm": 0.7702246648713589, "learning_rate": 5e-06, "loss": 0.5497, "step": 3380 }, { "epoch": 0.7336074442761307, "grad_norm": 0.7600423928776777, "learning_rate": 5e-06, "loss": 0.5505, "step": 3390 }, { "epoch": 0.7357714780350574, "grad_norm": 0.807615131927307, "learning_rate": 5e-06, "loss": 0.5634, "step": 3400 }, { "epoch": 0.737935511793984, "grad_norm": 0.7922705128025508, "learning_rate": 5e-06, "loss": 0.558, "step": 3410 }, { "epoch": 0.7400995455529106, "grad_norm": 0.8339598322147505, "learning_rate": 5e-06, "loss": 0.552, "step": 3420 }, { "epoch": 0.7422635793118373, "grad_norm": 0.7464262500668064, "learning_rate": 5e-06, "loss": 0.5483, "step": 3430 }, { "epoch": 0.7444276130707639, "grad_norm": 0.7741739625171083, "learning_rate": 5e-06, "loss": 0.5573, "step": 3440 }, { "epoch": 0.7465916468296906, "grad_norm": 0.7453424498568156, "learning_rate": 5e-06, "loss": 0.5438, "step": 3450 }, { "epoch": 0.7487556805886172, "grad_norm": 0.754303311641887, "learning_rate": 5e-06, "loss": 0.5548, "step": 3460 }, { "epoch": 0.7509197143475438, "grad_norm": 0.8689734619851907, "learning_rate": 5e-06, "loss": 0.556, "step": 3470 }, { "epoch": 0.7530837481064705, "grad_norm": 0.7773889781541226, "learning_rate": 5e-06, "loss": 0.5494, "step": 3480 }, { "epoch": 0.7552477818653971, "grad_norm": 0.7849782050400904, "learning_rate": 5e-06, "loss": 0.5537, "step": 3490 }, { "epoch": 0.7574118156243237, "grad_norm": 0.7637907513233742, "learning_rate": 5e-06, "loss": 0.5559, "step": 3500 }, { "epoch": 0.7595758493832504, "grad_norm": 0.7679605914857784, "learning_rate": 5e-06, "loss": 0.5576, "step": 3510 }, { "epoch": 0.761739883142177, "grad_norm": 0.828819439384587, "learning_rate": 5e-06, "loss": 0.566, "step": 3520 }, { "epoch": 0.7639039169011037, "grad_norm": 0.8446403352206237, "learning_rate": 5e-06, "loss": 0.5583, "step": 3530 }, { "epoch": 0.7660679506600303, "grad_norm": 0.8075329194131221, "learning_rate": 5e-06, "loss": 0.5592, "step": 3540 }, { "epoch": 0.7682319844189569, "grad_norm": 0.8442605962725928, "learning_rate": 5e-06, "loss": 0.5439, "step": 3550 }, { "epoch": 0.7703960181778836, "grad_norm": 0.8588068255220394, "learning_rate": 5e-06, "loss": 0.5587, "step": 3560 }, { "epoch": 0.7725600519368102, "grad_norm": 0.8524091917859525, "learning_rate": 5e-06, "loss": 0.5517, "step": 3570 }, { "epoch": 0.7747240856957368, "grad_norm": 0.8244971263231196, "learning_rate": 5e-06, "loss": 0.561, "step": 3580 }, { "epoch": 0.7768881194546635, "grad_norm": 0.7627478595291142, "learning_rate": 5e-06, "loss": 0.5641, "step": 3590 }, { "epoch": 0.7790521532135901, "grad_norm": 0.8086037426148818, "learning_rate": 5e-06, "loss": 0.5621, "step": 3600 }, { "epoch": 0.7812161869725168, "grad_norm": 0.791930301424169, "learning_rate": 5e-06, "loss": 0.5471, "step": 3610 }, { "epoch": 0.7833802207314434, "grad_norm": 0.7878241907870572, "learning_rate": 5e-06, "loss": 0.5521, "step": 3620 }, { "epoch": 0.78554425449037, "grad_norm": 0.7830841775537941, "learning_rate": 5e-06, "loss": 0.5467, "step": 3630 }, { "epoch": 0.7877082882492967, "grad_norm": 0.7623641467313392, "learning_rate": 5e-06, "loss": 0.5336, "step": 3640 }, { "epoch": 0.7898723220082233, "grad_norm": 0.8232677684209438, "learning_rate": 5e-06, "loss": 0.5651, "step": 3650 }, { "epoch": 0.7920363557671499, "grad_norm": 0.761362883743628, "learning_rate": 5e-06, "loss": 0.5597, "step": 3660 }, { "epoch": 0.7942003895260766, "grad_norm": 0.8637874664285103, "learning_rate": 5e-06, "loss": 0.5602, "step": 3670 }, { "epoch": 0.7963644232850032, "grad_norm": 0.8267668782394713, "learning_rate": 5e-06, "loss": 0.5561, "step": 3680 }, { "epoch": 0.7985284570439299, "grad_norm": 0.817446940423478, "learning_rate": 5e-06, "loss": 0.5575, "step": 3690 }, { "epoch": 0.8006924908028565, "grad_norm": 0.7818597023986829, "learning_rate": 5e-06, "loss": 0.5604, "step": 3700 }, { "epoch": 0.8028565245617831, "grad_norm": 0.776499232049182, "learning_rate": 5e-06, "loss": 0.5643, "step": 3710 }, { "epoch": 0.8050205583207098, "grad_norm": 0.8364781671818916, "learning_rate": 5e-06, "loss": 0.5438, "step": 3720 }, { "epoch": 0.8071845920796364, "grad_norm": 0.8397874518563361, "learning_rate": 5e-06, "loss": 0.5513, "step": 3730 }, { "epoch": 0.809348625838563, "grad_norm": 0.7969750238399832, "learning_rate": 5e-06, "loss": 0.5402, "step": 3740 }, { "epoch": 0.8115126595974897, "grad_norm": 0.9068956852645959, "learning_rate": 5e-06, "loss": 0.5569, "step": 3750 }, { "epoch": 0.8136766933564163, "grad_norm": 0.7536728350124055, "learning_rate": 5e-06, "loss": 0.5419, "step": 3760 }, { "epoch": 0.815840727115343, "grad_norm": 0.7676812187895282, "learning_rate": 5e-06, "loss": 0.547, "step": 3770 }, { "epoch": 0.8180047608742697, "grad_norm": 0.762833586705676, "learning_rate": 5e-06, "loss": 0.5521, "step": 3780 }, { "epoch": 0.8201687946331963, "grad_norm": 0.756771887345399, "learning_rate": 5e-06, "loss": 0.5556, "step": 3790 }, { "epoch": 0.822332828392123, "grad_norm": 0.8734848583188696, "learning_rate": 5e-06, "loss": 0.5545, "step": 3800 }, { "epoch": 0.8244968621510496, "grad_norm": 0.8004103342026402, "learning_rate": 5e-06, "loss": 0.5513, "step": 3810 }, { "epoch": 0.8266608959099762, "grad_norm": 0.7719483146665875, "learning_rate": 5e-06, "loss": 0.5446, "step": 3820 }, { "epoch": 0.8288249296689029, "grad_norm": 0.7744925484130326, "learning_rate": 5e-06, "loss": 0.5635, "step": 3830 }, { "epoch": 0.8309889634278295, "grad_norm": 0.7562447275301897, "learning_rate": 5e-06, "loss": 0.5444, "step": 3840 }, { "epoch": 0.8331529971867561, "grad_norm": 0.7924604852102323, "learning_rate": 5e-06, "loss": 0.5421, "step": 3850 }, { "epoch": 0.8353170309456828, "grad_norm": 0.8520564532781372, "learning_rate": 5e-06, "loss": 0.5601, "step": 3860 }, { "epoch": 0.8374810647046094, "grad_norm": 0.8746743288593334, "learning_rate": 5e-06, "loss": 0.5509, "step": 3870 }, { "epoch": 0.8396450984635361, "grad_norm": 0.7873244885756802, "learning_rate": 5e-06, "loss": 0.5472, "step": 3880 }, { "epoch": 0.8418091322224627, "grad_norm": 0.8209663148355055, "learning_rate": 5e-06, "loss": 0.5385, "step": 3890 }, { "epoch": 0.8439731659813893, "grad_norm": 0.819629928526103, "learning_rate": 5e-06, "loss": 0.5392, "step": 3900 }, { "epoch": 0.846137199740316, "grad_norm": 0.8584352651397266, "learning_rate": 5e-06, "loss": 0.5554, "step": 3910 }, { "epoch": 0.8483012334992426, "grad_norm": 0.8503960560319187, "learning_rate": 5e-06, "loss": 0.5363, "step": 3920 }, { "epoch": 0.8504652672581692, "grad_norm": 0.8108480185136313, "learning_rate": 5e-06, "loss": 0.5495, "step": 3930 }, { "epoch": 0.8526293010170959, "grad_norm": 0.7373181469886592, "learning_rate": 5e-06, "loss": 0.5734, "step": 3940 }, { "epoch": 0.8547933347760225, "grad_norm": 0.8163364176974437, "learning_rate": 5e-06, "loss": 0.5599, "step": 3950 }, { "epoch": 0.8569573685349492, "grad_norm": 0.7959374117805932, "learning_rate": 5e-06, "loss": 0.5454, "step": 3960 }, { "epoch": 0.8591214022938758, "grad_norm": 0.812330127649017, "learning_rate": 5e-06, "loss": 0.5471, "step": 3970 }, { "epoch": 0.8612854360528024, "grad_norm": 0.7945522211940125, "learning_rate": 5e-06, "loss": 0.5403, "step": 3980 }, { "epoch": 0.8634494698117291, "grad_norm": 0.7696968400441544, "learning_rate": 5e-06, "loss": 0.5423, "step": 3990 }, { "epoch": 0.8656135035706557, "grad_norm": 0.7450513825177912, "learning_rate": 5e-06, "loss": 0.5405, "step": 4000 }, { "epoch": 0.8677775373295823, "grad_norm": 0.8008938478720109, "learning_rate": 5e-06, "loss": 0.5453, "step": 4010 }, { "epoch": 0.869941571088509, "grad_norm": 0.7624217719794756, "learning_rate": 5e-06, "loss": 0.536, "step": 4020 }, { "epoch": 0.8721056048474356, "grad_norm": 0.7876625538590819, "learning_rate": 5e-06, "loss": 0.55, "step": 4030 }, { "epoch": 0.8742696386063623, "grad_norm": 0.7667825013844164, "learning_rate": 5e-06, "loss": 0.5425, "step": 4040 }, { "epoch": 0.8764336723652889, "grad_norm": 0.8391975491875862, "learning_rate": 5e-06, "loss": 0.5619, "step": 4050 }, { "epoch": 0.8785977061242155, "grad_norm": 0.7580142563713078, "learning_rate": 5e-06, "loss": 0.5395, "step": 4060 }, { "epoch": 0.8807617398831422, "grad_norm": 0.873766541325802, "learning_rate": 5e-06, "loss": 0.5498, "step": 4070 }, { "epoch": 0.8829257736420688, "grad_norm": 0.8045714640840378, "learning_rate": 5e-06, "loss": 0.5565, "step": 4080 }, { "epoch": 0.8850898074009954, "grad_norm": 0.7839455981681682, "learning_rate": 5e-06, "loss": 0.5568, "step": 4090 }, { "epoch": 0.8872538411599221, "grad_norm": 0.7507205318651358, "learning_rate": 5e-06, "loss": 0.5446, "step": 4100 }, { "epoch": 0.8894178749188487, "grad_norm": 0.7770875343736735, "learning_rate": 5e-06, "loss": 0.5478, "step": 4110 }, { "epoch": 0.8915819086777754, "grad_norm": 0.739795465311441, "learning_rate": 5e-06, "loss": 0.5488, "step": 4120 }, { "epoch": 0.893745942436702, "grad_norm": 0.8343986949867656, "learning_rate": 5e-06, "loss": 0.5502, "step": 4130 }, { "epoch": 0.8959099761956286, "grad_norm": 0.8072361062581804, "learning_rate": 5e-06, "loss": 0.5321, "step": 4140 }, { "epoch": 0.8980740099545553, "grad_norm": 0.7921719388331905, "learning_rate": 5e-06, "loss": 0.5469, "step": 4150 }, { "epoch": 0.9002380437134819, "grad_norm": 0.7996953628324828, "learning_rate": 5e-06, "loss": 0.5557, "step": 4160 }, { "epoch": 0.9024020774724085, "grad_norm": 0.7272138954178266, "learning_rate": 5e-06, "loss": 0.5472, "step": 4170 }, { "epoch": 0.9045661112313352, "grad_norm": 0.8812123709262756, "learning_rate": 5e-06, "loss": 0.5349, "step": 4180 }, { "epoch": 0.9067301449902618, "grad_norm": 0.8356868824967147, "learning_rate": 5e-06, "loss": 0.5362, "step": 4190 }, { "epoch": 0.9088941787491885, "grad_norm": 0.7972289983410239, "learning_rate": 5e-06, "loss": 0.5493, "step": 4200 }, { "epoch": 0.9110582125081151, "grad_norm": 0.8872937838904918, "learning_rate": 5e-06, "loss": 0.5406, "step": 4210 }, { "epoch": 0.9132222462670417, "grad_norm": 0.7649271319041719, "learning_rate": 5e-06, "loss": 0.5508, "step": 4220 }, { "epoch": 0.9153862800259684, "grad_norm": 0.8237087753008616, "learning_rate": 5e-06, "loss": 0.5448, "step": 4230 }, { "epoch": 0.917550313784895, "grad_norm": 0.7789310787368502, "learning_rate": 5e-06, "loss": 0.5477, "step": 4240 }, { "epoch": 0.9197143475438216, "grad_norm": 0.8216295495680673, "learning_rate": 5e-06, "loss": 0.562, "step": 4250 }, { "epoch": 0.9218783813027484, "grad_norm": 0.8122833746800567, "learning_rate": 5e-06, "loss": 0.5434, "step": 4260 }, { "epoch": 0.924042415061675, "grad_norm": 0.798727376063973, "learning_rate": 5e-06, "loss": 0.556, "step": 4270 }, { "epoch": 0.9262064488206015, "grad_norm": 0.7984668327444104, "learning_rate": 5e-06, "loss": 0.5397, "step": 4280 }, { "epoch": 0.9283704825795283, "grad_norm": 0.7812328766656631, "learning_rate": 5e-06, "loss": 0.5348, "step": 4290 }, { "epoch": 0.9305345163384549, "grad_norm": 0.7893412102379088, "learning_rate": 5e-06, "loss": 0.549, "step": 4300 }, { "epoch": 0.9326985500973816, "grad_norm": 0.7981646202301648, "learning_rate": 5e-06, "loss": 0.526, "step": 4310 }, { "epoch": 0.9348625838563082, "grad_norm": 0.7730443584141122, "learning_rate": 5e-06, "loss": 0.5339, "step": 4320 }, { "epoch": 0.9370266176152348, "grad_norm": 0.844663764946019, "learning_rate": 5e-06, "loss": 0.5458, "step": 4330 }, { "epoch": 0.9391906513741615, "grad_norm": 0.7794979343833512, "learning_rate": 5e-06, "loss": 0.5505, "step": 4340 }, { "epoch": 0.9413546851330881, "grad_norm": 0.7837049408297568, "learning_rate": 5e-06, "loss": 0.5497, "step": 4350 }, { "epoch": 0.9435187188920147, "grad_norm": 0.7854813181023578, "learning_rate": 5e-06, "loss": 0.5563, "step": 4360 }, { "epoch": 0.9456827526509414, "grad_norm": 0.8390893115478139, "learning_rate": 5e-06, "loss": 0.55, "step": 4370 }, { "epoch": 0.947846786409868, "grad_norm": 0.7933589395245781, "learning_rate": 5e-06, "loss": 0.5375, "step": 4380 }, { "epoch": 0.9500108201687947, "grad_norm": 0.7553926376395359, "learning_rate": 5e-06, "loss": 0.5527, "step": 4390 }, { "epoch": 0.9521748539277213, "grad_norm": 0.745261362871144, "learning_rate": 5e-06, "loss": 0.539, "step": 4400 }, { "epoch": 0.9543388876866479, "grad_norm": 0.8391407287157648, "learning_rate": 5e-06, "loss": 0.5523, "step": 4410 }, { "epoch": 0.9565029214455746, "grad_norm": 0.8243378126848461, "learning_rate": 5e-06, "loss": 0.5445, "step": 4420 }, { "epoch": 0.9586669552045012, "grad_norm": 0.8075823666456592, "learning_rate": 5e-06, "loss": 0.5377, "step": 4430 }, { "epoch": 0.9608309889634278, "grad_norm": 0.813436060772594, "learning_rate": 5e-06, "loss": 0.5519, "step": 4440 }, { "epoch": 0.9629950227223545, "grad_norm": 0.7806826995434077, "learning_rate": 5e-06, "loss": 0.5454, "step": 4450 }, { "epoch": 0.9651590564812811, "grad_norm": 0.7670805996089919, "learning_rate": 5e-06, "loss": 0.537, "step": 4460 }, { "epoch": 0.9673230902402078, "grad_norm": 0.82069231371166, "learning_rate": 5e-06, "loss": 0.5457, "step": 4470 }, { "epoch": 0.9694871239991344, "grad_norm": 0.8237227003500424, "learning_rate": 5e-06, "loss": 0.556, "step": 4480 }, { "epoch": 0.971651157758061, "grad_norm": 0.8426211166204151, "learning_rate": 5e-06, "loss": 0.5497, "step": 4490 }, { "epoch": 0.9738151915169877, "grad_norm": 0.7835202417017654, "learning_rate": 5e-06, "loss": 0.5303, "step": 4500 }, { "epoch": 0.9759792252759143, "grad_norm": 0.8392317298621986, "learning_rate": 5e-06, "loss": 0.5464, "step": 4510 }, { "epoch": 0.9781432590348409, "grad_norm": 0.7465598042444795, "learning_rate": 5e-06, "loss": 0.5419, "step": 4520 }, { "epoch": 0.9803072927937676, "grad_norm": 0.7544743383197097, "learning_rate": 5e-06, "loss": 0.5378, "step": 4530 }, { "epoch": 0.9824713265526942, "grad_norm": 0.764919840823264, "learning_rate": 5e-06, "loss": 0.5329, "step": 4540 }, { "epoch": 0.9846353603116209, "grad_norm": 0.7432743151815665, "learning_rate": 5e-06, "loss": 0.5488, "step": 4550 }, { "epoch": 0.9867993940705475, "grad_norm": 0.8962154593923901, "learning_rate": 5e-06, "loss": 0.544, "step": 4560 }, { "epoch": 0.9889634278294741, "grad_norm": 0.8515589470297252, "learning_rate": 5e-06, "loss": 0.5438, "step": 4570 }, { "epoch": 0.9911274615884008, "grad_norm": 0.7539916472553879, "learning_rate": 5e-06, "loss": 0.5368, "step": 4580 }, { "epoch": 0.9932914953473274, "grad_norm": 0.7966250502268061, "learning_rate": 5e-06, "loss": 0.5516, "step": 4590 }, { "epoch": 0.995455529106254, "grad_norm": 0.796275990986384, "learning_rate": 5e-06, "loss": 0.535, "step": 4600 }, { "epoch": 0.9976195628651807, "grad_norm": 0.7572730283592524, "learning_rate": 5e-06, "loss": 0.5399, "step": 4610 }, { "epoch": 0.9997835966241073, "grad_norm": 0.7866241644050972, "learning_rate": 5e-06, "loss": 0.5396, "step": 4620 }, { "epoch": 1.0, "eval_loss": 0.5429847836494446, "eval_runtime": 588.4696, "eval_samples_per_second": 26.448, "eval_steps_per_second": 0.415, "step": 4621 }, { "epoch": 1.001947630383034, "grad_norm": 0.8198402400349776, "learning_rate": 5e-06, "loss": 0.4682, "step": 4630 }, { "epoch": 1.0041116641419605, "grad_norm": 0.7801283113390183, "learning_rate": 5e-06, "loss": 0.4779, "step": 4640 }, { "epoch": 1.0062756979008873, "grad_norm": 0.7609745957390824, "learning_rate": 5e-06, "loss": 0.4627, "step": 4650 }, { "epoch": 1.008439731659814, "grad_norm": 0.7821883017630472, "learning_rate": 5e-06, "loss": 0.4713, "step": 4660 }, { "epoch": 1.0106037654187405, "grad_norm": 0.7260403031295831, "learning_rate": 5e-06, "loss": 0.4684, "step": 4670 }, { "epoch": 1.0127677991776671, "grad_norm": 0.7812573688271381, "learning_rate": 5e-06, "loss": 0.4736, "step": 4680 }, { "epoch": 1.0149318329365937, "grad_norm": 0.7822921675866278, "learning_rate": 5e-06, "loss": 0.4548, "step": 4690 }, { "epoch": 1.0170958666955205, "grad_norm": 0.7694090971084214, "learning_rate": 5e-06, "loss": 0.46, "step": 4700 }, { "epoch": 1.0192599004544471, "grad_norm": 0.7459628030829183, "learning_rate": 5e-06, "loss": 0.464, "step": 4710 }, { "epoch": 1.0214239342133737, "grad_norm": 0.715134160634303, "learning_rate": 5e-06, "loss": 0.4542, "step": 4720 }, { "epoch": 1.0235879679723003, "grad_norm": 0.7414824535187435, "learning_rate": 5e-06, "loss": 0.465, "step": 4730 }, { "epoch": 1.025752001731227, "grad_norm": 0.7453860186675011, "learning_rate": 5e-06, "loss": 0.4605, "step": 4740 }, { "epoch": 1.0279160354901538, "grad_norm": 0.7718909976941406, "learning_rate": 5e-06, "loss": 0.4632, "step": 4750 }, { "epoch": 1.0300800692490804, "grad_norm": 0.7645671037278959, "learning_rate": 5e-06, "loss": 0.4736, "step": 4760 }, { "epoch": 1.032244103008007, "grad_norm": 0.7718550788244495, "learning_rate": 5e-06, "loss": 0.4605, "step": 4770 }, { "epoch": 1.0344081367669336, "grad_norm": 0.7964499026649204, "learning_rate": 5e-06, "loss": 0.4833, "step": 4780 }, { "epoch": 1.0365721705258601, "grad_norm": 0.834083497700352, "learning_rate": 5e-06, "loss": 0.4766, "step": 4790 }, { "epoch": 1.0387362042847867, "grad_norm": 0.7319738782041046, "learning_rate": 5e-06, "loss": 0.4755, "step": 4800 }, { "epoch": 1.0409002380437136, "grad_norm": 0.7462008777592158, "learning_rate": 5e-06, "loss": 0.4688, "step": 4810 }, { "epoch": 1.0430642718026402, "grad_norm": 0.7694025080042929, "learning_rate": 5e-06, "loss": 0.4731, "step": 4820 }, { "epoch": 1.0452283055615668, "grad_norm": 0.7712938874733282, "learning_rate": 5e-06, "loss": 0.4783, "step": 4830 }, { "epoch": 1.0473923393204934, "grad_norm": 0.6854206720325331, "learning_rate": 5e-06, "loss": 0.4496, "step": 4840 }, { "epoch": 1.04955637307942, "grad_norm": 0.7296190822576524, "learning_rate": 5e-06, "loss": 0.4466, "step": 4850 }, { "epoch": 1.0517204068383468, "grad_norm": 0.8044988975714437, "learning_rate": 5e-06, "loss": 0.4763, "step": 4860 }, { "epoch": 1.0538844405972734, "grad_norm": 0.7550484269456027, "learning_rate": 5e-06, "loss": 0.4631, "step": 4870 }, { "epoch": 1.0560484743562, "grad_norm": 0.7716087062170465, "learning_rate": 5e-06, "loss": 0.4638, "step": 4880 }, { "epoch": 1.0582125081151266, "grad_norm": 0.7069074185268656, "learning_rate": 5e-06, "loss": 0.4592, "step": 4890 }, { "epoch": 1.0603765418740532, "grad_norm": 0.7406983646764749, "learning_rate": 5e-06, "loss": 0.4661, "step": 4900 }, { "epoch": 1.0625405756329798, "grad_norm": 0.8513739665965723, "learning_rate": 5e-06, "loss": 0.4779, "step": 4910 }, { "epoch": 1.0647046093919066, "grad_norm": 0.7741744083175378, "learning_rate": 5e-06, "loss": 0.4629, "step": 4920 }, { "epoch": 1.0668686431508332, "grad_norm": 0.841842230732826, "learning_rate": 5e-06, "loss": 0.4559, "step": 4930 }, { "epoch": 1.0690326769097598, "grad_norm": 0.7561799157279342, "learning_rate": 5e-06, "loss": 0.4661, "step": 4940 }, { "epoch": 1.0711967106686864, "grad_norm": 0.7452446191339013, "learning_rate": 5e-06, "loss": 0.4796, "step": 4950 }, { "epoch": 1.073360744427613, "grad_norm": 0.7219452780323502, "learning_rate": 5e-06, "loss": 0.4693, "step": 4960 }, { "epoch": 1.0755247781865398, "grad_norm": 0.8011050940745716, "learning_rate": 5e-06, "loss": 0.4625, "step": 4970 }, { "epoch": 1.0776888119454664, "grad_norm": 0.794254062662002, "learning_rate": 5e-06, "loss": 0.4734, "step": 4980 }, { "epoch": 1.079852845704393, "grad_norm": 0.7841880358208203, "learning_rate": 5e-06, "loss": 0.4735, "step": 4990 }, { "epoch": 1.0820168794633196, "grad_norm": 0.7541874362266996, "learning_rate": 5e-06, "loss": 0.4719, "step": 5000 }, { "epoch": 1.0841809132222462, "grad_norm": 0.7832163238689167, "learning_rate": 5e-06, "loss": 0.4781, "step": 5010 }, { "epoch": 1.086344946981173, "grad_norm": 0.7612558574099242, "learning_rate": 5e-06, "loss": 0.4677, "step": 5020 }, { "epoch": 1.0885089807400996, "grad_norm": 0.7457279545654795, "learning_rate": 5e-06, "loss": 0.4568, "step": 5030 }, { "epoch": 1.0906730144990262, "grad_norm": 0.7802158241592927, "learning_rate": 5e-06, "loss": 0.4695, "step": 5040 }, { "epoch": 1.0928370482579528, "grad_norm": 0.7523296252008893, "learning_rate": 5e-06, "loss": 0.459, "step": 5050 }, { "epoch": 1.0950010820168794, "grad_norm": 0.7317362216143073, "learning_rate": 5e-06, "loss": 0.4594, "step": 5060 }, { "epoch": 1.097165115775806, "grad_norm": 0.8089179922502759, "learning_rate": 5e-06, "loss": 0.4701, "step": 5070 }, { "epoch": 1.0993291495347328, "grad_norm": 0.7971117700530117, "learning_rate": 5e-06, "loss": 0.4647, "step": 5080 }, { "epoch": 1.1014931832936594, "grad_norm": 0.7797006135536613, "learning_rate": 5e-06, "loss": 0.4746, "step": 5090 }, { "epoch": 1.103657217052586, "grad_norm": 0.8125978920365499, "learning_rate": 5e-06, "loss": 0.4679, "step": 5100 }, { "epoch": 1.1058212508115126, "grad_norm": 0.7785309067797123, "learning_rate": 5e-06, "loss": 0.4621, "step": 5110 }, { "epoch": 1.1079852845704392, "grad_norm": 0.8022112834916565, "learning_rate": 5e-06, "loss": 0.4547, "step": 5120 }, { "epoch": 1.110149318329366, "grad_norm": 0.7644085293828123, "learning_rate": 5e-06, "loss": 0.4694, "step": 5130 }, { "epoch": 1.1123133520882926, "grad_norm": 0.7337804491792407, "learning_rate": 5e-06, "loss": 0.4594, "step": 5140 }, { "epoch": 1.1144773858472192, "grad_norm": 0.7430781593703488, "learning_rate": 5e-06, "loss": 0.4655, "step": 5150 }, { "epoch": 1.1166414196061458, "grad_norm": 0.7666150250001785, "learning_rate": 5e-06, "loss": 0.4601, "step": 5160 }, { "epoch": 1.1188054533650724, "grad_norm": 0.7926818454734849, "learning_rate": 5e-06, "loss": 0.475, "step": 5170 }, { "epoch": 1.120969487123999, "grad_norm": 0.7761288586447196, "learning_rate": 5e-06, "loss": 0.464, "step": 5180 }, { "epoch": 1.1231335208829258, "grad_norm": 0.7715150614179276, "learning_rate": 5e-06, "loss": 0.4683, "step": 5190 }, { "epoch": 1.1252975546418524, "grad_norm": 0.768487598686053, "learning_rate": 5e-06, "loss": 0.4597, "step": 5200 }, { "epoch": 1.127461588400779, "grad_norm": 0.7780154395955231, "learning_rate": 5e-06, "loss": 0.4815, "step": 5210 }, { "epoch": 1.1296256221597056, "grad_norm": 0.7754600935683129, "learning_rate": 5e-06, "loss": 0.4691, "step": 5220 }, { "epoch": 1.1317896559186322, "grad_norm": 0.7168967688429924, "learning_rate": 5e-06, "loss": 0.4576, "step": 5230 }, { "epoch": 1.133953689677559, "grad_norm": 0.7887975954709602, "learning_rate": 5e-06, "loss": 0.4656, "step": 5240 }, { "epoch": 1.1361177234364856, "grad_norm": 0.7711181920453322, "learning_rate": 5e-06, "loss": 0.4638, "step": 5250 }, { "epoch": 1.1382817571954122, "grad_norm": 0.8040928973948708, "learning_rate": 5e-06, "loss": 0.4629, "step": 5260 }, { "epoch": 1.1404457909543388, "grad_norm": 0.7480038005652744, "learning_rate": 5e-06, "loss": 0.4654, "step": 5270 }, { "epoch": 1.1426098247132654, "grad_norm": 0.7772151707383336, "learning_rate": 5e-06, "loss": 0.4728, "step": 5280 }, { "epoch": 1.1447738584721923, "grad_norm": 0.7915680198334887, "learning_rate": 5e-06, "loss": 0.4576, "step": 5290 }, { "epoch": 1.1469378922311189, "grad_norm": 0.7591374087955354, "learning_rate": 5e-06, "loss": 0.4707, "step": 5300 }, { "epoch": 1.1491019259900455, "grad_norm": 0.7805726783738297, "learning_rate": 5e-06, "loss": 0.4656, "step": 5310 }, { "epoch": 1.151265959748972, "grad_norm": 0.7721078057930081, "learning_rate": 5e-06, "loss": 0.4711, "step": 5320 }, { "epoch": 1.1534299935078987, "grad_norm": 0.7400720348849441, "learning_rate": 5e-06, "loss": 0.4702, "step": 5330 }, { "epoch": 1.1555940272668255, "grad_norm": 0.7435323729236517, "learning_rate": 5e-06, "loss": 0.4778, "step": 5340 }, { "epoch": 1.157758061025752, "grad_norm": 0.7911198407125987, "learning_rate": 5e-06, "loss": 0.4665, "step": 5350 }, { "epoch": 1.1599220947846787, "grad_norm": 0.7610463033369455, "learning_rate": 5e-06, "loss": 0.4605, "step": 5360 }, { "epoch": 1.1620861285436053, "grad_norm": 0.7711693447277651, "learning_rate": 5e-06, "loss": 0.4726, "step": 5370 }, { "epoch": 1.1642501623025319, "grad_norm": 0.7859946817663731, "learning_rate": 5e-06, "loss": 0.4617, "step": 5380 }, { "epoch": 1.1664141960614585, "grad_norm": 0.7890418189022593, "learning_rate": 5e-06, "loss": 0.4713, "step": 5390 }, { "epoch": 1.1685782298203853, "grad_norm": 0.7220427522209382, "learning_rate": 5e-06, "loss": 0.4572, "step": 5400 }, { "epoch": 1.1707422635793119, "grad_norm": 0.7713515210815366, "learning_rate": 5e-06, "loss": 0.4769, "step": 5410 }, { "epoch": 1.1729062973382385, "grad_norm": 0.77210689086503, "learning_rate": 5e-06, "loss": 0.4796, "step": 5420 }, { "epoch": 1.175070331097165, "grad_norm": 0.741038220204444, "learning_rate": 5e-06, "loss": 0.4574, "step": 5430 }, { "epoch": 1.1772343648560917, "grad_norm": 0.7831344931379454, "learning_rate": 5e-06, "loss": 0.4566, "step": 5440 }, { "epoch": 1.1793983986150183, "grad_norm": 0.7345331592568521, "learning_rate": 5e-06, "loss": 0.4626, "step": 5450 }, { "epoch": 1.181562432373945, "grad_norm": 0.7820999080210016, "learning_rate": 5e-06, "loss": 0.4578, "step": 5460 }, { "epoch": 1.1837264661328717, "grad_norm": 0.7853647666817763, "learning_rate": 5e-06, "loss": 0.4706, "step": 5470 }, { "epoch": 1.1858904998917983, "grad_norm": 0.7893513383734171, "learning_rate": 5e-06, "loss": 0.4645, "step": 5480 }, { "epoch": 1.1880545336507249, "grad_norm": 0.7936708468940181, "learning_rate": 5e-06, "loss": 0.4609, "step": 5490 }, { "epoch": 1.1902185674096515, "grad_norm": 0.7199140376687585, "learning_rate": 5e-06, "loss": 0.458, "step": 5500 }, { "epoch": 1.1923826011685783, "grad_norm": 0.7381096208206134, "learning_rate": 5e-06, "loss": 0.4659, "step": 5510 }, { "epoch": 1.194546634927505, "grad_norm": 0.8083844653278189, "learning_rate": 5e-06, "loss": 0.4683, "step": 5520 }, { "epoch": 1.1967106686864315, "grad_norm": 0.7791767856954264, "learning_rate": 5e-06, "loss": 0.4727, "step": 5530 }, { "epoch": 1.198874702445358, "grad_norm": 0.732693234829825, "learning_rate": 5e-06, "loss": 0.4707, "step": 5540 }, { "epoch": 1.2010387362042847, "grad_norm": 0.7467511842772838, "learning_rate": 5e-06, "loss": 0.4573, "step": 5550 }, { "epoch": 1.2032027699632115, "grad_norm": 0.7816698329179276, "learning_rate": 5e-06, "loss": 0.4775, "step": 5560 }, { "epoch": 1.2053668037221381, "grad_norm": 0.7443727351984066, "learning_rate": 5e-06, "loss": 0.4669, "step": 5570 }, { "epoch": 1.2075308374810647, "grad_norm": 0.7425842695291263, "learning_rate": 5e-06, "loss": 0.4583, "step": 5580 }, { "epoch": 1.2096948712399913, "grad_norm": 0.7240789385956823, "learning_rate": 5e-06, "loss": 0.4576, "step": 5590 }, { "epoch": 1.211858904998918, "grad_norm": 0.8030670523927637, "learning_rate": 5e-06, "loss": 0.4684, "step": 5600 }, { "epoch": 1.2140229387578447, "grad_norm": 0.780934768631753, "learning_rate": 5e-06, "loss": 0.4592, "step": 5610 }, { "epoch": 1.2161869725167713, "grad_norm": 0.7842677396611177, "learning_rate": 5e-06, "loss": 0.4641, "step": 5620 }, { "epoch": 1.218351006275698, "grad_norm": 0.8130606689249475, "learning_rate": 5e-06, "loss": 0.466, "step": 5630 }, { "epoch": 1.2205150400346245, "grad_norm": 0.7815641772761741, "learning_rate": 5e-06, "loss": 0.4741, "step": 5640 }, { "epoch": 1.2226790737935511, "grad_norm": 0.7490386272049809, "learning_rate": 5e-06, "loss": 0.4645, "step": 5650 }, { "epoch": 1.224843107552478, "grad_norm": 0.8212925702802806, "learning_rate": 5e-06, "loss": 0.4832, "step": 5660 }, { "epoch": 1.2270071413114045, "grad_norm": 0.7630549664820707, "learning_rate": 5e-06, "loss": 0.4604, "step": 5670 }, { "epoch": 1.2291711750703311, "grad_norm": 0.8239583313017402, "learning_rate": 5e-06, "loss": 0.4708, "step": 5680 }, { "epoch": 1.2313352088292577, "grad_norm": 0.760630893549288, "learning_rate": 5e-06, "loss": 0.4694, "step": 5690 }, { "epoch": 1.2334992425881843, "grad_norm": 0.7748580962778236, "learning_rate": 5e-06, "loss": 0.4676, "step": 5700 }, { "epoch": 1.235663276347111, "grad_norm": 0.7601867125563785, "learning_rate": 5e-06, "loss": 0.4643, "step": 5710 }, { "epoch": 1.2378273101060377, "grad_norm": 0.7954825352678104, "learning_rate": 5e-06, "loss": 0.4609, "step": 5720 }, { "epoch": 1.2399913438649643, "grad_norm": 0.7751059458000068, "learning_rate": 5e-06, "loss": 0.459, "step": 5730 }, { "epoch": 1.242155377623891, "grad_norm": 0.7718457396804564, "learning_rate": 5e-06, "loss": 0.4626, "step": 5740 }, { "epoch": 1.2443194113828175, "grad_norm": 0.7754578421273243, "learning_rate": 5e-06, "loss": 0.4638, "step": 5750 }, { "epoch": 1.2464834451417441, "grad_norm": 0.7548864065745818, "learning_rate": 5e-06, "loss": 0.4584, "step": 5760 }, { "epoch": 1.2486474789006707, "grad_norm": 0.771889502196166, "learning_rate": 5e-06, "loss": 0.4717, "step": 5770 }, { "epoch": 1.2508115126595976, "grad_norm": 0.7651949593744706, "learning_rate": 5e-06, "loss": 0.4581, "step": 5780 }, { "epoch": 1.2529755464185242, "grad_norm": 0.7561690262824943, "learning_rate": 5e-06, "loss": 0.4611, "step": 5790 }, { "epoch": 1.2551395801774508, "grad_norm": 0.800376700365697, "learning_rate": 5e-06, "loss": 0.4522, "step": 5800 }, { "epoch": 1.2573036139363774, "grad_norm": 0.7501757361110241, "learning_rate": 5e-06, "loss": 0.4702, "step": 5810 }, { "epoch": 1.259467647695304, "grad_norm": 0.7926927152467118, "learning_rate": 5e-06, "loss": 0.4594, "step": 5820 }, { "epoch": 1.2616316814542308, "grad_norm": 0.7757167167590527, "learning_rate": 5e-06, "loss": 0.4698, "step": 5830 }, { "epoch": 1.2637957152131574, "grad_norm": 0.7543427971274005, "learning_rate": 5e-06, "loss": 0.4609, "step": 5840 }, { "epoch": 1.265959748972084, "grad_norm": 0.7029065517148244, "learning_rate": 5e-06, "loss": 0.4576, "step": 5850 }, { "epoch": 1.2681237827310106, "grad_norm": 0.7949904996002711, "learning_rate": 5e-06, "loss": 0.4728, "step": 5860 }, { "epoch": 1.2702878164899372, "grad_norm": 0.8061316601164296, "learning_rate": 5e-06, "loss": 0.4683, "step": 5870 }, { "epoch": 1.272451850248864, "grad_norm": 0.7679124071082001, "learning_rate": 5e-06, "loss": 0.4636, "step": 5880 }, { "epoch": 1.2746158840077906, "grad_norm": 0.7268154855749079, "learning_rate": 5e-06, "loss": 0.4613, "step": 5890 }, { "epoch": 1.2767799177667172, "grad_norm": 0.7581450065958767, "learning_rate": 5e-06, "loss": 0.4846, "step": 5900 }, { "epoch": 1.2789439515256438, "grad_norm": 0.7728758954937528, "learning_rate": 5e-06, "loss": 0.4679, "step": 5910 }, { "epoch": 1.2811079852845704, "grad_norm": 0.7757132304247395, "learning_rate": 5e-06, "loss": 0.4735, "step": 5920 }, { "epoch": 1.2832720190434972, "grad_norm": 0.7385618172633933, "learning_rate": 5e-06, "loss": 0.449, "step": 5930 }, { "epoch": 1.2854360528024238, "grad_norm": 0.7462092766571883, "learning_rate": 5e-06, "loss": 0.4607, "step": 5940 }, { "epoch": 1.2876000865613504, "grad_norm": 0.7571240627556616, "learning_rate": 5e-06, "loss": 0.4617, "step": 5950 }, { "epoch": 1.289764120320277, "grad_norm": 0.788671204299716, "learning_rate": 5e-06, "loss": 0.4663, "step": 5960 }, { "epoch": 1.2919281540792036, "grad_norm": 0.7494295917052568, "learning_rate": 5e-06, "loss": 0.4653, "step": 5970 }, { "epoch": 1.2940921878381304, "grad_norm": 0.7384360351296172, "learning_rate": 5e-06, "loss": 0.465, "step": 5980 }, { "epoch": 1.2962562215970568, "grad_norm": 0.7352800951816788, "learning_rate": 5e-06, "loss": 0.4826, "step": 5990 }, { "epoch": 1.2984202553559836, "grad_norm": 0.7602237811687234, "learning_rate": 5e-06, "loss": 0.4567, "step": 6000 }, { "epoch": 1.3005842891149102, "grad_norm": 0.7530207047121187, "learning_rate": 5e-06, "loss": 0.4684, "step": 6010 }, { "epoch": 1.3027483228738368, "grad_norm": 0.7356178071902953, "learning_rate": 5e-06, "loss": 0.4651, "step": 6020 }, { "epoch": 1.3049123566327634, "grad_norm": 0.7797947935109668, "learning_rate": 5e-06, "loss": 0.466, "step": 6030 }, { "epoch": 1.30707639039169, "grad_norm": 0.7574370101991291, "learning_rate": 5e-06, "loss": 0.4711, "step": 6040 }, { "epoch": 1.3092404241506168, "grad_norm": 0.7688758966039535, "learning_rate": 5e-06, "loss": 0.4583, "step": 6050 }, { "epoch": 1.3114044579095434, "grad_norm": 0.7745547556636165, "learning_rate": 5e-06, "loss": 0.4761, "step": 6060 }, { "epoch": 1.31356849166847, "grad_norm": 0.7728585651214986, "learning_rate": 5e-06, "loss": 0.4749, "step": 6070 }, { "epoch": 1.3157325254273966, "grad_norm": 0.8170846148828539, "learning_rate": 5e-06, "loss": 0.4657, "step": 6080 }, { "epoch": 1.3178965591863232, "grad_norm": 0.7264085797482112, "learning_rate": 5e-06, "loss": 0.4582, "step": 6090 }, { "epoch": 1.32006059294525, "grad_norm": 0.719801823367032, "learning_rate": 5e-06, "loss": 0.4538, "step": 6100 }, { "epoch": 1.3222246267041766, "grad_norm": 0.7607932021425476, "learning_rate": 5e-06, "loss": 0.4715, "step": 6110 }, { "epoch": 1.3243886604631032, "grad_norm": 0.775171785274517, "learning_rate": 5e-06, "loss": 0.4647, "step": 6120 }, { "epoch": 1.3265526942220298, "grad_norm": 0.7619426266950845, "learning_rate": 5e-06, "loss": 0.472, "step": 6130 }, { "epoch": 1.3287167279809564, "grad_norm": 0.7591540829634035, "learning_rate": 5e-06, "loss": 0.4571, "step": 6140 }, { "epoch": 1.3308807617398832, "grad_norm": 0.7905338255727203, "learning_rate": 5e-06, "loss": 0.4732, "step": 6150 }, { "epoch": 1.3330447954988098, "grad_norm": 0.8612647077572982, "learning_rate": 5e-06, "loss": 0.4702, "step": 6160 }, { "epoch": 1.3352088292577364, "grad_norm": 0.7786711376086402, "learning_rate": 5e-06, "loss": 0.4717, "step": 6170 }, { "epoch": 1.337372863016663, "grad_norm": 0.7846310010615345, "learning_rate": 5e-06, "loss": 0.4625, "step": 6180 }, { "epoch": 1.3395368967755896, "grad_norm": 0.8811606228034371, "learning_rate": 5e-06, "loss": 0.4702, "step": 6190 }, { "epoch": 1.3417009305345164, "grad_norm": 0.8146235287621797, "learning_rate": 5e-06, "loss": 0.4596, "step": 6200 }, { "epoch": 1.343864964293443, "grad_norm": 0.7533772816124655, "learning_rate": 5e-06, "loss": 0.4582, "step": 6210 }, { "epoch": 1.3460289980523696, "grad_norm": 0.7830512487247953, "learning_rate": 5e-06, "loss": 0.4736, "step": 6220 }, { "epoch": 1.3481930318112962, "grad_norm": 0.7797798896935799, "learning_rate": 5e-06, "loss": 0.4681, "step": 6230 }, { "epoch": 1.3503570655702228, "grad_norm": 0.7932078006037683, "learning_rate": 5e-06, "loss": 0.4712, "step": 6240 }, { "epoch": 1.3525210993291497, "grad_norm": 0.8036931232108603, "learning_rate": 5e-06, "loss": 0.4672, "step": 6250 }, { "epoch": 1.3546851330880763, "grad_norm": 0.7368268217791855, "learning_rate": 5e-06, "loss": 0.4695, "step": 6260 }, { "epoch": 1.3568491668470029, "grad_norm": 0.7359272221589768, "learning_rate": 5e-06, "loss": 0.466, "step": 6270 }, { "epoch": 1.3590132006059295, "grad_norm": 0.8200622976978171, "learning_rate": 5e-06, "loss": 0.4723, "step": 6280 }, { "epoch": 1.361177234364856, "grad_norm": 0.7939699615671225, "learning_rate": 5e-06, "loss": 0.468, "step": 6290 }, { "epoch": 1.3633412681237829, "grad_norm": 0.800957707217982, "learning_rate": 5e-06, "loss": 0.4529, "step": 6300 }, { "epoch": 1.3655053018827092, "grad_norm": 0.80512867222786, "learning_rate": 5e-06, "loss": 0.4659, "step": 6310 }, { "epoch": 1.367669335641636, "grad_norm": 0.7360035804063453, "learning_rate": 5e-06, "loss": 0.4635, "step": 6320 }, { "epoch": 1.3698333694005627, "grad_norm": 0.7716321408040111, "learning_rate": 5e-06, "loss": 0.454, "step": 6330 }, { "epoch": 1.3719974031594893, "grad_norm": 0.776027207843901, "learning_rate": 5e-06, "loss": 0.4613, "step": 6340 }, { "epoch": 1.3741614369184159, "grad_norm": 0.7658728763444741, "learning_rate": 5e-06, "loss": 0.4713, "step": 6350 }, { "epoch": 1.3763254706773425, "grad_norm": 0.7884808842340179, "learning_rate": 5e-06, "loss": 0.4746, "step": 6360 }, { "epoch": 1.3784895044362693, "grad_norm": 0.7775593156734237, "learning_rate": 5e-06, "loss": 0.4677, "step": 6370 }, { "epoch": 1.3806535381951959, "grad_norm": 0.7949890338718408, "learning_rate": 5e-06, "loss": 0.4635, "step": 6380 }, { "epoch": 1.3828175719541225, "grad_norm": 0.7828096111334193, "learning_rate": 5e-06, "loss": 0.462, "step": 6390 }, { "epoch": 1.384981605713049, "grad_norm": 0.7814745763059232, "learning_rate": 5e-06, "loss": 0.4718, "step": 6400 }, { "epoch": 1.3871456394719757, "grad_norm": 0.854764057136108, "learning_rate": 5e-06, "loss": 0.4734, "step": 6410 }, { "epoch": 1.3893096732309025, "grad_norm": 0.8266509384735298, "learning_rate": 5e-06, "loss": 0.4698, "step": 6420 }, { "epoch": 1.391473706989829, "grad_norm": 0.7213913037415728, "learning_rate": 5e-06, "loss": 0.4487, "step": 6430 }, { "epoch": 1.3936377407487557, "grad_norm": 0.7231480436825534, "learning_rate": 5e-06, "loss": 0.459, "step": 6440 }, { "epoch": 1.3958017745076823, "grad_norm": 0.7509805363624873, "learning_rate": 5e-06, "loss": 0.4621, "step": 6450 }, { "epoch": 1.3979658082666089, "grad_norm": 0.737834748034084, "learning_rate": 5e-06, "loss": 0.4695, "step": 6460 }, { "epoch": 1.4001298420255357, "grad_norm": 0.7620010469419662, "learning_rate": 5e-06, "loss": 0.4623, "step": 6470 }, { "epoch": 1.4022938757844623, "grad_norm": 0.7323866973353454, "learning_rate": 5e-06, "loss": 0.4765, "step": 6480 }, { "epoch": 1.404457909543389, "grad_norm": 0.7648536342318661, "learning_rate": 5e-06, "loss": 0.4644, "step": 6490 }, { "epoch": 1.4066219433023155, "grad_norm": 0.7674198954299694, "learning_rate": 5e-06, "loss": 0.4642, "step": 6500 }, { "epoch": 1.408785977061242, "grad_norm": 0.746120158345663, "learning_rate": 5e-06, "loss": 0.4597, "step": 6510 }, { "epoch": 1.410950010820169, "grad_norm": 0.7650565264216188, "learning_rate": 5e-06, "loss": 0.4594, "step": 6520 }, { "epoch": 1.4131140445790955, "grad_norm": 0.7369052966841463, "learning_rate": 5e-06, "loss": 0.4599, "step": 6530 }, { "epoch": 1.415278078338022, "grad_norm": 0.7661658293815887, "learning_rate": 5e-06, "loss": 0.4788, "step": 6540 }, { "epoch": 1.4174421120969487, "grad_norm": 0.8245544955088241, "learning_rate": 5e-06, "loss": 0.4759, "step": 6550 }, { "epoch": 1.4196061458558753, "grad_norm": 0.7739803616267428, "learning_rate": 5e-06, "loss": 0.4634, "step": 6560 }, { "epoch": 1.4217701796148021, "grad_norm": 0.7539199622313375, "learning_rate": 5e-06, "loss": 0.4675, "step": 6570 }, { "epoch": 1.4239342133737285, "grad_norm": 0.7650706870481234, "learning_rate": 5e-06, "loss": 0.4635, "step": 6580 }, { "epoch": 1.4260982471326553, "grad_norm": 0.7303061857380676, "learning_rate": 5e-06, "loss": 0.4528, "step": 6590 }, { "epoch": 1.428262280891582, "grad_norm": 0.7448893505433495, "learning_rate": 5e-06, "loss": 0.4664, "step": 6600 }, { "epoch": 1.4304263146505085, "grad_norm": 0.8093720993778148, "learning_rate": 5e-06, "loss": 0.4847, "step": 6610 }, { "epoch": 1.432590348409435, "grad_norm": 0.7534159752837184, "learning_rate": 5e-06, "loss": 0.4787, "step": 6620 }, { "epoch": 1.4347543821683617, "grad_norm": 0.7650400003197116, "learning_rate": 5e-06, "loss": 0.4682, "step": 6630 }, { "epoch": 1.4369184159272885, "grad_norm": 0.7965801032383258, "learning_rate": 5e-06, "loss": 0.4766, "step": 6640 }, { "epoch": 1.4390824496862151, "grad_norm": 0.8096153226598906, "learning_rate": 5e-06, "loss": 0.4708, "step": 6650 }, { "epoch": 1.4412464834451417, "grad_norm": 0.7872971777972805, "learning_rate": 5e-06, "loss": 0.4651, "step": 6660 }, { "epoch": 1.4434105172040683, "grad_norm": 0.7309523643397555, "learning_rate": 5e-06, "loss": 0.448, "step": 6670 }, { "epoch": 1.445574550962995, "grad_norm": 0.7739858235533356, "learning_rate": 5e-06, "loss": 0.463, "step": 6680 }, { "epoch": 1.4477385847219217, "grad_norm": 0.7215324022117396, "learning_rate": 5e-06, "loss": 0.4534, "step": 6690 }, { "epoch": 1.4499026184808483, "grad_norm": 0.8295795696846178, "learning_rate": 5e-06, "loss": 0.455, "step": 6700 }, { "epoch": 1.452066652239775, "grad_norm": 0.796699047578508, "learning_rate": 5e-06, "loss": 0.4675, "step": 6710 }, { "epoch": 1.4542306859987015, "grad_norm": 0.7661917101852191, "learning_rate": 5e-06, "loss": 0.4612, "step": 6720 }, { "epoch": 1.4563947197576281, "grad_norm": 0.7292322135619679, "learning_rate": 5e-06, "loss": 0.4723, "step": 6730 }, { "epoch": 1.458558753516555, "grad_norm": 0.7274092722777359, "learning_rate": 5e-06, "loss": 0.4668, "step": 6740 }, { "epoch": 1.4607227872754815, "grad_norm": 0.7908003272383125, "learning_rate": 5e-06, "loss": 0.4707, "step": 6750 }, { "epoch": 1.4628868210344081, "grad_norm": 0.7645468887204627, "learning_rate": 5e-06, "loss": 0.4681, "step": 6760 }, { "epoch": 1.4650508547933347, "grad_norm": 0.770270828639935, "learning_rate": 5e-06, "loss": 0.471, "step": 6770 }, { "epoch": 1.4672148885522613, "grad_norm": 0.7506554829741943, "learning_rate": 5e-06, "loss": 0.4792, "step": 6780 }, { "epoch": 1.4693789223111882, "grad_norm": 0.8131214331089905, "learning_rate": 5e-06, "loss": 0.4661, "step": 6790 }, { "epoch": 1.4715429560701148, "grad_norm": 0.7015838324010227, "learning_rate": 5e-06, "loss": 0.4711, "step": 6800 }, { "epoch": 1.4737069898290414, "grad_norm": 0.7861814435987762, "learning_rate": 5e-06, "loss": 0.4613, "step": 6810 }, { "epoch": 1.475871023587968, "grad_norm": 0.7591866835488571, "learning_rate": 5e-06, "loss": 0.4636, "step": 6820 }, { "epoch": 1.4780350573468946, "grad_norm": 0.7404595294517535, "learning_rate": 5e-06, "loss": 0.461, "step": 6830 }, { "epoch": 1.4801990911058214, "grad_norm": 0.7697305030650266, "learning_rate": 5e-06, "loss": 0.4598, "step": 6840 }, { "epoch": 1.482363124864748, "grad_norm": 0.784141929033681, "learning_rate": 5e-06, "loss": 0.4702, "step": 6850 }, { "epoch": 1.4845271586236746, "grad_norm": 0.8136152308153741, "learning_rate": 5e-06, "loss": 0.4552, "step": 6860 }, { "epoch": 1.4866911923826012, "grad_norm": 0.7345552940901736, "learning_rate": 5e-06, "loss": 0.4767, "step": 6870 }, { "epoch": 1.4888552261415278, "grad_norm": 0.7776472050120381, "learning_rate": 5e-06, "loss": 0.4744, "step": 6880 }, { "epoch": 1.4910192599004546, "grad_norm": 0.751123599938991, "learning_rate": 5e-06, "loss": 0.4678, "step": 6890 }, { "epoch": 1.493183293659381, "grad_norm": 0.7437167540645744, "learning_rate": 5e-06, "loss": 0.4659, "step": 6900 }, { "epoch": 1.4953473274183078, "grad_norm": 0.7721598263875197, "learning_rate": 5e-06, "loss": 0.482, "step": 6910 }, { "epoch": 1.4975113611772344, "grad_norm": 0.7817667866593803, "learning_rate": 5e-06, "loss": 0.4666, "step": 6920 }, { "epoch": 1.499675394936161, "grad_norm": 0.7694139195197549, "learning_rate": 5e-06, "loss": 0.466, "step": 6930 }, { "epoch": 1.5018394286950878, "grad_norm": 0.7795215699875335, "learning_rate": 5e-06, "loss": 0.4632, "step": 6940 }, { "epoch": 1.5040034624540142, "grad_norm": 0.7378131555326808, "learning_rate": 5e-06, "loss": 0.4703, "step": 6950 }, { "epoch": 1.506167496212941, "grad_norm": 0.7502895598759765, "learning_rate": 5e-06, "loss": 0.4675, "step": 6960 }, { "epoch": 1.5083315299718676, "grad_norm": 0.8323332633475088, "learning_rate": 5e-06, "loss": 0.4803, "step": 6970 }, { "epoch": 1.5104955637307942, "grad_norm": 0.7892580411151743, "learning_rate": 5e-06, "loss": 0.4765, "step": 6980 }, { "epoch": 1.5126595974897208, "grad_norm": 0.7875592663001918, "learning_rate": 5e-06, "loss": 0.4679, "step": 6990 }, { "epoch": 1.5148236312486474, "grad_norm": 0.7634571975953766, "learning_rate": 5e-06, "loss": 0.47, "step": 7000 }, { "epoch": 1.5169876650075742, "grad_norm": 0.7756829723486675, "learning_rate": 5e-06, "loss": 0.464, "step": 7010 }, { "epoch": 1.5191516987665008, "grad_norm": 0.7581561828919042, "learning_rate": 5e-06, "loss": 0.4561, "step": 7020 }, { "epoch": 1.5213157325254274, "grad_norm": 0.8138028391909014, "learning_rate": 5e-06, "loss": 0.4738, "step": 7030 }, { "epoch": 1.523479766284354, "grad_norm": 0.7713503967397953, "learning_rate": 5e-06, "loss": 0.4536, "step": 7040 }, { "epoch": 1.5256438000432806, "grad_norm": 0.7721407676831082, "learning_rate": 5e-06, "loss": 0.477, "step": 7050 }, { "epoch": 1.5278078338022074, "grad_norm": 0.7835072836836937, "learning_rate": 5e-06, "loss": 0.4781, "step": 7060 }, { "epoch": 1.5299718675611338, "grad_norm": 0.8062247813326742, "learning_rate": 5e-06, "loss": 0.4729, "step": 7070 }, { "epoch": 1.5321359013200606, "grad_norm": 0.7136619017556197, "learning_rate": 5e-06, "loss": 0.4531, "step": 7080 }, { "epoch": 1.5342999350789872, "grad_norm": 0.78043707583408, "learning_rate": 5e-06, "loss": 0.465, "step": 7090 }, { "epoch": 1.5364639688379138, "grad_norm": 0.7933315893100612, "learning_rate": 5e-06, "loss": 0.4655, "step": 7100 }, { "epoch": 1.5386280025968406, "grad_norm": 0.7476138813332562, "learning_rate": 5e-06, "loss": 0.4642, "step": 7110 }, { "epoch": 1.540792036355767, "grad_norm": 0.791596305147064, "learning_rate": 5e-06, "loss": 0.4656, "step": 7120 }, { "epoch": 1.5429560701146938, "grad_norm": 0.7765122295933571, "learning_rate": 5e-06, "loss": 0.4578, "step": 7130 }, { "epoch": 1.5451201038736204, "grad_norm": 0.7721038031057484, "learning_rate": 5e-06, "loss": 0.4724, "step": 7140 }, { "epoch": 1.547284137632547, "grad_norm": 0.7889869463184822, "learning_rate": 5e-06, "loss": 0.472, "step": 7150 }, { "epoch": 1.5494481713914738, "grad_norm": 0.7326495041461524, "learning_rate": 5e-06, "loss": 0.4587, "step": 7160 }, { "epoch": 1.5516122051504002, "grad_norm": 0.7713670879576949, "learning_rate": 5e-06, "loss": 0.472, "step": 7170 }, { "epoch": 1.553776238909327, "grad_norm": 0.7864345761195741, "learning_rate": 5e-06, "loss": 0.4708, "step": 7180 }, { "epoch": 1.5559402726682536, "grad_norm": 0.7669143468828357, "learning_rate": 5e-06, "loss": 0.4622, "step": 7190 }, { "epoch": 1.5581043064271802, "grad_norm": 0.8036141108986897, "learning_rate": 5e-06, "loss": 0.4779, "step": 7200 }, { "epoch": 1.560268340186107, "grad_norm": 0.7606518551283948, "learning_rate": 5e-06, "loss": 0.4677, "step": 7210 }, { "epoch": 1.5624323739450334, "grad_norm": 0.7411431196268208, "learning_rate": 5e-06, "loss": 0.4659, "step": 7220 }, { "epoch": 1.5645964077039602, "grad_norm": 0.7463278194705449, "learning_rate": 5e-06, "loss": 0.4636, "step": 7230 }, { "epoch": 1.5667604414628868, "grad_norm": 0.7916035934870831, "learning_rate": 5e-06, "loss": 0.4844, "step": 7240 }, { "epoch": 1.5689244752218134, "grad_norm": 0.8240660319984418, "learning_rate": 5e-06, "loss": 0.4683, "step": 7250 }, { "epoch": 1.5710885089807403, "grad_norm": 0.800318824146685, "learning_rate": 5e-06, "loss": 0.4633, "step": 7260 }, { "epoch": 1.5732525427396666, "grad_norm": 0.8094080756414094, "learning_rate": 5e-06, "loss": 0.4636, "step": 7270 }, { "epoch": 1.5754165764985935, "grad_norm": 0.774844371732236, "learning_rate": 5e-06, "loss": 0.4587, "step": 7280 }, { "epoch": 1.57758061025752, "grad_norm": 0.8210197993957168, "learning_rate": 5e-06, "loss": 0.4676, "step": 7290 }, { "epoch": 1.5797446440164467, "grad_norm": 0.7926225329081739, "learning_rate": 5e-06, "loss": 0.4746, "step": 7300 }, { "epoch": 1.5819086777753733, "grad_norm": 0.7714683122387773, "learning_rate": 5e-06, "loss": 0.4696, "step": 7310 }, { "epoch": 1.5840727115342998, "grad_norm": 0.7668988486606838, "learning_rate": 5e-06, "loss": 0.4637, "step": 7320 }, { "epoch": 1.5862367452932267, "grad_norm": 0.8068671779651346, "learning_rate": 5e-06, "loss": 0.4625, "step": 7330 }, { "epoch": 1.5884007790521533, "grad_norm": 0.7503714168675623, "learning_rate": 5e-06, "loss": 0.4626, "step": 7340 }, { "epoch": 1.5905648128110799, "grad_norm": 0.766762311923028, "learning_rate": 5e-06, "loss": 0.4607, "step": 7350 }, { "epoch": 1.5927288465700065, "grad_norm": 0.7400989288703183, "learning_rate": 5e-06, "loss": 0.4586, "step": 7360 }, { "epoch": 1.594892880328933, "grad_norm": 0.7420053327580863, "learning_rate": 5e-06, "loss": 0.4611, "step": 7370 }, { "epoch": 1.5970569140878599, "grad_norm": 0.7848315895928035, "learning_rate": 5e-06, "loss": 0.4622, "step": 7380 }, { "epoch": 1.5992209478467863, "grad_norm": 0.7844109009233191, "learning_rate": 5e-06, "loss": 0.4726, "step": 7390 }, { "epoch": 1.601384981605713, "grad_norm": 0.717574518313775, "learning_rate": 5e-06, "loss": 0.4768, "step": 7400 }, { "epoch": 1.6035490153646397, "grad_norm": 0.7684187592976545, "learning_rate": 5e-06, "loss": 0.4696, "step": 7410 }, { "epoch": 1.6057130491235663, "grad_norm": 0.7309659679104782, "learning_rate": 5e-06, "loss": 0.4626, "step": 7420 }, { "epoch": 1.607877082882493, "grad_norm": 0.7644663148759375, "learning_rate": 5e-06, "loss": 0.4772, "step": 7430 }, { "epoch": 1.6100411166414195, "grad_norm": 0.7764874598382566, "learning_rate": 5e-06, "loss": 0.4646, "step": 7440 }, { "epoch": 1.6122051504003463, "grad_norm": 0.7620772054300151, "learning_rate": 5e-06, "loss": 0.4586, "step": 7450 }, { "epoch": 1.6143691841592729, "grad_norm": 0.7820555180817107, "learning_rate": 5e-06, "loss": 0.4636, "step": 7460 }, { "epoch": 1.6165332179181995, "grad_norm": 0.7494625835808725, "learning_rate": 5e-06, "loss": 0.4585, "step": 7470 }, { "epoch": 1.6186972516771263, "grad_norm": 0.7521807522464258, "learning_rate": 5e-06, "loss": 0.4584, "step": 7480 }, { "epoch": 1.6208612854360527, "grad_norm": 0.7581980433304982, "learning_rate": 5e-06, "loss": 0.4588, "step": 7490 }, { "epoch": 1.6230253191949795, "grad_norm": 0.7627156140971617, "learning_rate": 5e-06, "loss": 0.4741, "step": 7500 }, { "epoch": 1.625189352953906, "grad_norm": 0.7905461292081447, "learning_rate": 5e-06, "loss": 0.4658, "step": 7510 }, { "epoch": 1.6273533867128327, "grad_norm": 0.8124704291753462, "learning_rate": 5e-06, "loss": 0.4735, "step": 7520 }, { "epoch": 1.6295174204717595, "grad_norm": 0.749724335823781, "learning_rate": 5e-06, "loss": 0.4629, "step": 7530 }, { "epoch": 1.631681454230686, "grad_norm": 0.7544817597703268, "learning_rate": 5e-06, "loss": 0.4664, "step": 7540 }, { "epoch": 1.6338454879896127, "grad_norm": 0.7703639179989715, "learning_rate": 5e-06, "loss": 0.4514, "step": 7550 }, { "epoch": 1.6360095217485393, "grad_norm": 0.790414972968879, "learning_rate": 5e-06, "loss": 0.4647, "step": 7560 }, { "epoch": 1.638173555507466, "grad_norm": 0.777968424055043, "learning_rate": 5e-06, "loss": 0.4687, "step": 7570 }, { "epoch": 1.6403375892663925, "grad_norm": 0.8168837011418277, "learning_rate": 5e-06, "loss": 0.4748, "step": 7580 }, { "epoch": 1.642501623025319, "grad_norm": 0.7880218866379387, "learning_rate": 5e-06, "loss": 0.463, "step": 7590 }, { "epoch": 1.644665656784246, "grad_norm": 0.7622801106079738, "learning_rate": 5e-06, "loss": 0.4687, "step": 7600 }, { "epoch": 1.6468296905431725, "grad_norm": 0.7536846157018885, "learning_rate": 5e-06, "loss": 0.4646, "step": 7610 }, { "epoch": 1.6489937243020991, "grad_norm": 0.7798435120288673, "learning_rate": 5e-06, "loss": 0.4709, "step": 7620 }, { "epoch": 1.6511577580610257, "grad_norm": 0.7550086843977464, "learning_rate": 5e-06, "loss": 0.4641, "step": 7630 }, { "epoch": 1.6533217918199523, "grad_norm": 0.7528156940459134, "learning_rate": 5e-06, "loss": 0.4668, "step": 7640 }, { "epoch": 1.6554858255788791, "grad_norm": 0.7718809368052605, "learning_rate": 5e-06, "loss": 0.4616, "step": 7650 }, { "epoch": 1.6576498593378055, "grad_norm": 0.7784564880148578, "learning_rate": 5e-06, "loss": 0.4667, "step": 7660 }, { "epoch": 1.6598138930967323, "grad_norm": 0.7397166601008868, "learning_rate": 5e-06, "loss": 0.4691, "step": 7670 }, { "epoch": 1.661977926855659, "grad_norm": 0.7881087574473704, "learning_rate": 5e-06, "loss": 0.4562, "step": 7680 }, { "epoch": 1.6641419606145855, "grad_norm": 0.7536684489497238, "learning_rate": 5e-06, "loss": 0.4581, "step": 7690 }, { "epoch": 1.6663059943735123, "grad_norm": 0.7981078552903653, "learning_rate": 5e-06, "loss": 0.4644, "step": 7700 }, { "epoch": 1.6684700281324387, "grad_norm": 0.7019669743910957, "learning_rate": 5e-06, "loss": 0.4636, "step": 7710 }, { "epoch": 1.6706340618913655, "grad_norm": 0.7977655257782118, "learning_rate": 5e-06, "loss": 0.4721, "step": 7720 }, { "epoch": 1.6727980956502921, "grad_norm": 0.8017386100927005, "learning_rate": 5e-06, "loss": 0.469, "step": 7730 }, { "epoch": 1.6749621294092187, "grad_norm": 0.771451526799465, "learning_rate": 5e-06, "loss": 0.4596, "step": 7740 }, { "epoch": 1.6771261631681456, "grad_norm": 0.7227699836985734, "learning_rate": 5e-06, "loss": 0.4546, "step": 7750 }, { "epoch": 1.679290196927072, "grad_norm": 0.7676454649666475, "learning_rate": 5e-06, "loss": 0.4663, "step": 7760 }, { "epoch": 1.6814542306859988, "grad_norm": 0.8226610282436956, "learning_rate": 5e-06, "loss": 0.4745, "step": 7770 }, { "epoch": 1.6836182644449253, "grad_norm": 0.7771996492106517, "learning_rate": 5e-06, "loss": 0.459, "step": 7780 }, { "epoch": 1.685782298203852, "grad_norm": 0.7845408388142724, "learning_rate": 5e-06, "loss": 0.4662, "step": 7790 }, { "epoch": 1.6879463319627788, "grad_norm": 0.8358661114189557, "learning_rate": 5e-06, "loss": 0.4616, "step": 7800 }, { "epoch": 1.6901103657217051, "grad_norm": 0.745445671430633, "learning_rate": 5e-06, "loss": 0.4633, "step": 7810 }, { "epoch": 1.692274399480632, "grad_norm": 0.7806093109649215, "learning_rate": 5e-06, "loss": 0.4713, "step": 7820 }, { "epoch": 1.6944384332395586, "grad_norm": 0.7931013296476084, "learning_rate": 5e-06, "loss": 0.4518, "step": 7830 }, { "epoch": 1.6966024669984852, "grad_norm": 0.7523743415278165, "learning_rate": 5e-06, "loss": 0.468, "step": 7840 }, { "epoch": 1.698766500757412, "grad_norm": 0.7657925402229067, "learning_rate": 5e-06, "loss": 0.4653, "step": 7850 }, { "epoch": 1.7009305345163384, "grad_norm": 0.7623478102612719, "learning_rate": 5e-06, "loss": 0.4714, "step": 7860 }, { "epoch": 1.7030945682752652, "grad_norm": 0.7981733209185972, "learning_rate": 5e-06, "loss": 0.4699, "step": 7870 }, { "epoch": 1.7052586020341918, "grad_norm": 0.7781587550663129, "learning_rate": 5e-06, "loss": 0.4765, "step": 7880 }, { "epoch": 1.7074226357931184, "grad_norm": 0.7587942982310993, "learning_rate": 5e-06, "loss": 0.4771, "step": 7890 }, { "epoch": 1.709586669552045, "grad_norm": 0.7698688590200006, "learning_rate": 5e-06, "loss": 0.4647, "step": 7900 }, { "epoch": 1.7117507033109716, "grad_norm": 0.804203999005989, "learning_rate": 5e-06, "loss": 0.4686, "step": 7910 }, { "epoch": 1.7139147370698984, "grad_norm": 0.7911772144270678, "learning_rate": 5e-06, "loss": 0.4639, "step": 7920 }, { "epoch": 1.716078770828825, "grad_norm": 0.7836618578593015, "learning_rate": 5e-06, "loss": 0.4666, "step": 7930 }, { "epoch": 1.7182428045877516, "grad_norm": 0.788575508265406, "learning_rate": 5e-06, "loss": 0.4729, "step": 7940 }, { "epoch": 1.7204068383466782, "grad_norm": 0.7861934781471758, "learning_rate": 5e-06, "loss": 0.4555, "step": 7950 }, { "epoch": 1.7225708721056048, "grad_norm": 0.7536239150523527, "learning_rate": 5e-06, "loss": 0.4694, "step": 7960 }, { "epoch": 1.7247349058645316, "grad_norm": 0.7316152360900233, "learning_rate": 5e-06, "loss": 0.4654, "step": 7970 }, { "epoch": 1.726898939623458, "grad_norm": 0.7959312951820343, "learning_rate": 5e-06, "loss": 0.4742, "step": 7980 }, { "epoch": 1.7290629733823848, "grad_norm": 0.7328251440184733, "learning_rate": 5e-06, "loss": 0.4636, "step": 7990 }, { "epoch": 1.7312270071413114, "grad_norm": 0.8721623540661089, "learning_rate": 5e-06, "loss": 0.4727, "step": 8000 }, { "epoch": 1.733391040900238, "grad_norm": 0.7788156040311068, "learning_rate": 5e-06, "loss": 0.479, "step": 8010 }, { "epoch": 1.7355550746591648, "grad_norm": 0.7352632852244723, "learning_rate": 5e-06, "loss": 0.4694, "step": 8020 }, { "epoch": 1.7377191084180912, "grad_norm": 0.8294281676756476, "learning_rate": 5e-06, "loss": 0.4756, "step": 8030 }, { "epoch": 1.739883142177018, "grad_norm": 0.7872766681208978, "learning_rate": 5e-06, "loss": 0.4714, "step": 8040 }, { "epoch": 1.7420471759359446, "grad_norm": 0.7549853443648625, "learning_rate": 5e-06, "loss": 0.4793, "step": 8050 }, { "epoch": 1.7442112096948712, "grad_norm": 0.7752211889605723, "learning_rate": 5e-06, "loss": 0.4568, "step": 8060 }, { "epoch": 1.746375243453798, "grad_norm": 0.7656375809085874, "learning_rate": 5e-06, "loss": 0.4626, "step": 8070 }, { "epoch": 1.7485392772127244, "grad_norm": 0.7758127700797092, "learning_rate": 5e-06, "loss": 0.4583, "step": 8080 }, { "epoch": 1.7507033109716512, "grad_norm": 0.7634613626649636, "learning_rate": 5e-06, "loss": 0.4595, "step": 8090 }, { "epoch": 1.7528673447305778, "grad_norm": 0.7698077132883445, "learning_rate": 5e-06, "loss": 0.4717, "step": 8100 }, { "epoch": 1.7550313784895044, "grad_norm": 0.7383242906768342, "learning_rate": 5e-06, "loss": 0.4626, "step": 8110 }, { "epoch": 1.7571954122484312, "grad_norm": 0.823790637914916, "learning_rate": 5e-06, "loss": 0.4617, "step": 8120 }, { "epoch": 1.7593594460073576, "grad_norm": 0.7581731295515508, "learning_rate": 5e-06, "loss": 0.4599, "step": 8130 }, { "epoch": 1.7615234797662844, "grad_norm": 0.7863922041923643, "learning_rate": 5e-06, "loss": 0.4633, "step": 8140 }, { "epoch": 1.763687513525211, "grad_norm": 0.7258530440371272, "learning_rate": 5e-06, "loss": 0.4695, "step": 8150 }, { "epoch": 1.7658515472841376, "grad_norm": 0.7617292499788662, "learning_rate": 5e-06, "loss": 0.4584, "step": 8160 }, { "epoch": 1.7680155810430642, "grad_norm": 0.7835216384518215, "learning_rate": 5e-06, "loss": 0.4776, "step": 8170 }, { "epoch": 1.7701796148019908, "grad_norm": 0.7905903337960178, "learning_rate": 5e-06, "loss": 0.4575, "step": 8180 }, { "epoch": 1.7723436485609176, "grad_norm": 0.7470273698032939, "learning_rate": 5e-06, "loss": 0.4519, "step": 8190 }, { "epoch": 1.7745076823198442, "grad_norm": 0.7672368207649455, "learning_rate": 5e-06, "loss": 0.4648, "step": 8200 }, { "epoch": 1.7766717160787708, "grad_norm": 0.7316674700923429, "learning_rate": 5e-06, "loss": 0.4724, "step": 8210 }, { "epoch": 1.7788357498376974, "grad_norm": 0.7360630882150441, "learning_rate": 5e-06, "loss": 0.4638, "step": 8220 }, { "epoch": 1.780999783596624, "grad_norm": 0.781016234417425, "learning_rate": 5e-06, "loss": 0.4728, "step": 8230 }, { "epoch": 1.7831638173555509, "grad_norm": 0.7876473046986651, "learning_rate": 5e-06, "loss": 0.4592, "step": 8240 }, { "epoch": 1.7853278511144772, "grad_norm": 0.7404645309296104, "learning_rate": 5e-06, "loss": 0.4624, "step": 8250 }, { "epoch": 1.787491884873404, "grad_norm": 0.7684119788472853, "learning_rate": 5e-06, "loss": 0.4635, "step": 8260 }, { "epoch": 1.7896559186323306, "grad_norm": 0.7663695712294315, "learning_rate": 5e-06, "loss": 0.465, "step": 8270 }, { "epoch": 1.7918199523912572, "grad_norm": 0.7590055968711897, "learning_rate": 5e-06, "loss": 0.4643, "step": 8280 }, { "epoch": 1.793983986150184, "grad_norm": 0.7895074154532604, "learning_rate": 5e-06, "loss": 0.4671, "step": 8290 }, { "epoch": 1.7961480199091104, "grad_norm": 0.7962575230042065, "learning_rate": 5e-06, "loss": 0.472, "step": 8300 }, { "epoch": 1.7983120536680373, "grad_norm": 0.7478573445116543, "learning_rate": 5e-06, "loss": 0.4626, "step": 8310 }, { "epoch": 1.8004760874269639, "grad_norm": 0.762202720740558, "learning_rate": 5e-06, "loss": 0.4515, "step": 8320 }, { "epoch": 1.8026401211858905, "grad_norm": 0.8404740937407859, "learning_rate": 5e-06, "loss": 0.4674, "step": 8330 }, { "epoch": 1.8048041549448173, "grad_norm": 0.7952968170552206, "learning_rate": 5e-06, "loss": 0.4619, "step": 8340 }, { "epoch": 1.8069681887037436, "grad_norm": 0.7820220237908069, "learning_rate": 5e-06, "loss": 0.4667, "step": 8350 }, { "epoch": 1.8091322224626705, "grad_norm": 0.7888823174892542, "learning_rate": 5e-06, "loss": 0.4732, "step": 8360 }, { "epoch": 1.811296256221597, "grad_norm": 0.7350957126888091, "learning_rate": 5e-06, "loss": 0.4532, "step": 8370 }, { "epoch": 1.8134602899805237, "grad_norm": 0.6957708222610044, "learning_rate": 5e-06, "loss": 0.4497, "step": 8380 }, { "epoch": 1.8156243237394505, "grad_norm": 0.8053515701021636, "learning_rate": 5e-06, "loss": 0.4785, "step": 8390 }, { "epoch": 1.8177883574983769, "grad_norm": 0.6995897457091669, "learning_rate": 5e-06, "loss": 0.4551, "step": 8400 }, { "epoch": 1.8199523912573037, "grad_norm": 0.7753814167641445, "learning_rate": 5e-06, "loss": 0.4621, "step": 8410 }, { "epoch": 1.8221164250162303, "grad_norm": 0.7661880273668203, "learning_rate": 5e-06, "loss": 0.4609, "step": 8420 }, { "epoch": 1.8242804587751569, "grad_norm": 0.7609799960654735, "learning_rate": 5e-06, "loss": 0.4736, "step": 8430 }, { "epoch": 1.8264444925340837, "grad_norm": 0.7841320073323229, "learning_rate": 5e-06, "loss": 0.4575, "step": 8440 }, { "epoch": 1.82860852629301, "grad_norm": 0.8063573574026484, "learning_rate": 5e-06, "loss": 0.4617, "step": 8450 }, { "epoch": 1.830772560051937, "grad_norm": 0.7854293405760782, "learning_rate": 5e-06, "loss": 0.4777, "step": 8460 }, { "epoch": 1.8329365938108635, "grad_norm": 0.7408682039776677, "learning_rate": 5e-06, "loss": 0.4614, "step": 8470 }, { "epoch": 1.83510062756979, "grad_norm": 0.7231058234976815, "learning_rate": 5e-06, "loss": 0.4624, "step": 8480 }, { "epoch": 1.8372646613287167, "grad_norm": 0.800682041278424, "learning_rate": 5e-06, "loss": 0.4641, "step": 8490 }, { "epoch": 1.8394286950876433, "grad_norm": 0.7673472834532682, "learning_rate": 5e-06, "loss": 0.4606, "step": 8500 }, { "epoch": 1.84159272884657, "grad_norm": 0.7619493537629672, "learning_rate": 5e-06, "loss": 0.4706, "step": 8510 }, { "epoch": 1.8437567626054965, "grad_norm": 0.7972147582688511, "learning_rate": 5e-06, "loss": 0.4592, "step": 8520 }, { "epoch": 1.8459207963644233, "grad_norm": 0.7625120492458264, "learning_rate": 5e-06, "loss": 0.4571, "step": 8530 }, { "epoch": 1.84808483012335, "grad_norm": 0.8283056055978331, "learning_rate": 5e-06, "loss": 0.4739, "step": 8540 }, { "epoch": 1.8502488638822765, "grad_norm": 0.7573851435355785, "learning_rate": 5e-06, "loss": 0.4671, "step": 8550 }, { "epoch": 1.8524128976412033, "grad_norm": 0.8293614574747807, "learning_rate": 5e-06, "loss": 0.4652, "step": 8560 }, { "epoch": 1.8545769314001297, "grad_norm": 0.7412074084724187, "learning_rate": 5e-06, "loss": 0.4592, "step": 8570 }, { "epoch": 1.8567409651590565, "grad_norm": 0.8475011905897842, "learning_rate": 5e-06, "loss": 0.4588, "step": 8580 }, { "epoch": 1.858904998917983, "grad_norm": 0.7431837931733989, "learning_rate": 5e-06, "loss": 0.453, "step": 8590 }, { "epoch": 1.8610690326769097, "grad_norm": 0.743103879188485, "learning_rate": 5e-06, "loss": 0.4685, "step": 8600 }, { "epoch": 1.8632330664358365, "grad_norm": 0.7512142776663842, "learning_rate": 5e-06, "loss": 0.4685, "step": 8610 }, { "epoch": 1.865397100194763, "grad_norm": 0.733739499974568, "learning_rate": 5e-06, "loss": 0.4625, "step": 8620 }, { "epoch": 1.8675611339536897, "grad_norm": 0.7976859851306687, "learning_rate": 5e-06, "loss": 0.4676, "step": 8630 }, { "epoch": 1.8697251677126163, "grad_norm": 0.7661212684653242, "learning_rate": 5e-06, "loss": 0.475, "step": 8640 }, { "epoch": 1.871889201471543, "grad_norm": 0.771734215920023, "learning_rate": 5e-06, "loss": 0.4621, "step": 8650 }, { "epoch": 1.8740532352304697, "grad_norm": 0.766268946678346, "learning_rate": 5e-06, "loss": 0.4531, "step": 8660 }, { "epoch": 1.8762172689893961, "grad_norm": 0.7754183196315589, "learning_rate": 5e-06, "loss": 0.4673, "step": 8670 }, { "epoch": 1.878381302748323, "grad_norm": 0.7534048500045719, "learning_rate": 5e-06, "loss": 0.467, "step": 8680 }, { "epoch": 1.8805453365072495, "grad_norm": 0.7764377429907858, "learning_rate": 5e-06, "loss": 0.4579, "step": 8690 }, { "epoch": 1.8827093702661761, "grad_norm": 0.7698958247598046, "learning_rate": 5e-06, "loss": 0.4786, "step": 8700 }, { "epoch": 1.884873404025103, "grad_norm": 0.7651418118029404, "learning_rate": 5e-06, "loss": 0.4594, "step": 8710 }, { "epoch": 1.8870374377840293, "grad_norm": 0.7590832586194605, "learning_rate": 5e-06, "loss": 0.46, "step": 8720 }, { "epoch": 1.8892014715429561, "grad_norm": 0.7731735855097023, "learning_rate": 5e-06, "loss": 0.4727, "step": 8730 }, { "epoch": 1.8913655053018827, "grad_norm": 0.7856091416226185, "learning_rate": 5e-06, "loss": 0.4503, "step": 8740 }, { "epoch": 1.8935295390608093, "grad_norm": 0.7722457110486987, "learning_rate": 5e-06, "loss": 0.4605, "step": 8750 }, { "epoch": 1.895693572819736, "grad_norm": 0.8198508677971693, "learning_rate": 5e-06, "loss": 0.4673, "step": 8760 }, { "epoch": 1.8978576065786625, "grad_norm": 0.7910065087434878, "learning_rate": 5e-06, "loss": 0.4619, "step": 8770 }, { "epoch": 1.9000216403375894, "grad_norm": 0.7485212969421532, "learning_rate": 5e-06, "loss": 0.4687, "step": 8780 }, { "epoch": 1.902185674096516, "grad_norm": 0.8215859586315127, "learning_rate": 5e-06, "loss": 0.4555, "step": 8790 }, { "epoch": 1.9043497078554426, "grad_norm": 0.7909926394009462, "learning_rate": 5e-06, "loss": 0.4666, "step": 8800 }, { "epoch": 1.9065137416143692, "grad_norm": 0.7403806845841657, "learning_rate": 5e-06, "loss": 0.4722, "step": 8810 }, { "epoch": 1.9086777753732957, "grad_norm": 0.7617013904099731, "learning_rate": 5e-06, "loss": 0.4553, "step": 8820 }, { "epoch": 1.9108418091322226, "grad_norm": 0.7849518527403264, "learning_rate": 5e-06, "loss": 0.4612, "step": 8830 }, { "epoch": 1.913005842891149, "grad_norm": 0.7899431737849835, "learning_rate": 5e-06, "loss": 0.4795, "step": 8840 }, { "epoch": 1.9151698766500758, "grad_norm": 0.7232707414732422, "learning_rate": 5e-06, "loss": 0.4661, "step": 8850 }, { "epoch": 1.9173339104090024, "grad_norm": 0.7715859939569867, "learning_rate": 5e-06, "loss": 0.4729, "step": 8860 }, { "epoch": 1.919497944167929, "grad_norm": 0.778365711933746, "learning_rate": 5e-06, "loss": 0.466, "step": 8870 }, { "epoch": 1.9216619779268558, "grad_norm": 0.7315745037821393, "learning_rate": 5e-06, "loss": 0.458, "step": 8880 }, { "epoch": 1.9238260116857822, "grad_norm": 0.7850569825810849, "learning_rate": 5e-06, "loss": 0.4657, "step": 8890 }, { "epoch": 1.925990045444709, "grad_norm": 0.7935430629873113, "learning_rate": 5e-06, "loss": 0.4704, "step": 8900 }, { "epoch": 1.9281540792036356, "grad_norm": 0.8082828639182642, "learning_rate": 5e-06, "loss": 0.4694, "step": 8910 }, { "epoch": 1.9303181129625622, "grad_norm": 0.7982975795262971, "learning_rate": 5e-06, "loss": 0.4595, "step": 8920 }, { "epoch": 1.932482146721489, "grad_norm": 0.7278908896393382, "learning_rate": 5e-06, "loss": 0.4705, "step": 8930 }, { "epoch": 1.9346461804804154, "grad_norm": 0.7593550485177655, "learning_rate": 5e-06, "loss": 0.4769, "step": 8940 }, { "epoch": 1.9368102142393422, "grad_norm": 0.7560080244005302, "learning_rate": 5e-06, "loss": 0.4508, "step": 8950 }, { "epoch": 1.9389742479982688, "grad_norm": 0.7523700953082769, "learning_rate": 5e-06, "loss": 0.4583, "step": 8960 }, { "epoch": 1.9411382817571954, "grad_norm": 0.7729932814097218, "learning_rate": 5e-06, "loss": 0.46, "step": 8970 }, { "epoch": 1.9433023155161222, "grad_norm": 0.7859581817688281, "learning_rate": 5e-06, "loss": 0.4657, "step": 8980 }, { "epoch": 1.9454663492750486, "grad_norm": 0.7649621801554861, "learning_rate": 5e-06, "loss": 0.4673, "step": 8990 }, { "epoch": 1.9476303830339754, "grad_norm": 0.7697491440260914, "learning_rate": 5e-06, "loss": 0.4635, "step": 9000 }, { "epoch": 1.949794416792902, "grad_norm": 0.783274985348169, "learning_rate": 5e-06, "loss": 0.4634, "step": 9010 }, { "epoch": 1.9519584505518286, "grad_norm": 0.715096846477804, "learning_rate": 5e-06, "loss": 0.4693, "step": 9020 }, { "epoch": 1.9541224843107552, "grad_norm": 0.8477280634175208, "learning_rate": 5e-06, "loss": 0.4597, "step": 9030 }, { "epoch": 1.9562865180696818, "grad_norm": 0.7592259661678222, "learning_rate": 5e-06, "loss": 0.459, "step": 9040 }, { "epoch": 1.9584505518286086, "grad_norm": 0.855762642211244, "learning_rate": 5e-06, "loss": 0.4552, "step": 9050 }, { "epoch": 1.9606145855875352, "grad_norm": 0.7724079364010088, "learning_rate": 5e-06, "loss": 0.4585, "step": 9060 }, { "epoch": 1.9627786193464618, "grad_norm": 0.7472414181252647, "learning_rate": 5e-06, "loss": 0.4575, "step": 9070 }, { "epoch": 1.9649426531053884, "grad_norm": 0.77667179344492, "learning_rate": 5e-06, "loss": 0.4702, "step": 9080 }, { "epoch": 1.967106686864315, "grad_norm": 0.7738412500478078, "learning_rate": 5e-06, "loss": 0.4548, "step": 9090 }, { "epoch": 1.9692707206232418, "grad_norm": 0.6873551934444759, "learning_rate": 5e-06, "loss": 0.4618, "step": 9100 }, { "epoch": 1.9714347543821682, "grad_norm": 0.7616982926840125, "learning_rate": 5e-06, "loss": 0.4633, "step": 9110 }, { "epoch": 1.973598788141095, "grad_norm": 0.7359315671968003, "learning_rate": 5e-06, "loss": 0.4507, "step": 9120 }, { "epoch": 1.9757628219000216, "grad_norm": 0.7862965542736634, "learning_rate": 5e-06, "loss": 0.4752, "step": 9130 }, { "epoch": 1.9779268556589482, "grad_norm": 0.7833594304960272, "learning_rate": 5e-06, "loss": 0.4665, "step": 9140 }, { "epoch": 1.980090889417875, "grad_norm": 0.7749375306947249, "learning_rate": 5e-06, "loss": 0.4606, "step": 9150 }, { "epoch": 1.9822549231768014, "grad_norm": 0.7755483335751571, "learning_rate": 5e-06, "loss": 0.46, "step": 9160 }, { "epoch": 1.9844189569357282, "grad_norm": 0.7318259540031191, "learning_rate": 5e-06, "loss": 0.4557, "step": 9170 }, { "epoch": 1.9865829906946548, "grad_norm": 0.7415991714178316, "learning_rate": 5e-06, "loss": 0.4699, "step": 9180 }, { "epoch": 1.9887470244535814, "grad_norm": 0.8918970620944271, "learning_rate": 5e-06, "loss": 0.4584, "step": 9190 }, { "epoch": 1.9909110582125082, "grad_norm": 0.7854257183159034, "learning_rate": 5e-06, "loss": 0.4659, "step": 9200 }, { "epoch": 1.9930750919714346, "grad_norm": 0.743885959515272, "learning_rate": 5e-06, "loss": 0.4614, "step": 9210 }, { "epoch": 1.9952391257303614, "grad_norm": 0.7564617143226638, "learning_rate": 5e-06, "loss": 0.4629, "step": 9220 }, { "epoch": 1.997403159489288, "grad_norm": 0.7956935106295134, "learning_rate": 5e-06, "loss": 0.46, "step": 9230 }, { "epoch": 1.9995671932482146, "grad_norm": 0.7902840252478065, "learning_rate": 5e-06, "loss": 0.4605, "step": 9240 }, { "epoch": 2.0, "eval_loss": 0.5312886238098145, "eval_runtime": 585.8565, "eval_samples_per_second": 26.566, "eval_steps_per_second": 0.416, "step": 9242 }, { "epoch": 2.0017312270071415, "grad_norm": 0.8374428329689654, "learning_rate": 5e-06, "loss": 0.3932, "step": 9250 }, { "epoch": 2.003895260766068, "grad_norm": 0.7293070552754123, "learning_rate": 5e-06, "loss": 0.3703, "step": 9260 }, { "epoch": 2.0060592945249947, "grad_norm": 0.7267288150501345, "learning_rate": 5e-06, "loss": 0.3791, "step": 9270 }, { "epoch": 2.008223328283921, "grad_norm": 0.7328996332443125, "learning_rate": 5e-06, "loss": 0.3682, "step": 9280 }, { "epoch": 2.010387362042848, "grad_norm": 0.769240951148092, "learning_rate": 5e-06, "loss": 0.3771, "step": 9290 }, { "epoch": 2.0125513958017747, "grad_norm": 0.782608875094357, "learning_rate": 5e-06, "loss": 0.3835, "step": 9300 }, { "epoch": 2.014715429560701, "grad_norm": 0.7343404957266344, "learning_rate": 5e-06, "loss": 0.3672, "step": 9310 }, { "epoch": 2.016879463319628, "grad_norm": 0.7361975290672562, "learning_rate": 5e-06, "loss": 0.3638, "step": 9320 }, { "epoch": 2.0190434970785542, "grad_norm": 0.8170669306278748, "learning_rate": 5e-06, "loss": 0.3833, "step": 9330 }, { "epoch": 2.021207530837481, "grad_norm": 0.7599248424010396, "learning_rate": 5e-06, "loss": 0.3799, "step": 9340 }, { "epoch": 2.023371564596408, "grad_norm": 0.7854230438584365, "learning_rate": 5e-06, "loss": 0.3774, "step": 9350 }, { "epoch": 2.0255355983553343, "grad_norm": 0.747309033166393, "learning_rate": 5e-06, "loss": 0.3837, "step": 9360 }, { "epoch": 2.027699632114261, "grad_norm": 0.7554391399636565, "learning_rate": 5e-06, "loss": 0.373, "step": 9370 }, { "epoch": 2.0298636658731875, "grad_norm": 0.7369701594292731, "learning_rate": 5e-06, "loss": 0.373, "step": 9380 }, { "epoch": 2.0320276996321143, "grad_norm": 0.7920170859213356, "learning_rate": 5e-06, "loss": 0.3829, "step": 9390 }, { "epoch": 2.034191733391041, "grad_norm": 0.7385616800337693, "learning_rate": 5e-06, "loss": 0.3732, "step": 9400 }, { "epoch": 2.0363557671499675, "grad_norm": 0.7370778661512036, "learning_rate": 5e-06, "loss": 0.3634, "step": 9410 }, { "epoch": 2.0385198009088943, "grad_norm": 0.7064048496785564, "learning_rate": 5e-06, "loss": 0.3725, "step": 9420 }, { "epoch": 2.0406838346678207, "grad_norm": 0.7588040574820386, "learning_rate": 5e-06, "loss": 0.3803, "step": 9430 }, { "epoch": 2.0428478684267475, "grad_norm": 0.6837725115579112, "learning_rate": 5e-06, "loss": 0.3711, "step": 9440 }, { "epoch": 2.0450119021856743, "grad_norm": 0.7072237203949913, "learning_rate": 5e-06, "loss": 0.3737, "step": 9450 }, { "epoch": 2.0471759359446007, "grad_norm": 0.7715770726086181, "learning_rate": 5e-06, "loss": 0.3779, "step": 9460 }, { "epoch": 2.0493399697035275, "grad_norm": 0.7603839434200408, "learning_rate": 5e-06, "loss": 0.3785, "step": 9470 }, { "epoch": 2.051504003462454, "grad_norm": 0.7811463267122977, "learning_rate": 5e-06, "loss": 0.3755, "step": 9480 }, { "epoch": 2.0536680372213807, "grad_norm": 0.7223273231471339, "learning_rate": 5e-06, "loss": 0.3775, "step": 9490 }, { "epoch": 2.0558320709803075, "grad_norm": 0.7489458096255904, "learning_rate": 5e-06, "loss": 0.3752, "step": 9500 }, { "epoch": 2.057996104739234, "grad_norm": 0.7169025560301021, "learning_rate": 5e-06, "loss": 0.3838, "step": 9510 }, { "epoch": 2.0601601384981607, "grad_norm": 0.8241052771764841, "learning_rate": 5e-06, "loss": 0.3763, "step": 9520 }, { "epoch": 2.062324172257087, "grad_norm": 0.7344203339500558, "learning_rate": 5e-06, "loss": 0.3759, "step": 9530 }, { "epoch": 2.064488206016014, "grad_norm": 0.7469451548032326, "learning_rate": 5e-06, "loss": 0.3768, "step": 9540 }, { "epoch": 2.0666522397749403, "grad_norm": 0.7084397676411069, "learning_rate": 5e-06, "loss": 0.3764, "step": 9550 }, { "epoch": 2.068816273533867, "grad_norm": 0.7938766312720807, "learning_rate": 5e-06, "loss": 0.383, "step": 9560 }, { "epoch": 2.070980307292794, "grad_norm": 0.766097768652044, "learning_rate": 5e-06, "loss": 0.376, "step": 9570 }, { "epoch": 2.0731443410517203, "grad_norm": 0.7626098304896327, "learning_rate": 5e-06, "loss": 0.3735, "step": 9580 }, { "epoch": 2.075308374810647, "grad_norm": 0.7506050168232735, "learning_rate": 5e-06, "loss": 0.3763, "step": 9590 }, { "epoch": 2.0774724085695735, "grad_norm": 0.7505245897537008, "learning_rate": 5e-06, "loss": 0.3781, "step": 9600 }, { "epoch": 2.0796364423285003, "grad_norm": 0.7826239773634133, "learning_rate": 5e-06, "loss": 0.3756, "step": 9610 }, { "epoch": 2.081800476087427, "grad_norm": 0.7278522309360076, "learning_rate": 5e-06, "loss": 0.3752, "step": 9620 }, { "epoch": 2.0839645098463535, "grad_norm": 0.7756181784692793, "learning_rate": 5e-06, "loss": 0.3764, "step": 9630 }, { "epoch": 2.0861285436052803, "grad_norm": 0.7359055323127612, "learning_rate": 5e-06, "loss": 0.3722, "step": 9640 }, { "epoch": 2.0882925773642067, "grad_norm": 0.7577112116476528, "learning_rate": 5e-06, "loss": 0.3847, "step": 9650 }, { "epoch": 2.0904566111231335, "grad_norm": 0.8136878823112847, "learning_rate": 5e-06, "loss": 0.3897, "step": 9660 }, { "epoch": 2.0926206448820603, "grad_norm": 0.7427782161695468, "learning_rate": 5e-06, "loss": 0.3782, "step": 9670 }, { "epoch": 2.0947846786409867, "grad_norm": 0.8424273390908451, "learning_rate": 5e-06, "loss": 0.3879, "step": 9680 }, { "epoch": 2.0969487123999135, "grad_norm": 0.7658654614761568, "learning_rate": 5e-06, "loss": 0.3809, "step": 9690 }, { "epoch": 2.09911274615884, "grad_norm": 0.7636850827656342, "learning_rate": 5e-06, "loss": 0.3802, "step": 9700 }, { "epoch": 2.1012767799177667, "grad_norm": 0.7406666012902577, "learning_rate": 5e-06, "loss": 0.3737, "step": 9710 }, { "epoch": 2.1034408136766936, "grad_norm": 0.731920134607451, "learning_rate": 5e-06, "loss": 0.3668, "step": 9720 }, { "epoch": 2.10560484743562, "grad_norm": 0.7932449176977977, "learning_rate": 5e-06, "loss": 0.3793, "step": 9730 }, { "epoch": 2.1077688811945468, "grad_norm": 0.7397705079775352, "learning_rate": 5e-06, "loss": 0.379, "step": 9740 }, { "epoch": 2.109932914953473, "grad_norm": 0.8144870875402731, "learning_rate": 5e-06, "loss": 0.3831, "step": 9750 }, { "epoch": 2.1120969487124, "grad_norm": 0.7451950427223708, "learning_rate": 5e-06, "loss": 0.3762, "step": 9760 }, { "epoch": 2.1142609824713268, "grad_norm": 0.7355015739740769, "learning_rate": 5e-06, "loss": 0.3778, "step": 9770 }, { "epoch": 2.116425016230253, "grad_norm": 0.7536357069669827, "learning_rate": 5e-06, "loss": 0.3758, "step": 9780 }, { "epoch": 2.11858904998918, "grad_norm": 0.7708756762306975, "learning_rate": 5e-06, "loss": 0.3738, "step": 9790 }, { "epoch": 2.1207530837481063, "grad_norm": 0.7626855696919476, "learning_rate": 5e-06, "loss": 0.3847, "step": 9800 }, { "epoch": 2.122917117507033, "grad_norm": 0.7158584629999891, "learning_rate": 5e-06, "loss": 0.3753, "step": 9810 }, { "epoch": 2.1250811512659595, "grad_norm": 0.7466487488136114, "learning_rate": 5e-06, "loss": 0.3764, "step": 9820 }, { "epoch": 2.1272451850248864, "grad_norm": 0.7404349219412076, "learning_rate": 5e-06, "loss": 0.3721, "step": 9830 }, { "epoch": 2.129409218783813, "grad_norm": 0.7668568690340375, "learning_rate": 5e-06, "loss": 0.3768, "step": 9840 }, { "epoch": 2.1315732525427395, "grad_norm": 0.7268904811888163, "learning_rate": 5e-06, "loss": 0.3712, "step": 9850 }, { "epoch": 2.1337372863016664, "grad_norm": 0.7719283305769358, "learning_rate": 5e-06, "loss": 0.3902, "step": 9860 }, { "epoch": 2.1359013200605927, "grad_norm": 0.7079289824393394, "learning_rate": 5e-06, "loss": 0.3779, "step": 9870 }, { "epoch": 2.1380653538195196, "grad_norm": 0.7571867101155465, "learning_rate": 5e-06, "loss": 0.3836, "step": 9880 }, { "epoch": 2.1402293875784464, "grad_norm": 0.7549119684648362, "learning_rate": 5e-06, "loss": 0.3851, "step": 9890 }, { "epoch": 2.1423934213373728, "grad_norm": 0.7941986493329641, "learning_rate": 5e-06, "loss": 0.3776, "step": 9900 }, { "epoch": 2.1445574550962996, "grad_norm": 0.7402941631803586, "learning_rate": 5e-06, "loss": 0.376, "step": 9910 }, { "epoch": 2.146721488855226, "grad_norm": 0.7853141015981866, "learning_rate": 5e-06, "loss": 0.3757, "step": 9920 }, { "epoch": 2.1488855226141528, "grad_norm": 0.7502860009884765, "learning_rate": 5e-06, "loss": 0.3818, "step": 9930 }, { "epoch": 2.1510495563730796, "grad_norm": 0.760008223381561, "learning_rate": 5e-06, "loss": 0.3849, "step": 9940 }, { "epoch": 2.153213590132006, "grad_norm": 0.7940769908926166, "learning_rate": 5e-06, "loss": 0.3804, "step": 9950 }, { "epoch": 2.155377623890933, "grad_norm": 0.739081181426962, "learning_rate": 5e-06, "loss": 0.3773, "step": 9960 }, { "epoch": 2.157541657649859, "grad_norm": 0.7503458217565563, "learning_rate": 5e-06, "loss": 0.3712, "step": 9970 }, { "epoch": 2.159705691408786, "grad_norm": 0.7598931433539674, "learning_rate": 5e-06, "loss": 0.3844, "step": 9980 }, { "epoch": 2.161869725167713, "grad_norm": 0.7494704002208226, "learning_rate": 5e-06, "loss": 0.3895, "step": 9990 }, { "epoch": 2.164033758926639, "grad_norm": 0.7853450199962944, "learning_rate": 5e-06, "loss": 0.382, "step": 10000 }, { "epoch": 2.166197792685566, "grad_norm": 0.7578503693676106, "learning_rate": 5e-06, "loss": 0.383, "step": 10010 }, { "epoch": 2.1683618264444924, "grad_norm": 0.7484549962239924, "learning_rate": 5e-06, "loss": 0.3855, "step": 10020 }, { "epoch": 2.170525860203419, "grad_norm": 0.7614013434918991, "learning_rate": 5e-06, "loss": 0.3696, "step": 10030 }, { "epoch": 2.172689893962346, "grad_norm": 0.7392030074985807, "learning_rate": 5e-06, "loss": 0.3729, "step": 10040 }, { "epoch": 2.1748539277212724, "grad_norm": 0.7415769297940891, "learning_rate": 5e-06, "loss": 0.3767, "step": 10050 }, { "epoch": 2.177017961480199, "grad_norm": 0.744537273333133, "learning_rate": 5e-06, "loss": 0.376, "step": 10060 }, { "epoch": 2.1791819952391256, "grad_norm": 0.7295839586667225, "learning_rate": 5e-06, "loss": 0.3824, "step": 10070 }, { "epoch": 2.1813460289980524, "grad_norm": 0.749316196787019, "learning_rate": 5e-06, "loss": 0.385, "step": 10080 }, { "epoch": 2.1835100627569792, "grad_norm": 0.8144715088789032, "learning_rate": 5e-06, "loss": 0.3869, "step": 10090 }, { "epoch": 2.1856740965159056, "grad_norm": 0.7376466691462915, "learning_rate": 5e-06, "loss": 0.382, "step": 10100 }, { "epoch": 2.1878381302748324, "grad_norm": 0.7553394493000064, "learning_rate": 5e-06, "loss": 0.3831, "step": 10110 }, { "epoch": 2.190002164033759, "grad_norm": 0.7571271156204371, "learning_rate": 5e-06, "loss": 0.3851, "step": 10120 }, { "epoch": 2.1921661977926856, "grad_norm": 0.7574287608767191, "learning_rate": 5e-06, "loss": 0.3783, "step": 10130 }, { "epoch": 2.194330231551612, "grad_norm": 0.7883853379655534, "learning_rate": 5e-06, "loss": 0.3862, "step": 10140 }, { "epoch": 2.196494265310539, "grad_norm": 0.7491644336765493, "learning_rate": 5e-06, "loss": 0.3824, "step": 10150 }, { "epoch": 2.1986582990694656, "grad_norm": 0.8176175667235511, "learning_rate": 5e-06, "loss": 0.382, "step": 10160 }, { "epoch": 2.200822332828392, "grad_norm": 0.7397237145396862, "learning_rate": 5e-06, "loss": 0.3821, "step": 10170 }, { "epoch": 2.202986366587319, "grad_norm": 0.6965521397762487, "learning_rate": 5e-06, "loss": 0.3696, "step": 10180 }, { "epoch": 2.205150400346245, "grad_norm": 0.7532643826436776, "learning_rate": 5e-06, "loss": 0.3794, "step": 10190 }, { "epoch": 2.207314434105172, "grad_norm": 0.7654434113655321, "learning_rate": 5e-06, "loss": 0.3777, "step": 10200 }, { "epoch": 2.209478467864099, "grad_norm": 0.7712364055070057, "learning_rate": 5e-06, "loss": 0.3776, "step": 10210 }, { "epoch": 2.2116425016230252, "grad_norm": 0.731992016899879, "learning_rate": 5e-06, "loss": 0.3867, "step": 10220 }, { "epoch": 2.213806535381952, "grad_norm": 0.7463982586618078, "learning_rate": 5e-06, "loss": 0.3816, "step": 10230 }, { "epoch": 2.2159705691408784, "grad_norm": 0.7624502219381054, "learning_rate": 5e-06, "loss": 0.3764, "step": 10240 }, { "epoch": 2.2181346028998052, "grad_norm": 0.7686079427165029, "learning_rate": 5e-06, "loss": 0.3812, "step": 10250 }, { "epoch": 2.220298636658732, "grad_norm": 0.7572457495264667, "learning_rate": 5e-06, "loss": 0.3823, "step": 10260 }, { "epoch": 2.2224626704176584, "grad_norm": 0.8311579714029802, "learning_rate": 5e-06, "loss": 0.3894, "step": 10270 }, { "epoch": 2.2246267041765853, "grad_norm": 0.753836212037088, "learning_rate": 5e-06, "loss": 0.3812, "step": 10280 }, { "epoch": 2.2267907379355116, "grad_norm": 0.7777095091483516, "learning_rate": 5e-06, "loss": 0.3745, "step": 10290 }, { "epoch": 2.2289547716944385, "grad_norm": 0.7345531886014162, "learning_rate": 5e-06, "loss": 0.3814, "step": 10300 }, { "epoch": 2.2311188054533653, "grad_norm": 0.7492651509334102, "learning_rate": 5e-06, "loss": 0.3895, "step": 10310 }, { "epoch": 2.2332828392122916, "grad_norm": 0.7734399829479766, "learning_rate": 5e-06, "loss": 0.3899, "step": 10320 }, { "epoch": 2.2354468729712185, "grad_norm": 0.8170059562572255, "learning_rate": 5e-06, "loss": 0.3721, "step": 10330 }, { "epoch": 2.237610906730145, "grad_norm": 0.7418910648043867, "learning_rate": 5e-06, "loss": 0.3679, "step": 10340 }, { "epoch": 2.2397749404890717, "grad_norm": 0.7707008155582549, "learning_rate": 5e-06, "loss": 0.3785, "step": 10350 }, { "epoch": 2.241938974247998, "grad_norm": 0.7508948653468445, "learning_rate": 5e-06, "loss": 0.3858, "step": 10360 }, { "epoch": 2.244103008006925, "grad_norm": 0.7280052352796699, "learning_rate": 5e-06, "loss": 0.3812, "step": 10370 }, { "epoch": 2.2462670417658517, "grad_norm": 0.7361553110496917, "learning_rate": 5e-06, "loss": 0.3848, "step": 10380 }, { "epoch": 2.248431075524778, "grad_norm": 0.8110758388068879, "learning_rate": 5e-06, "loss": 0.3859, "step": 10390 }, { "epoch": 2.250595109283705, "grad_norm": 0.7585969507359466, "learning_rate": 5e-06, "loss": 0.37, "step": 10400 }, { "epoch": 2.2527591430426313, "grad_norm": 0.7530247958679976, "learning_rate": 5e-06, "loss": 0.387, "step": 10410 }, { "epoch": 2.254923176801558, "grad_norm": 0.7793241812111504, "learning_rate": 5e-06, "loss": 0.3785, "step": 10420 }, { "epoch": 2.257087210560485, "grad_norm": 0.7359589759600907, "learning_rate": 5e-06, "loss": 0.3818, "step": 10430 }, { "epoch": 2.2592512443194113, "grad_norm": 0.7074654913334124, "learning_rate": 5e-06, "loss": 0.3814, "step": 10440 }, { "epoch": 2.261415278078338, "grad_norm": 0.8029074288686185, "learning_rate": 5e-06, "loss": 0.3939, "step": 10450 }, { "epoch": 2.2635793118372645, "grad_norm": 0.7773602608387922, "learning_rate": 5e-06, "loss": 0.3882, "step": 10460 }, { "epoch": 2.2657433455961913, "grad_norm": 0.7803799314229609, "learning_rate": 5e-06, "loss": 0.3861, "step": 10470 }, { "epoch": 2.267907379355118, "grad_norm": 0.7317655840742071, "learning_rate": 5e-06, "loss": 0.3846, "step": 10480 }, { "epoch": 2.2700714131140445, "grad_norm": 0.7535304850406851, "learning_rate": 5e-06, "loss": 0.3842, "step": 10490 }, { "epoch": 2.2722354468729713, "grad_norm": 0.818389549198698, "learning_rate": 5e-06, "loss": 0.3864, "step": 10500 }, { "epoch": 2.2743994806318977, "grad_norm": 0.7972883371387612, "learning_rate": 5e-06, "loss": 0.3764, "step": 10510 }, { "epoch": 2.2765635143908245, "grad_norm": 0.7684174991828006, "learning_rate": 5e-06, "loss": 0.3923, "step": 10520 }, { "epoch": 2.2787275481497513, "grad_norm": 0.7272379216618123, "learning_rate": 5e-06, "loss": 0.3703, "step": 10530 }, { "epoch": 2.2808915819086777, "grad_norm": 0.7527601200579448, "learning_rate": 5e-06, "loss": 0.3822, "step": 10540 }, { "epoch": 2.2830556156676045, "grad_norm": 0.7661450606626169, "learning_rate": 5e-06, "loss": 0.3836, "step": 10550 }, { "epoch": 2.285219649426531, "grad_norm": 0.779962904036604, "learning_rate": 5e-06, "loss": 0.3833, "step": 10560 }, { "epoch": 2.2873836831854577, "grad_norm": 0.7826624741492424, "learning_rate": 5e-06, "loss": 0.3858, "step": 10570 }, { "epoch": 2.2895477169443845, "grad_norm": 0.745367739386281, "learning_rate": 5e-06, "loss": 0.3814, "step": 10580 }, { "epoch": 2.291711750703311, "grad_norm": 0.730641892603881, "learning_rate": 5e-06, "loss": 0.3772, "step": 10590 }, { "epoch": 2.2938757844622377, "grad_norm": 0.7669396242881692, "learning_rate": 5e-06, "loss": 0.3839, "step": 10600 }, { "epoch": 2.296039818221164, "grad_norm": 0.7319224793844378, "learning_rate": 5e-06, "loss": 0.3711, "step": 10610 }, { "epoch": 2.298203851980091, "grad_norm": 0.7954183700469217, "learning_rate": 5e-06, "loss": 0.3861, "step": 10620 }, { "epoch": 2.3003678857390177, "grad_norm": 0.7564733111942853, "learning_rate": 5e-06, "loss": 0.3716, "step": 10630 }, { "epoch": 2.302531919497944, "grad_norm": 0.7563469175859392, "learning_rate": 5e-06, "loss": 0.3817, "step": 10640 }, { "epoch": 2.304695953256871, "grad_norm": 0.740370173672384, "learning_rate": 5e-06, "loss": 0.3785, "step": 10650 }, { "epoch": 2.3068599870157973, "grad_norm": 0.8008691481523956, "learning_rate": 5e-06, "loss": 0.3869, "step": 10660 }, { "epoch": 2.309024020774724, "grad_norm": 0.757170756724089, "learning_rate": 5e-06, "loss": 0.3883, "step": 10670 }, { "epoch": 2.311188054533651, "grad_norm": 0.7445447389513807, "learning_rate": 5e-06, "loss": 0.3739, "step": 10680 }, { "epoch": 2.3133520882925773, "grad_norm": 0.7614800896552029, "learning_rate": 5e-06, "loss": 0.3835, "step": 10690 }, { "epoch": 2.315516122051504, "grad_norm": 0.7507557413618469, "learning_rate": 5e-06, "loss": 0.3776, "step": 10700 }, { "epoch": 2.3176801558104305, "grad_norm": 0.7721474736234565, "learning_rate": 5e-06, "loss": 0.3773, "step": 10710 }, { "epoch": 2.3198441895693573, "grad_norm": 0.7313755931401409, "learning_rate": 5e-06, "loss": 0.3895, "step": 10720 }, { "epoch": 2.322008223328284, "grad_norm": 0.7814799741219387, "learning_rate": 5e-06, "loss": 0.3889, "step": 10730 }, { "epoch": 2.3241722570872105, "grad_norm": 0.7774874848897156, "learning_rate": 5e-06, "loss": 0.3791, "step": 10740 }, { "epoch": 2.3263362908461374, "grad_norm": 0.7708201057745674, "learning_rate": 5e-06, "loss": 0.3888, "step": 10750 }, { "epoch": 2.3285003246050637, "grad_norm": 0.7317774627158627, "learning_rate": 5e-06, "loss": 0.3715, "step": 10760 }, { "epoch": 2.3306643583639906, "grad_norm": 0.7271265535921656, "learning_rate": 5e-06, "loss": 0.3776, "step": 10770 }, { "epoch": 2.332828392122917, "grad_norm": 0.8285018373906101, "learning_rate": 5e-06, "loss": 0.3891, "step": 10780 }, { "epoch": 2.3349924258818437, "grad_norm": 0.8019035513607989, "learning_rate": 5e-06, "loss": 0.3841, "step": 10790 }, { "epoch": 2.3371564596407706, "grad_norm": 0.7544365537673512, "learning_rate": 5e-06, "loss": 0.386, "step": 10800 }, { "epoch": 2.339320493399697, "grad_norm": 0.774623922213998, "learning_rate": 5e-06, "loss": 0.3872, "step": 10810 }, { "epoch": 2.3414845271586238, "grad_norm": 0.7287036153983298, "learning_rate": 5e-06, "loss": 0.3851, "step": 10820 }, { "epoch": 2.34364856091755, "grad_norm": 0.7831306302240653, "learning_rate": 5e-06, "loss": 0.3852, "step": 10830 }, { "epoch": 2.345812594676477, "grad_norm": 0.7455400541087714, "learning_rate": 5e-06, "loss": 0.3832, "step": 10840 }, { "epoch": 2.347976628435404, "grad_norm": 0.7911713615620226, "learning_rate": 5e-06, "loss": 0.3942, "step": 10850 }, { "epoch": 2.35014066219433, "grad_norm": 0.8028233557037938, "learning_rate": 5e-06, "loss": 0.388, "step": 10860 }, { "epoch": 2.352304695953257, "grad_norm": 0.7864179290759362, "learning_rate": 5e-06, "loss": 0.3824, "step": 10870 }, { "epoch": 2.3544687297121834, "grad_norm": 0.7554701896856139, "learning_rate": 5e-06, "loss": 0.393, "step": 10880 }, { "epoch": 2.35663276347111, "grad_norm": 0.723649579527871, "learning_rate": 5e-06, "loss": 0.3848, "step": 10890 }, { "epoch": 2.3587967972300365, "grad_norm": 0.7542043146491088, "learning_rate": 5e-06, "loss": 0.3772, "step": 10900 }, { "epoch": 2.3609608309889634, "grad_norm": 0.7831233351849556, "learning_rate": 5e-06, "loss": 0.3796, "step": 10910 }, { "epoch": 2.36312486474789, "grad_norm": 0.7789860410163189, "learning_rate": 5e-06, "loss": 0.3829, "step": 10920 }, { "epoch": 2.3652888985068166, "grad_norm": 0.7371403610126779, "learning_rate": 5e-06, "loss": 0.3893, "step": 10930 }, { "epoch": 2.3674529322657434, "grad_norm": 0.7725505898139313, "learning_rate": 5e-06, "loss": 0.3884, "step": 10940 }, { "epoch": 2.3696169660246698, "grad_norm": 0.7347104158979803, "learning_rate": 5e-06, "loss": 0.3905, "step": 10950 }, { "epoch": 2.3717809997835966, "grad_norm": 0.7657492814282947, "learning_rate": 5e-06, "loss": 0.3879, "step": 10960 }, { "epoch": 2.3739450335425234, "grad_norm": 0.7535828749091847, "learning_rate": 5e-06, "loss": 0.3875, "step": 10970 }, { "epoch": 2.3761090673014498, "grad_norm": 0.7577252254306687, "learning_rate": 5e-06, "loss": 0.3779, "step": 10980 }, { "epoch": 2.3782731010603766, "grad_norm": 0.754449061571296, "learning_rate": 5e-06, "loss": 0.384, "step": 10990 }, { "epoch": 2.380437134819303, "grad_norm": 0.7286408029795997, "learning_rate": 5e-06, "loss": 0.3812, "step": 11000 }, { "epoch": 2.38260116857823, "grad_norm": 0.7630893647158181, "learning_rate": 5e-06, "loss": 0.381, "step": 11010 }, { "epoch": 2.3847652023371566, "grad_norm": 0.7962306496261992, "learning_rate": 5e-06, "loss": 0.3876, "step": 11020 }, { "epoch": 2.386929236096083, "grad_norm": 0.7677708510965319, "learning_rate": 5e-06, "loss": 0.3845, "step": 11030 }, { "epoch": 2.38909326985501, "grad_norm": 0.7570281432515957, "learning_rate": 5e-06, "loss": 0.3867, "step": 11040 }, { "epoch": 2.391257303613936, "grad_norm": 0.8132940105852546, "learning_rate": 5e-06, "loss": 0.3782, "step": 11050 }, { "epoch": 2.393421337372863, "grad_norm": 0.739800914008478, "learning_rate": 5e-06, "loss": 0.3823, "step": 11060 }, { "epoch": 2.39558537113179, "grad_norm": 0.7714600709258795, "learning_rate": 5e-06, "loss": 0.3934, "step": 11070 }, { "epoch": 2.397749404890716, "grad_norm": 0.8296198735762226, "learning_rate": 5e-06, "loss": 0.3737, "step": 11080 }, { "epoch": 2.399913438649643, "grad_norm": 0.7783027532344778, "learning_rate": 5e-06, "loss": 0.3898, "step": 11090 }, { "epoch": 2.4020774724085694, "grad_norm": 0.7605774127832313, "learning_rate": 5e-06, "loss": 0.3786, "step": 11100 }, { "epoch": 2.404241506167496, "grad_norm": 0.7161203998165129, "learning_rate": 5e-06, "loss": 0.3729, "step": 11110 }, { "epoch": 2.406405539926423, "grad_norm": 0.8001326738731909, "learning_rate": 5e-06, "loss": 0.3895, "step": 11120 }, { "epoch": 2.4085695736853494, "grad_norm": 0.7852408542137601, "learning_rate": 5e-06, "loss": 0.3921, "step": 11130 }, { "epoch": 2.4107336074442762, "grad_norm": 0.7641378077706924, "learning_rate": 5e-06, "loss": 0.3881, "step": 11140 }, { "epoch": 2.4128976412032026, "grad_norm": 0.7369344114660852, "learning_rate": 5e-06, "loss": 0.3848, "step": 11150 }, { "epoch": 2.4150616749621294, "grad_norm": 0.7728142277616102, "learning_rate": 5e-06, "loss": 0.388, "step": 11160 }, { "epoch": 2.4172257087210562, "grad_norm": 0.7938596540542148, "learning_rate": 5e-06, "loss": 0.3754, "step": 11170 }, { "epoch": 2.4193897424799826, "grad_norm": 0.7277868215488438, "learning_rate": 5e-06, "loss": 0.3803, "step": 11180 }, { "epoch": 2.4215537762389094, "grad_norm": 0.7682514933014444, "learning_rate": 5e-06, "loss": 0.3789, "step": 11190 }, { "epoch": 2.423717809997836, "grad_norm": 0.769584672945386, "learning_rate": 5e-06, "loss": 0.3844, "step": 11200 }, { "epoch": 2.4258818437567626, "grad_norm": 0.7295995976083359, "learning_rate": 5e-06, "loss": 0.3795, "step": 11210 }, { "epoch": 2.4280458775156895, "grad_norm": 0.7547865237203139, "learning_rate": 5e-06, "loss": 0.3828, "step": 11220 }, { "epoch": 2.430209911274616, "grad_norm": 0.8150532849289923, "learning_rate": 5e-06, "loss": 0.3858, "step": 11230 }, { "epoch": 2.4323739450335427, "grad_norm": 0.8095778619727545, "learning_rate": 5e-06, "loss": 0.3924, "step": 11240 }, { "epoch": 2.434537978792469, "grad_norm": 0.7612807753325169, "learning_rate": 5e-06, "loss": 0.3825, "step": 11250 }, { "epoch": 2.436702012551396, "grad_norm": 0.84684876401696, "learning_rate": 5e-06, "loss": 0.3912, "step": 11260 }, { "epoch": 2.4388660463103227, "grad_norm": 0.7505318974427521, "learning_rate": 5e-06, "loss": 0.3821, "step": 11270 }, { "epoch": 2.441030080069249, "grad_norm": 0.7463667758343082, "learning_rate": 5e-06, "loss": 0.3779, "step": 11280 }, { "epoch": 2.443194113828176, "grad_norm": 0.7930997256650184, "learning_rate": 5e-06, "loss": 0.3824, "step": 11290 }, { "epoch": 2.4453581475871022, "grad_norm": 0.7232751240140999, "learning_rate": 5e-06, "loss": 0.3712, "step": 11300 }, { "epoch": 2.447522181346029, "grad_norm": 0.7452902861534266, "learning_rate": 5e-06, "loss": 0.3829, "step": 11310 }, { "epoch": 2.449686215104956, "grad_norm": 0.7894533038001047, "learning_rate": 5e-06, "loss": 0.3901, "step": 11320 }, { "epoch": 2.4518502488638823, "grad_norm": 0.7822422085940205, "learning_rate": 5e-06, "loss": 0.3894, "step": 11330 }, { "epoch": 2.454014282622809, "grad_norm": 0.7790104348646221, "learning_rate": 5e-06, "loss": 0.3772, "step": 11340 }, { "epoch": 2.4561783163817354, "grad_norm": 0.7674597626994477, "learning_rate": 5e-06, "loss": 0.3952, "step": 11350 }, { "epoch": 2.4583423501406623, "grad_norm": 0.7389051490728133, "learning_rate": 5e-06, "loss": 0.3886, "step": 11360 }, { "epoch": 2.4605063838995886, "grad_norm": 0.7648071653406447, "learning_rate": 5e-06, "loss": 0.3785, "step": 11370 }, { "epoch": 2.4626704176585155, "grad_norm": 0.7485263164164858, "learning_rate": 5e-06, "loss": 0.3834, "step": 11380 }, { "epoch": 2.4648344514174423, "grad_norm": 0.7571813819723854, "learning_rate": 5e-06, "loss": 0.3891, "step": 11390 }, { "epoch": 2.4669984851763687, "grad_norm": 0.79013779876267, "learning_rate": 5e-06, "loss": 0.3787, "step": 11400 }, { "epoch": 2.4691625189352955, "grad_norm": 0.7899502403657093, "learning_rate": 5e-06, "loss": 0.3792, "step": 11410 }, { "epoch": 2.471326552694222, "grad_norm": 0.79065022776973, "learning_rate": 5e-06, "loss": 0.3849, "step": 11420 }, { "epoch": 2.4734905864531487, "grad_norm": 0.7579456308475134, "learning_rate": 5e-06, "loss": 0.3906, "step": 11430 }, { "epoch": 2.4756546202120755, "grad_norm": 0.7684812839256481, "learning_rate": 5e-06, "loss": 0.3846, "step": 11440 }, { "epoch": 2.477818653971002, "grad_norm": 0.7541428841884413, "learning_rate": 5e-06, "loss": 0.3949, "step": 11450 }, { "epoch": 2.4799826877299287, "grad_norm": 0.773906564919006, "learning_rate": 5e-06, "loss": 0.3906, "step": 11460 }, { "epoch": 2.482146721488855, "grad_norm": 0.8020255649891496, "learning_rate": 5e-06, "loss": 0.3925, "step": 11470 }, { "epoch": 2.484310755247782, "grad_norm": 0.7615441009856345, "learning_rate": 5e-06, "loss": 0.4048, "step": 11480 }, { "epoch": 2.4864747890067083, "grad_norm": 0.7653863910560466, "learning_rate": 5e-06, "loss": 0.3849, "step": 11490 }, { "epoch": 2.488638822765635, "grad_norm": 0.7982102560926431, "learning_rate": 5e-06, "loss": 0.3791, "step": 11500 }, { "epoch": 2.490802856524562, "grad_norm": 0.7646849476306503, "learning_rate": 5e-06, "loss": 0.3915, "step": 11510 }, { "epoch": 2.4929668902834883, "grad_norm": 0.7926740161568403, "learning_rate": 5e-06, "loss": 0.3847, "step": 11520 }, { "epoch": 2.495130924042415, "grad_norm": 0.7857256749815332, "learning_rate": 5e-06, "loss": 0.3901, "step": 11530 }, { "epoch": 2.4972949578013415, "grad_norm": 0.7811301407036569, "learning_rate": 5e-06, "loss": 0.3822, "step": 11540 }, { "epoch": 2.4994589915602683, "grad_norm": 0.7612258678572895, "learning_rate": 5e-06, "loss": 0.3873, "step": 11550 }, { "epoch": 2.501623025319195, "grad_norm": 0.7510835321762522, "learning_rate": 5e-06, "loss": 0.3907, "step": 11560 }, { "epoch": 2.5037870590781215, "grad_norm": 0.7672276862765546, "learning_rate": 5e-06, "loss": 0.3828, "step": 11570 }, { "epoch": 2.5059510928370483, "grad_norm": 0.7628691005049446, "learning_rate": 5e-06, "loss": 0.3837, "step": 11580 }, { "epoch": 2.5081151265959747, "grad_norm": 0.7471986630619987, "learning_rate": 5e-06, "loss": 0.3863, "step": 11590 }, { "epoch": 2.5102791603549015, "grad_norm": 0.7706154307172854, "learning_rate": 5e-06, "loss": 0.3896, "step": 11600 }, { "epoch": 2.5124431941138283, "grad_norm": 0.7343855290129636, "learning_rate": 5e-06, "loss": 0.387, "step": 11610 }, { "epoch": 2.5146072278727547, "grad_norm": 0.7503993070592432, "learning_rate": 5e-06, "loss": 0.3947, "step": 11620 }, { "epoch": 2.5167712616316815, "grad_norm": 0.7894715279800384, "learning_rate": 5e-06, "loss": 0.3896, "step": 11630 }, { "epoch": 2.518935295390608, "grad_norm": 0.7565826940704703, "learning_rate": 5e-06, "loss": 0.3873, "step": 11640 }, { "epoch": 2.5210993291495347, "grad_norm": 0.7680776687871832, "learning_rate": 5e-06, "loss": 0.3938, "step": 11650 }, { "epoch": 2.5232633629084615, "grad_norm": 0.7896385741117732, "learning_rate": 5e-06, "loss": 0.379, "step": 11660 }, { "epoch": 2.525427396667388, "grad_norm": 0.7482445016179938, "learning_rate": 5e-06, "loss": 0.387, "step": 11670 }, { "epoch": 2.5275914304263147, "grad_norm": 0.7788787667528324, "learning_rate": 5e-06, "loss": 0.3912, "step": 11680 }, { "epoch": 2.529755464185241, "grad_norm": 0.7802367273135542, "learning_rate": 5e-06, "loss": 0.391, "step": 11690 }, { "epoch": 2.531919497944168, "grad_norm": 0.7907791607502596, "learning_rate": 5e-06, "loss": 0.3889, "step": 11700 }, { "epoch": 2.5340835317030947, "grad_norm": 0.735159844361493, "learning_rate": 5e-06, "loss": 0.3866, "step": 11710 }, { "epoch": 2.536247565462021, "grad_norm": 0.8002646416305854, "learning_rate": 5e-06, "loss": 0.3797, "step": 11720 }, { "epoch": 2.538411599220948, "grad_norm": 0.744137757018372, "learning_rate": 5e-06, "loss": 0.3851, "step": 11730 }, { "epoch": 2.5405756329798743, "grad_norm": 0.8089884009703747, "learning_rate": 5e-06, "loss": 0.3876, "step": 11740 }, { "epoch": 2.542739666738801, "grad_norm": 0.7409059443870979, "learning_rate": 5e-06, "loss": 0.3817, "step": 11750 }, { "epoch": 2.544903700497728, "grad_norm": 0.7474137046476967, "learning_rate": 5e-06, "loss": 0.3809, "step": 11760 }, { "epoch": 2.5470677342566543, "grad_norm": 0.7328688526405038, "learning_rate": 5e-06, "loss": 0.3847, "step": 11770 }, { "epoch": 2.549231768015581, "grad_norm": 0.7309917622922488, "learning_rate": 5e-06, "loss": 0.3821, "step": 11780 }, { "epoch": 2.5513958017745075, "grad_norm": 0.760296512703151, "learning_rate": 5e-06, "loss": 0.3847, "step": 11790 }, { "epoch": 2.5535598355334344, "grad_norm": 0.7675760208277899, "learning_rate": 5e-06, "loss": 0.3784, "step": 11800 }, { "epoch": 2.555723869292361, "grad_norm": 0.7239541839213967, "learning_rate": 5e-06, "loss": 0.3893, "step": 11810 }, { "epoch": 2.5578879030512875, "grad_norm": 0.7575635250945256, "learning_rate": 5e-06, "loss": 0.3866, "step": 11820 }, { "epoch": 2.5600519368102144, "grad_norm": 0.8205016916935968, "learning_rate": 5e-06, "loss": 0.3884, "step": 11830 }, { "epoch": 2.5622159705691407, "grad_norm": 0.7630887493655477, "learning_rate": 5e-06, "loss": 0.3822, "step": 11840 }, { "epoch": 2.5643800043280676, "grad_norm": 0.7483344691244171, "learning_rate": 5e-06, "loss": 0.3861, "step": 11850 }, { "epoch": 2.5665440380869944, "grad_norm": 0.7836573802364927, "learning_rate": 5e-06, "loss": 0.3859, "step": 11860 }, { "epoch": 2.5687080718459208, "grad_norm": 0.7758135442923644, "learning_rate": 5e-06, "loss": 0.389, "step": 11870 }, { "epoch": 2.5708721056048476, "grad_norm": 0.8156880128058609, "learning_rate": 5e-06, "loss": 0.3815, "step": 11880 }, { "epoch": 2.573036139363774, "grad_norm": 0.769477713621125, "learning_rate": 5e-06, "loss": 0.3887, "step": 11890 }, { "epoch": 2.5752001731227008, "grad_norm": 0.7539503843193486, "learning_rate": 5e-06, "loss": 0.3746, "step": 11900 }, { "epoch": 2.5773642068816276, "grad_norm": 0.7665614539752887, "learning_rate": 5e-06, "loss": 0.383, "step": 11910 }, { "epoch": 2.579528240640554, "grad_norm": 0.78863002604713, "learning_rate": 5e-06, "loss": 0.3932, "step": 11920 }, { "epoch": 2.581692274399481, "grad_norm": 0.7860997141249391, "learning_rate": 5e-06, "loss": 0.3822, "step": 11930 }, { "epoch": 2.583856308158407, "grad_norm": 0.7450886855994443, "learning_rate": 5e-06, "loss": 0.3847, "step": 11940 }, { "epoch": 2.586020341917334, "grad_norm": 0.7184291709731961, "learning_rate": 5e-06, "loss": 0.4006, "step": 11950 }, { "epoch": 2.588184375676261, "grad_norm": 0.7395095877340896, "learning_rate": 5e-06, "loss": 0.3805, "step": 11960 }, { "epoch": 2.590348409435187, "grad_norm": 0.7326541647113222, "learning_rate": 5e-06, "loss": 0.3926, "step": 11970 }, { "epoch": 2.5925124431941136, "grad_norm": 0.7590091456087876, "learning_rate": 5e-06, "loss": 0.39, "step": 11980 }, { "epoch": 2.5946764769530404, "grad_norm": 0.7514373484529209, "learning_rate": 5e-06, "loss": 0.3805, "step": 11990 }, { "epoch": 2.596840510711967, "grad_norm": 0.7974717819972812, "learning_rate": 5e-06, "loss": 0.3814, "step": 12000 }, { "epoch": 2.599004544470894, "grad_norm": 0.7663442947904777, "learning_rate": 5e-06, "loss": 0.3919, "step": 12010 }, { "epoch": 2.6011685782298204, "grad_norm": 0.7869861761124946, "learning_rate": 5e-06, "loss": 0.3883, "step": 12020 }, { "epoch": 2.6033326119887468, "grad_norm": 0.7503773304052421, "learning_rate": 5e-06, "loss": 0.3877, "step": 12030 }, { "epoch": 2.6054966457476736, "grad_norm": 0.776303816000796, "learning_rate": 5e-06, "loss": 0.3985, "step": 12040 }, { "epoch": 2.6076606795066004, "grad_norm": 0.759982286533669, "learning_rate": 5e-06, "loss": 0.3899, "step": 12050 }, { "epoch": 2.609824713265527, "grad_norm": 0.8131552609328792, "learning_rate": 5e-06, "loss": 0.3884, "step": 12060 }, { "epoch": 2.6119887470244536, "grad_norm": 0.7476915162359377, "learning_rate": 5e-06, "loss": 0.3857, "step": 12070 }, { "epoch": 2.61415278078338, "grad_norm": 0.8030188195536405, "learning_rate": 5e-06, "loss": 0.3848, "step": 12080 }, { "epoch": 2.616316814542307, "grad_norm": 0.7550164235383711, "learning_rate": 5e-06, "loss": 0.3903, "step": 12090 }, { "epoch": 2.6184808483012336, "grad_norm": 0.759412823543429, "learning_rate": 5e-06, "loss": 0.3832, "step": 12100 }, { "epoch": 2.62064488206016, "grad_norm": 0.7595851928907493, "learning_rate": 5e-06, "loss": 0.3882, "step": 12110 }, { "epoch": 2.622808915819087, "grad_norm": 0.7557311993407309, "learning_rate": 5e-06, "loss": 0.3927, "step": 12120 }, { "epoch": 2.624972949578013, "grad_norm": 0.7935821634398855, "learning_rate": 5e-06, "loss": 0.3848, "step": 12130 }, { "epoch": 2.62713698333694, "grad_norm": 0.7407643308725581, "learning_rate": 5e-06, "loss": 0.3871, "step": 12140 }, { "epoch": 2.629301017095867, "grad_norm": 0.729366375580239, "learning_rate": 5e-06, "loss": 0.3873, "step": 12150 }, { "epoch": 2.631465050854793, "grad_norm": 0.7561366246431421, "learning_rate": 5e-06, "loss": 0.3822, "step": 12160 }, { "epoch": 2.63362908461372, "grad_norm": 0.7667634155030516, "learning_rate": 5e-06, "loss": 0.3809, "step": 12170 }, { "epoch": 2.6357931183726464, "grad_norm": 0.7885773621343677, "learning_rate": 5e-06, "loss": 0.3817, "step": 12180 }, { "epoch": 2.6379571521315732, "grad_norm": 0.7440441002401869, "learning_rate": 5e-06, "loss": 0.3987, "step": 12190 }, { "epoch": 2.6401211858905, "grad_norm": 0.7810250085742799, "learning_rate": 5e-06, "loss": 0.3782, "step": 12200 }, { "epoch": 2.6422852196494264, "grad_norm": 0.8150657736573159, "learning_rate": 5e-06, "loss": 0.3896, "step": 12210 }, { "epoch": 2.6444492534083532, "grad_norm": 0.7279096250776349, "learning_rate": 5e-06, "loss": 0.3821, "step": 12220 }, { "epoch": 2.6466132871672796, "grad_norm": 0.766036376483687, "learning_rate": 5e-06, "loss": 0.3853, "step": 12230 }, { "epoch": 2.6487773209262064, "grad_norm": 0.7629756533767561, "learning_rate": 5e-06, "loss": 0.3848, "step": 12240 }, { "epoch": 2.6509413546851333, "grad_norm": 0.7258995252608367, "learning_rate": 5e-06, "loss": 0.3873, "step": 12250 }, { "epoch": 2.6531053884440596, "grad_norm": 0.7487644012773005, "learning_rate": 5e-06, "loss": 0.389, "step": 12260 }, { "epoch": 2.6552694222029865, "grad_norm": 0.7730940322471979, "learning_rate": 5e-06, "loss": 0.3919, "step": 12270 }, { "epoch": 2.657433455961913, "grad_norm": 0.7261832821091123, "learning_rate": 5e-06, "loss": 0.3772, "step": 12280 }, { "epoch": 2.6595974897208396, "grad_norm": 0.7841634979405185, "learning_rate": 5e-06, "loss": 0.3955, "step": 12290 }, { "epoch": 2.6617615234797665, "grad_norm": 0.7443154389593502, "learning_rate": 5e-06, "loss": 0.3998, "step": 12300 }, { "epoch": 2.663925557238693, "grad_norm": 0.792617400623899, "learning_rate": 5e-06, "loss": 0.3838, "step": 12310 }, { "epoch": 2.6660895909976197, "grad_norm": 0.7430565465776605, "learning_rate": 5e-06, "loss": 0.3926, "step": 12320 }, { "epoch": 2.668253624756546, "grad_norm": 0.7601315336074875, "learning_rate": 5e-06, "loss": 0.3935, "step": 12330 }, { "epoch": 2.670417658515473, "grad_norm": 0.7349512479796254, "learning_rate": 5e-06, "loss": 0.386, "step": 12340 }, { "epoch": 2.6725816922743997, "grad_norm": 0.7492793091950313, "learning_rate": 5e-06, "loss": 0.3939, "step": 12350 }, { "epoch": 2.674745726033326, "grad_norm": 0.8020327135634121, "learning_rate": 5e-06, "loss": 0.3933, "step": 12360 }, { "epoch": 2.676909759792253, "grad_norm": 0.7821248575887879, "learning_rate": 5e-06, "loss": 0.3889, "step": 12370 }, { "epoch": 2.6790737935511793, "grad_norm": 0.7764574267508418, "learning_rate": 5e-06, "loss": 0.3834, "step": 12380 }, { "epoch": 2.681237827310106, "grad_norm": 0.7581668979481178, "learning_rate": 5e-06, "loss": 0.3894, "step": 12390 }, { "epoch": 2.683401861069033, "grad_norm": 0.7312153550603386, "learning_rate": 5e-06, "loss": 0.3871, "step": 12400 }, { "epoch": 2.6855658948279593, "grad_norm": 0.8290124241375181, "learning_rate": 5e-06, "loss": 0.3778, "step": 12410 }, { "epoch": 2.687729928586886, "grad_norm": 0.7317732999168243, "learning_rate": 5e-06, "loss": 0.3863, "step": 12420 }, { "epoch": 2.6898939623458125, "grad_norm": 0.7638469327765998, "learning_rate": 5e-06, "loss": 0.3829, "step": 12430 }, { "epoch": 2.6920579961047393, "grad_norm": 0.7949708235476729, "learning_rate": 5e-06, "loss": 0.3926, "step": 12440 }, { "epoch": 2.694222029863666, "grad_norm": 0.7550935663260143, "learning_rate": 5e-06, "loss": 0.3783, "step": 12450 }, { "epoch": 2.6963860636225925, "grad_norm": 0.8313055045861929, "learning_rate": 5e-06, "loss": 0.3817, "step": 12460 }, { "epoch": 2.6985500973815193, "grad_norm": 0.8005354523317457, "learning_rate": 5e-06, "loss": 0.3868, "step": 12470 }, { "epoch": 2.7007141311404457, "grad_norm": 0.7670070573325029, "learning_rate": 5e-06, "loss": 0.3815, "step": 12480 }, { "epoch": 2.7028781648993725, "grad_norm": 0.7508083437598002, "learning_rate": 5e-06, "loss": 0.3882, "step": 12490 }, { "epoch": 2.7050421986582993, "grad_norm": 0.7787188976232684, "learning_rate": 5e-06, "loss": 0.3872, "step": 12500 }, { "epoch": 2.7072062324172257, "grad_norm": 0.7683661547819274, "learning_rate": 5e-06, "loss": 0.3849, "step": 12510 }, { "epoch": 2.7093702661761525, "grad_norm": 0.7391247866655715, "learning_rate": 5e-06, "loss": 0.3806, "step": 12520 }, { "epoch": 2.711534299935079, "grad_norm": 0.7565014644999606, "learning_rate": 5e-06, "loss": 0.3733, "step": 12530 }, { "epoch": 2.7136983336940057, "grad_norm": 0.8085171262046021, "learning_rate": 5e-06, "loss": 0.3737, "step": 12540 }, { "epoch": 2.7158623674529325, "grad_norm": 0.7858515305447751, "learning_rate": 5e-06, "loss": 0.3875, "step": 12550 }, { "epoch": 2.718026401211859, "grad_norm": 0.7365212017152488, "learning_rate": 5e-06, "loss": 0.3908, "step": 12560 }, { "epoch": 2.7201904349707853, "grad_norm": 0.7744330424464411, "learning_rate": 5e-06, "loss": 0.385, "step": 12570 }, { "epoch": 2.722354468729712, "grad_norm": 0.7564513841846556, "learning_rate": 5e-06, "loss": 0.3867, "step": 12580 }, { "epoch": 2.724518502488639, "grad_norm": 0.7750108310504817, "learning_rate": 5e-06, "loss": 0.3892, "step": 12590 }, { "epoch": 2.7266825362475657, "grad_norm": 0.7782061698199593, "learning_rate": 5e-06, "loss": 0.3952, "step": 12600 }, { "epoch": 2.728846570006492, "grad_norm": 0.7822075760319362, "learning_rate": 5e-06, "loss": 0.3981, "step": 12610 }, { "epoch": 2.7310106037654185, "grad_norm": 0.7764669704006216, "learning_rate": 5e-06, "loss": 0.3873, "step": 12620 }, { "epoch": 2.7331746375243453, "grad_norm": 0.746571146392982, "learning_rate": 5e-06, "loss": 0.3857, "step": 12630 }, { "epoch": 2.735338671283272, "grad_norm": 0.7727176603555808, "learning_rate": 5e-06, "loss": 0.3849, "step": 12640 }, { "epoch": 2.7375027050421985, "grad_norm": 0.7716153482496758, "learning_rate": 5e-06, "loss": 0.3942, "step": 12650 }, { "epoch": 2.7396667388011253, "grad_norm": 0.7506221057021802, "learning_rate": 5e-06, "loss": 0.3828, "step": 12660 }, { "epoch": 2.7418307725600517, "grad_norm": 0.765605207662551, "learning_rate": 5e-06, "loss": 0.3896, "step": 12670 }, { "epoch": 2.7439948063189785, "grad_norm": 0.7936436590710892, "learning_rate": 5e-06, "loss": 0.384, "step": 12680 }, { "epoch": 2.7461588400779053, "grad_norm": 0.7417315221453376, "learning_rate": 5e-06, "loss": 0.3864, "step": 12690 }, { "epoch": 2.7483228738368317, "grad_norm": 0.7638299083549244, "learning_rate": 5e-06, "loss": 0.3882, "step": 12700 }, { "epoch": 2.7504869075957585, "grad_norm": 0.7401370791792725, "learning_rate": 5e-06, "loss": 0.3736, "step": 12710 }, { "epoch": 2.752650941354685, "grad_norm": 0.7604282539682512, "learning_rate": 5e-06, "loss": 0.386, "step": 12720 }, { "epoch": 2.7548149751136117, "grad_norm": 0.7593327654847288, "learning_rate": 5e-06, "loss": 0.3874, "step": 12730 }, { "epoch": 2.7569790088725386, "grad_norm": 0.762010718640477, "learning_rate": 5e-06, "loss": 0.3728, "step": 12740 }, { "epoch": 2.759143042631465, "grad_norm": 0.7464411042463334, "learning_rate": 5e-06, "loss": 0.3853, "step": 12750 }, { "epoch": 2.7613070763903917, "grad_norm": 0.7527404376235197, "learning_rate": 5e-06, "loss": 0.3805, "step": 12760 }, { "epoch": 2.763471110149318, "grad_norm": 0.7413553870174003, "learning_rate": 5e-06, "loss": 0.3827, "step": 12770 }, { "epoch": 2.765635143908245, "grad_norm": 0.8112331079474291, "learning_rate": 5e-06, "loss": 0.3912, "step": 12780 }, { "epoch": 2.7677991776671718, "grad_norm": 0.7606524264570311, "learning_rate": 5e-06, "loss": 0.384, "step": 12790 }, { "epoch": 2.769963211426098, "grad_norm": 0.7688787180412004, "learning_rate": 5e-06, "loss": 0.386, "step": 12800 }, { "epoch": 2.772127245185025, "grad_norm": 0.7709908015790655, "learning_rate": 5e-06, "loss": 0.3911, "step": 12810 }, { "epoch": 2.7742912789439513, "grad_norm": 0.8256088144670327, "learning_rate": 5e-06, "loss": 0.394, "step": 12820 }, { "epoch": 2.776455312702878, "grad_norm": 0.76732285875986, "learning_rate": 5e-06, "loss": 0.3825, "step": 12830 }, { "epoch": 2.778619346461805, "grad_norm": 0.7623048216172926, "learning_rate": 5e-06, "loss": 0.3812, "step": 12840 }, { "epoch": 2.7807833802207313, "grad_norm": 0.7770120662585185, "learning_rate": 5e-06, "loss": 0.3825, "step": 12850 }, { "epoch": 2.782947413979658, "grad_norm": 0.7659653162936821, "learning_rate": 5e-06, "loss": 0.3802, "step": 12860 }, { "epoch": 2.7851114477385845, "grad_norm": 0.7708274852623006, "learning_rate": 5e-06, "loss": 0.4, "step": 12870 }, { "epoch": 2.7872754814975114, "grad_norm": 0.7743267205110804, "learning_rate": 5e-06, "loss": 0.3833, "step": 12880 }, { "epoch": 2.789439515256438, "grad_norm": 0.7749387611421825, "learning_rate": 5e-06, "loss": 0.3953, "step": 12890 }, { "epoch": 2.7916035490153646, "grad_norm": 0.7586126347403827, "learning_rate": 5e-06, "loss": 0.3853, "step": 12900 }, { "epoch": 2.7937675827742914, "grad_norm": 0.7500835141793593, "learning_rate": 5e-06, "loss": 0.3929, "step": 12910 }, { "epoch": 2.7959316165332178, "grad_norm": 0.74663392656643, "learning_rate": 5e-06, "loss": 0.3795, "step": 12920 }, { "epoch": 2.7980956502921446, "grad_norm": 0.7279989402913795, "learning_rate": 5e-06, "loss": 0.3934, "step": 12930 }, { "epoch": 2.8002596840510714, "grad_norm": 0.782757718266062, "learning_rate": 5e-06, "loss": 0.3902, "step": 12940 }, { "epoch": 2.8024237178099978, "grad_norm": 0.7541794890470066, "learning_rate": 5e-06, "loss": 0.3905, "step": 12950 }, { "epoch": 2.8045877515689246, "grad_norm": 0.7511833194905094, "learning_rate": 5e-06, "loss": 0.3803, "step": 12960 }, { "epoch": 2.806751785327851, "grad_norm": 0.7345651454318538, "learning_rate": 5e-06, "loss": 0.3838, "step": 12970 }, { "epoch": 2.808915819086778, "grad_norm": 0.7430756885822685, "learning_rate": 5e-06, "loss": 0.375, "step": 12980 }, { "epoch": 2.8110798528457046, "grad_norm": 0.7503404811162424, "learning_rate": 5e-06, "loss": 0.3899, "step": 12990 }, { "epoch": 2.813243886604631, "grad_norm": 0.7839538272866499, "learning_rate": 5e-06, "loss": 0.4, "step": 13000 }, { "epoch": 2.815407920363558, "grad_norm": 0.8075664166536066, "learning_rate": 5e-06, "loss": 0.3869, "step": 13010 }, { "epoch": 2.817571954122484, "grad_norm": 0.8026133483602639, "learning_rate": 5e-06, "loss": 0.3957, "step": 13020 }, { "epoch": 2.819735987881411, "grad_norm": 0.744545263944405, "learning_rate": 5e-06, "loss": 0.386, "step": 13030 }, { "epoch": 2.821900021640338, "grad_norm": 0.7535839428276422, "learning_rate": 5e-06, "loss": 0.3808, "step": 13040 }, { "epoch": 2.824064055399264, "grad_norm": 0.7957853276701218, "learning_rate": 5e-06, "loss": 0.3793, "step": 13050 }, { "epoch": 2.826228089158191, "grad_norm": 0.7665463998428926, "learning_rate": 5e-06, "loss": 0.3817, "step": 13060 }, { "epoch": 2.8283921229171174, "grad_norm": 0.7747068764554864, "learning_rate": 5e-06, "loss": 0.3899, "step": 13070 }, { "epoch": 2.830556156676044, "grad_norm": 0.7529729147926931, "learning_rate": 5e-06, "loss": 0.3986, "step": 13080 }, { "epoch": 2.832720190434971, "grad_norm": 0.7714949651199653, "learning_rate": 5e-06, "loss": 0.3849, "step": 13090 }, { "epoch": 2.8348842241938974, "grad_norm": 0.757669054423436, "learning_rate": 5e-06, "loss": 0.3973, "step": 13100 }, { "epoch": 2.8370482579528242, "grad_norm": 0.7109714152621212, "learning_rate": 5e-06, "loss": 0.3951, "step": 13110 }, { "epoch": 2.8392122917117506, "grad_norm": 0.7676176881192389, "learning_rate": 5e-06, "loss": 0.3878, "step": 13120 }, { "epoch": 2.8413763254706774, "grad_norm": 0.7850663677878515, "learning_rate": 5e-06, "loss": 0.3932, "step": 13130 }, { "epoch": 2.8435403592296042, "grad_norm": 0.7225327094498046, "learning_rate": 5e-06, "loss": 0.3831, "step": 13140 }, { "epoch": 2.8457043929885306, "grad_norm": 0.7817333715491703, "learning_rate": 5e-06, "loss": 0.3895, "step": 13150 }, { "epoch": 2.847868426747457, "grad_norm": 0.7709262647174362, "learning_rate": 5e-06, "loss": 0.3903, "step": 13160 }, { "epoch": 2.850032460506384, "grad_norm": 0.7521690368016893, "learning_rate": 5e-06, "loss": 0.3841, "step": 13170 }, { "epoch": 2.8521964942653106, "grad_norm": 0.7654288882047556, "learning_rate": 5e-06, "loss": 0.3965, "step": 13180 }, { "epoch": 2.8543605280242375, "grad_norm": 0.7797805693245492, "learning_rate": 5e-06, "loss": 0.3882, "step": 13190 }, { "epoch": 2.856524561783164, "grad_norm": 0.767159401606924, "learning_rate": 5e-06, "loss": 0.3967, "step": 13200 }, { "epoch": 2.85868859554209, "grad_norm": 0.7700593597456216, "learning_rate": 5e-06, "loss": 0.3818, "step": 13210 }, { "epoch": 2.860852629301017, "grad_norm": 0.7699670336942503, "learning_rate": 5e-06, "loss": 0.3981, "step": 13220 }, { "epoch": 2.863016663059944, "grad_norm": 0.7532067312686156, "learning_rate": 5e-06, "loss": 0.3845, "step": 13230 }, { "epoch": 2.86518069681887, "grad_norm": 0.7700933480079477, "learning_rate": 5e-06, "loss": 0.3916, "step": 13240 }, { "epoch": 2.867344730577797, "grad_norm": 0.7466578230564417, "learning_rate": 5e-06, "loss": 0.3894, "step": 13250 }, { "epoch": 2.8695087643367234, "grad_norm": 0.7937501563778528, "learning_rate": 5e-06, "loss": 0.3816, "step": 13260 }, { "epoch": 2.8716727980956502, "grad_norm": 0.7389001187669237, "learning_rate": 5e-06, "loss": 0.3728, "step": 13270 }, { "epoch": 2.873836831854577, "grad_norm": 0.7578905999176232, "learning_rate": 5e-06, "loss": 0.3812, "step": 13280 }, { "epoch": 2.8760008656135034, "grad_norm": 0.8151694337796392, "learning_rate": 5e-06, "loss": 0.3861, "step": 13290 }, { "epoch": 2.8781648993724303, "grad_norm": 0.7471342559011176, "learning_rate": 5e-06, "loss": 0.3888, "step": 13300 }, { "epoch": 2.8803289331313566, "grad_norm": 0.7422252462621732, "learning_rate": 5e-06, "loss": 0.3922, "step": 13310 }, { "epoch": 2.8824929668902834, "grad_norm": 0.7615451869859242, "learning_rate": 5e-06, "loss": 0.3829, "step": 13320 }, { "epoch": 2.8846570006492103, "grad_norm": 0.8256789814798213, "learning_rate": 5e-06, "loss": 0.3952, "step": 13330 }, { "epoch": 2.8868210344081366, "grad_norm": 0.76680451948221, "learning_rate": 5e-06, "loss": 0.3956, "step": 13340 }, { "epoch": 2.8889850681670635, "grad_norm": 0.7439841068189852, "learning_rate": 5e-06, "loss": 0.3776, "step": 13350 }, { "epoch": 2.89114910192599, "grad_norm": 0.7398497595894248, "learning_rate": 5e-06, "loss": 0.3877, "step": 13360 }, { "epoch": 2.8933131356849167, "grad_norm": 0.7739248139645774, "learning_rate": 5e-06, "loss": 0.3929, "step": 13370 }, { "epoch": 2.8954771694438435, "grad_norm": 0.8440378872399906, "learning_rate": 5e-06, "loss": 0.3876, "step": 13380 }, { "epoch": 2.89764120320277, "grad_norm": 0.7705375133560814, "learning_rate": 5e-06, "loss": 0.3871, "step": 13390 }, { "epoch": 2.8998052369616967, "grad_norm": 0.7344474907711162, "learning_rate": 5e-06, "loss": 0.3988, "step": 13400 }, { "epoch": 2.901969270720623, "grad_norm": 0.7691718865505679, "learning_rate": 5e-06, "loss": 0.392, "step": 13410 }, { "epoch": 2.90413330447955, "grad_norm": 0.7491038351612413, "learning_rate": 5e-06, "loss": 0.3927, "step": 13420 }, { "epoch": 2.9062973382384767, "grad_norm": 0.7519583645050334, "learning_rate": 5e-06, "loss": 0.3845, "step": 13430 }, { "epoch": 2.908461371997403, "grad_norm": 0.8244800954009015, "learning_rate": 5e-06, "loss": 0.3832, "step": 13440 }, { "epoch": 2.91062540575633, "grad_norm": 0.8157251210241125, "learning_rate": 5e-06, "loss": 0.3805, "step": 13450 }, { "epoch": 2.9127894395152563, "grad_norm": 0.7721623213908909, "learning_rate": 5e-06, "loss": 0.3942, "step": 13460 }, { "epoch": 2.914953473274183, "grad_norm": 0.7726737654084024, "learning_rate": 5e-06, "loss": 0.3931, "step": 13470 }, { "epoch": 2.91711750703311, "grad_norm": 0.7550960174025876, "learning_rate": 5e-06, "loss": 0.3885, "step": 13480 }, { "epoch": 2.9192815407920363, "grad_norm": 0.7679327991153526, "learning_rate": 5e-06, "loss": 0.3964, "step": 13490 }, { "epoch": 2.921445574550963, "grad_norm": 0.7823759384711927, "learning_rate": 5e-06, "loss": 0.3809, "step": 13500 }, { "epoch": 2.9236096083098895, "grad_norm": 0.7512641558224188, "learning_rate": 5e-06, "loss": 0.3842, "step": 13510 }, { "epoch": 2.9257736420688163, "grad_norm": 0.782519582204221, "learning_rate": 5e-06, "loss": 0.3942, "step": 13520 }, { "epoch": 2.927937675827743, "grad_norm": 0.7935222646576868, "learning_rate": 5e-06, "loss": 0.3871, "step": 13530 }, { "epoch": 2.9301017095866695, "grad_norm": 0.7490260485361179, "learning_rate": 5e-06, "loss": 0.3924, "step": 13540 }, { "epoch": 2.9322657433455963, "grad_norm": 0.7829574236017007, "learning_rate": 5e-06, "loss": 0.387, "step": 13550 }, { "epoch": 2.9344297771045227, "grad_norm": 0.8053438714777432, "learning_rate": 5e-06, "loss": 0.3997, "step": 13560 }, { "epoch": 2.9365938108634495, "grad_norm": 0.789271548816451, "learning_rate": 5e-06, "loss": 0.3921, "step": 13570 }, { "epoch": 2.9387578446223763, "grad_norm": 0.7620667953803887, "learning_rate": 5e-06, "loss": 0.3945, "step": 13580 }, { "epoch": 2.9409218783813027, "grad_norm": 0.8157019776942496, "learning_rate": 5e-06, "loss": 0.3973, "step": 13590 }, { "epoch": 2.9430859121402295, "grad_norm": 0.7566983087966317, "learning_rate": 5e-06, "loss": 0.3973, "step": 13600 }, { "epoch": 2.945249945899156, "grad_norm": 0.7850341888405364, "learning_rate": 5e-06, "loss": 0.3852, "step": 13610 }, { "epoch": 2.9474139796580827, "grad_norm": 0.7756606362123769, "learning_rate": 5e-06, "loss": 0.3873, "step": 13620 }, { "epoch": 2.9495780134170095, "grad_norm": 0.7582467951052595, "learning_rate": 5e-06, "loss": 0.4002, "step": 13630 }, { "epoch": 2.951742047175936, "grad_norm": 0.7922689379335354, "learning_rate": 5e-06, "loss": 0.3948, "step": 13640 }, { "epoch": 2.9539060809348627, "grad_norm": 0.7435887161247465, "learning_rate": 5e-06, "loss": 0.3934, "step": 13650 }, { "epoch": 2.956070114693789, "grad_norm": 0.7677386746970113, "learning_rate": 5e-06, "loss": 0.3842, "step": 13660 }, { "epoch": 2.958234148452716, "grad_norm": 0.7621270826776456, "learning_rate": 5e-06, "loss": 0.3882, "step": 13670 }, { "epoch": 2.9603981822116427, "grad_norm": 0.7115127400494038, "learning_rate": 5e-06, "loss": 0.3869, "step": 13680 }, { "epoch": 2.962562215970569, "grad_norm": 0.7875693326081818, "learning_rate": 5e-06, "loss": 0.3818, "step": 13690 }, { "epoch": 2.964726249729496, "grad_norm": 0.7236217251683649, "learning_rate": 5e-06, "loss": 0.3817, "step": 13700 }, { "epoch": 2.9668902834884223, "grad_norm": 0.8125742954355849, "learning_rate": 5e-06, "loss": 0.3854, "step": 13710 }, { "epoch": 2.969054317247349, "grad_norm": 0.7524427643623132, "learning_rate": 5e-06, "loss": 0.3903, "step": 13720 }, { "epoch": 2.971218351006276, "grad_norm": 0.7390063903999405, "learning_rate": 5e-06, "loss": 0.3808, "step": 13730 }, { "epoch": 2.9733823847652023, "grad_norm": 0.7511379236710563, "learning_rate": 5e-06, "loss": 0.3826, "step": 13740 }, { "epoch": 2.9755464185241287, "grad_norm": 0.8210016741195086, "learning_rate": 5e-06, "loss": 0.3935, "step": 13750 }, { "epoch": 2.9777104522830555, "grad_norm": 0.7700571385479135, "learning_rate": 5e-06, "loss": 0.3947, "step": 13760 }, { "epoch": 2.9798744860419824, "grad_norm": 0.7901788530732445, "learning_rate": 5e-06, "loss": 0.3918, "step": 13770 }, { "epoch": 2.982038519800909, "grad_norm": 0.790326497189608, "learning_rate": 5e-06, "loss": 0.395, "step": 13780 }, { "epoch": 2.9842025535598355, "grad_norm": 0.7970458568075771, "learning_rate": 5e-06, "loss": 0.3838, "step": 13790 }, { "epoch": 2.986366587318762, "grad_norm": 0.7937921564780092, "learning_rate": 5e-06, "loss": 0.3871, "step": 13800 }, { "epoch": 2.9885306210776887, "grad_norm": 0.7869992321634875, "learning_rate": 5e-06, "loss": 0.3861, "step": 13810 }, { "epoch": 2.9906946548366156, "grad_norm": 0.7507057758028739, "learning_rate": 5e-06, "loss": 0.3863, "step": 13820 }, { "epoch": 2.992858688595542, "grad_norm": 0.8051759990871201, "learning_rate": 5e-06, "loss": 0.3811, "step": 13830 }, { "epoch": 2.9950227223544688, "grad_norm": 0.7596345710239549, "learning_rate": 5e-06, "loss": 0.3904, "step": 13840 }, { "epoch": 2.997186756113395, "grad_norm": 0.7917910564755264, "learning_rate": 5e-06, "loss": 0.3977, "step": 13850 }, { "epoch": 2.999350789872322, "grad_norm": 0.732648425467274, "learning_rate": 5e-06, "loss": 0.3911, "step": 13860 }, { "epoch": 3.0, "eval_loss": 0.546510636806488, "eval_runtime": 589.5092, "eval_samples_per_second": 26.402, "eval_steps_per_second": 0.414, "step": 13863 }, { "epoch": 3.0, "step": 13863, "total_flos": 3633869689454592.0, "train_loss": 0.474967997638056, "train_runtime": 94524.6869, "train_samples_per_second": 9.385, "train_steps_per_second": 0.147 } ], "logging_steps": 10, "max_steps": 13863, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3633869689454592.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }