{ "best_metric": 3.130511999130249, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.10231749117511639, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020463498235023277, "grad_norm": 17.940895080566406, "learning_rate": 5e-06, "loss": 14.0433, "step": 1 }, { "epoch": 0.00020463498235023277, "eval_loss": 3.9562315940856934, "eval_runtime": 47.1989, "eval_samples_per_second": 174.368, "eval_steps_per_second": 43.603, "step": 1 }, { "epoch": 0.00040926996470046554, "grad_norm": 22.070707321166992, "learning_rate": 1e-05, "loss": 14.7688, "step": 2 }, { "epoch": 0.0006139049470506984, "grad_norm": 20.095142364501953, "learning_rate": 1.5e-05, "loss": 14.6774, "step": 3 }, { "epoch": 0.0008185399294009311, "grad_norm": 17.93838119506836, "learning_rate": 2e-05, "loss": 14.672, "step": 4 }, { "epoch": 0.001023174911751164, "grad_norm": 13.304869651794434, "learning_rate": 2.5e-05, "loss": 14.7464, "step": 5 }, { "epoch": 0.0012278098941013967, "grad_norm": 10.018509864807129, "learning_rate": 3e-05, "loss": 14.7743, "step": 6 }, { "epoch": 0.0014324448764516293, "grad_norm": 8.531241416931152, "learning_rate": 3.5e-05, "loss": 14.7664, "step": 7 }, { "epoch": 0.0016370798588018621, "grad_norm": 8.279265403747559, "learning_rate": 4e-05, "loss": 14.8116, "step": 8 }, { "epoch": 0.001841714841152095, "grad_norm": 7.955838680267334, "learning_rate": 4.5e-05, "loss": 14.0356, "step": 9 }, { "epoch": 0.002046349823502328, "grad_norm": 8.396589279174805, "learning_rate": 5e-05, "loss": 14.4371, "step": 10 }, { "epoch": 0.0022509848058525606, "grad_norm": 7.477896213531494, "learning_rate": 5.500000000000001e-05, "loss": 14.3411, "step": 11 }, { "epoch": 0.0024556197882027934, "grad_norm": 8.143397331237793, "learning_rate": 6e-05, "loss": 14.5638, "step": 12 }, { "epoch": 0.0026602547705530263, "grad_norm": 7.183845043182373, "learning_rate": 6.500000000000001e-05, "loss": 14.4296, "step": 13 }, { "epoch": 0.0028648897529032586, "grad_norm": 7.372915267944336, "learning_rate": 7e-05, "loss": 14.7023, "step": 14 }, { "epoch": 0.0030695247352534915, "grad_norm": 8.364891052246094, "learning_rate": 7.500000000000001e-05, "loss": 14.6036, "step": 15 }, { "epoch": 0.0032741597176037243, "grad_norm": 8.31488037109375, "learning_rate": 8e-05, "loss": 14.7258, "step": 16 }, { "epoch": 0.003478794699953957, "grad_norm": 7.918307304382324, "learning_rate": 8.5e-05, "loss": 14.2108, "step": 17 }, { "epoch": 0.00368342968230419, "grad_norm": 7.043214797973633, "learning_rate": 9e-05, "loss": 14.4846, "step": 18 }, { "epoch": 0.0038880646646544227, "grad_norm": 7.622882843017578, "learning_rate": 9.5e-05, "loss": 14.7337, "step": 19 }, { "epoch": 0.004092699647004656, "grad_norm": 7.523791790008545, "learning_rate": 0.0001, "loss": 13.6743, "step": 20 }, { "epoch": 0.004297334629354888, "grad_norm": 7.598966598510742, "learning_rate": 9.999892908320647e-05, "loss": 13.6939, "step": 21 }, { "epoch": 0.004501969611705121, "grad_norm": 7.914957046508789, "learning_rate": 9.999571637870036e-05, "loss": 14.0858, "step": 22 }, { "epoch": 0.004706604594055354, "grad_norm": 8.37746810913086, "learning_rate": 9.999036202410325e-05, "loss": 14.1071, "step": 23 }, { "epoch": 0.004911239576405587, "grad_norm": 8.19976806640625, "learning_rate": 9.998286624877786e-05, "loss": 14.4376, "step": 24 }, { "epoch": 0.00511587455875582, "grad_norm": 8.429183959960938, "learning_rate": 9.997322937381829e-05, "loss": 13.8618, "step": 25 }, { "epoch": 0.0053205095411060525, "grad_norm": 8.871038436889648, "learning_rate": 9.996145181203615e-05, "loss": 14.0325, "step": 26 }, { "epoch": 0.0055251445234562845, "grad_norm": 9.305150985717773, "learning_rate": 9.994753406794301e-05, "loss": 14.309, "step": 27 }, { "epoch": 0.005729779505806517, "grad_norm": 8.649129867553711, "learning_rate": 9.99314767377287e-05, "loss": 14.174, "step": 28 }, { "epoch": 0.00593441448815675, "grad_norm": 9.096344947814941, "learning_rate": 9.991328050923581e-05, "loss": 14.1257, "step": 29 }, { "epoch": 0.006139049470506983, "grad_norm": 9.188639640808105, "learning_rate": 9.989294616193017e-05, "loss": 13.7195, "step": 30 }, { "epoch": 0.006343684452857216, "grad_norm": 9.031623840332031, "learning_rate": 9.98704745668676e-05, "loss": 13.2224, "step": 31 }, { "epoch": 0.0065483194352074486, "grad_norm": 10.63833999633789, "learning_rate": 9.98458666866564e-05, "loss": 14.6318, "step": 32 }, { "epoch": 0.006752954417557681, "grad_norm": 9.114713668823242, "learning_rate": 9.981912357541627e-05, "loss": 12.9636, "step": 33 }, { "epoch": 0.006957589399907914, "grad_norm": 10.119890213012695, "learning_rate": 9.97902463787331e-05, "loss": 14.3062, "step": 34 }, { "epoch": 0.007162224382258147, "grad_norm": 9.861045837402344, "learning_rate": 9.975923633360985e-05, "loss": 14.4585, "step": 35 }, { "epoch": 0.00736685936460838, "grad_norm": 9.876028060913086, "learning_rate": 9.972609476841367e-05, "loss": 13.4113, "step": 36 }, { "epoch": 0.007571494346958613, "grad_norm": 9.652708053588867, "learning_rate": 9.969082310281891e-05, "loss": 13.4341, "step": 37 }, { "epoch": 0.0077761293293088455, "grad_norm": 10.080284118652344, "learning_rate": 9.965342284774632e-05, "loss": 13.4885, "step": 38 }, { "epoch": 0.007980764311659078, "grad_norm": 10.708706855773926, "learning_rate": 9.961389560529836e-05, "loss": 13.8326, "step": 39 }, { "epoch": 0.008185399294009311, "grad_norm": 10.825308799743652, "learning_rate": 9.957224306869053e-05, "loss": 14.1761, "step": 40 }, { "epoch": 0.008390034276359544, "grad_norm": 11.960084915161133, "learning_rate": 9.952846702217886e-05, "loss": 13.7566, "step": 41 }, { "epoch": 0.008594669258709777, "grad_norm": 13.216733932495117, "learning_rate": 9.948256934098352e-05, "loss": 13.3996, "step": 42 }, { "epoch": 0.00879930424106001, "grad_norm": 11.897379875183105, "learning_rate": 9.943455199120837e-05, "loss": 13.2895, "step": 43 }, { "epoch": 0.009003939223410242, "grad_norm": 13.036720275878906, "learning_rate": 9.938441702975689e-05, "loss": 13.2497, "step": 44 }, { "epoch": 0.009208574205760475, "grad_norm": 12.781022071838379, "learning_rate": 9.933216660424395e-05, "loss": 13.1251, "step": 45 }, { "epoch": 0.009413209188110708, "grad_norm": 13.262389183044434, "learning_rate": 9.927780295290389e-05, "loss": 13.5698, "step": 46 }, { "epoch": 0.009617844170460941, "grad_norm": 15.804819107055664, "learning_rate": 9.922132840449459e-05, "loss": 14.0475, "step": 47 }, { "epoch": 0.009822479152811174, "grad_norm": 17.065067291259766, "learning_rate": 9.916274537819775e-05, "loss": 13.7782, "step": 48 }, { "epoch": 0.010027114135161407, "grad_norm": 19.974628448486328, "learning_rate": 9.91020563835152e-05, "loss": 13.1872, "step": 49 }, { "epoch": 0.01023174911751164, "grad_norm": 33.5842399597168, "learning_rate": 9.903926402016153e-05, "loss": 14.529, "step": 50 }, { "epoch": 0.010436384099861872, "grad_norm": 27.80715560913086, "learning_rate": 9.897437097795257e-05, "loss": 15.1481, "step": 51 }, { "epoch": 0.010641019082212105, "grad_norm": 25.08864974975586, "learning_rate": 9.890738003669029e-05, "loss": 15.3311, "step": 52 }, { "epoch": 0.010845654064562336, "grad_norm": 20.487937927246094, "learning_rate": 9.883829406604363e-05, "loss": 14.983, "step": 53 }, { "epoch": 0.011050289046912569, "grad_norm": 18.901628494262695, "learning_rate": 9.876711602542563e-05, "loss": 15.1327, "step": 54 }, { "epoch": 0.011254924029262802, "grad_norm": 14.395322799682617, "learning_rate": 9.869384896386668e-05, "loss": 14.1538, "step": 55 }, { "epoch": 0.011459559011613035, "grad_norm": 10.616168022155762, "learning_rate": 9.861849601988383e-05, "loss": 14.1126, "step": 56 }, { "epoch": 0.011664193993963267, "grad_norm": 8.293721199035645, "learning_rate": 9.854106042134641e-05, "loss": 13.6647, "step": 57 }, { "epoch": 0.0118688289763135, "grad_norm": 7.054158687591553, "learning_rate": 9.846154548533773e-05, "loss": 13.3569, "step": 58 }, { "epoch": 0.012073463958663733, "grad_norm": 7.207637310028076, "learning_rate": 9.837995461801299e-05, "loss": 13.5518, "step": 59 }, { "epoch": 0.012278098941013966, "grad_norm": 7.297451496124268, "learning_rate": 9.829629131445342e-05, "loss": 13.9634, "step": 60 }, { "epoch": 0.012482733923364199, "grad_norm": 7.142579555511475, "learning_rate": 9.821055915851647e-05, "loss": 13.9129, "step": 61 }, { "epoch": 0.012687368905714431, "grad_norm": 6.920661926269531, "learning_rate": 9.812276182268236e-05, "loss": 13.7893, "step": 62 }, { "epoch": 0.012892003888064664, "grad_norm": 7.224207401275635, "learning_rate": 9.803290306789676e-05, "loss": 13.4208, "step": 63 }, { "epoch": 0.013096638870414897, "grad_norm": 6.482351779937744, "learning_rate": 9.794098674340965e-05, "loss": 14.011, "step": 64 }, { "epoch": 0.01330127385276513, "grad_norm": 5.605070114135742, "learning_rate": 9.784701678661045e-05, "loss": 13.362, "step": 65 }, { "epoch": 0.013505908835115363, "grad_norm": 6.630495548248291, "learning_rate": 9.775099722285935e-05, "loss": 13.4117, "step": 66 }, { "epoch": 0.013710543817465596, "grad_norm": 6.491345405578613, "learning_rate": 9.765293216531486e-05, "loss": 13.6704, "step": 67 }, { "epoch": 0.013915178799815828, "grad_norm": 6.191098690032959, "learning_rate": 9.755282581475769e-05, "loss": 13.405, "step": 68 }, { "epoch": 0.014119813782166061, "grad_norm": 5.732306480407715, "learning_rate": 9.74506824594107e-05, "loss": 13.2353, "step": 69 }, { "epoch": 0.014324448764516294, "grad_norm": 5.888903617858887, "learning_rate": 9.73465064747553e-05, "loss": 13.3435, "step": 70 }, { "epoch": 0.014529083746866527, "grad_norm": 5.951428413391113, "learning_rate": 9.724030232334391e-05, "loss": 13.6627, "step": 71 }, { "epoch": 0.01473371872921676, "grad_norm": 6.11599063873291, "learning_rate": 9.713207455460894e-05, "loss": 13.5377, "step": 72 }, { "epoch": 0.014938353711566993, "grad_norm": 6.475672245025635, "learning_rate": 9.702182780466775e-05, "loss": 13.5349, "step": 73 }, { "epoch": 0.015142988693917225, "grad_norm": 6.255554676055908, "learning_rate": 9.690956679612421e-05, "loss": 13.427, "step": 74 }, { "epoch": 0.015347623676267458, "grad_norm": 6.450949668884277, "learning_rate": 9.67952963378663e-05, "loss": 13.1908, "step": 75 }, { "epoch": 0.015552258658617691, "grad_norm": 6.359993934631348, "learning_rate": 9.667902132486009e-05, "loss": 13.6013, "step": 76 }, { "epoch": 0.015756893640967922, "grad_norm": 6.952643394470215, "learning_rate": 9.656074673794018e-05, "loss": 12.9901, "step": 77 }, { "epoch": 0.015961528623318157, "grad_norm": 6.661192417144775, "learning_rate": 9.644047764359622e-05, "loss": 13.4725, "step": 78 }, { "epoch": 0.016166163605668388, "grad_norm": 6.839961051940918, "learning_rate": 9.631821919375591e-05, "loss": 13.0454, "step": 79 }, { "epoch": 0.016370798588018622, "grad_norm": 7.225835800170898, "learning_rate": 9.619397662556435e-05, "loss": 13.5887, "step": 80 }, { "epoch": 0.016575433570368853, "grad_norm": 7.208718776702881, "learning_rate": 9.606775526115963e-05, "loss": 12.9486, "step": 81 }, { "epoch": 0.016780068552719088, "grad_norm": 7.323213577270508, "learning_rate": 9.593956050744492e-05, "loss": 13.15, "step": 82 }, { "epoch": 0.01698470353506932, "grad_norm": 7.82949161529541, "learning_rate": 9.580939785585681e-05, "loss": 13.2066, "step": 83 }, { "epoch": 0.017189338517419554, "grad_norm": 7.900018692016602, "learning_rate": 9.567727288213005e-05, "loss": 13.556, "step": 84 }, { "epoch": 0.017393973499769785, "grad_norm": 8.467442512512207, "learning_rate": 9.554319124605879e-05, "loss": 13.7811, "step": 85 }, { "epoch": 0.01759860848212002, "grad_norm": 8.75790786743164, "learning_rate": 9.540715869125407e-05, "loss": 13.5011, "step": 86 }, { "epoch": 0.01780324346447025, "grad_norm": 8.657136917114258, "learning_rate": 9.526918104489777e-05, "loss": 12.9077, "step": 87 }, { "epoch": 0.018007878446820485, "grad_norm": 8.441506385803223, "learning_rate": 9.512926421749304e-05, "loss": 13.4052, "step": 88 }, { "epoch": 0.018212513429170716, "grad_norm": 8.36744499206543, "learning_rate": 9.498741420261108e-05, "loss": 12.8423, "step": 89 }, { "epoch": 0.01841714841152095, "grad_norm": 9.058586120605469, "learning_rate": 9.484363707663442e-05, "loss": 13.437, "step": 90 }, { "epoch": 0.01862178339387118, "grad_norm": 9.995050430297852, "learning_rate": 9.469793899849661e-05, "loss": 13.485, "step": 91 }, { "epoch": 0.018826418376221416, "grad_norm": 9.687874794006348, "learning_rate": 9.45503262094184e-05, "loss": 12.3617, "step": 92 }, { "epoch": 0.019031053358571647, "grad_norm": 9.374463081359863, "learning_rate": 9.440080503264037e-05, "loss": 12.6718, "step": 93 }, { "epoch": 0.019235688340921882, "grad_norm": 10.396638870239258, "learning_rate": 9.42493818731521e-05, "loss": 12.5755, "step": 94 }, { "epoch": 0.019440323323272113, "grad_norm": 11.383647918701172, "learning_rate": 9.409606321741775e-05, "loss": 12.4185, "step": 95 }, { "epoch": 0.019644958305622347, "grad_norm": 10.607769012451172, "learning_rate": 9.394085563309827e-05, "loss": 12.5911, "step": 96 }, { "epoch": 0.01984959328797258, "grad_norm": 13.467524528503418, "learning_rate": 9.378376576876999e-05, "loss": 14.1048, "step": 97 }, { "epoch": 0.020054228270322813, "grad_norm": 14.150450706481934, "learning_rate": 9.362480035363986e-05, "loss": 12.6339, "step": 98 }, { "epoch": 0.020258863252673044, "grad_norm": 15.242779731750488, "learning_rate": 9.34639661972572e-05, "loss": 13.5117, "step": 99 }, { "epoch": 0.02046349823502328, "grad_norm": 21.984615325927734, "learning_rate": 9.330127018922194e-05, "loss": 13.9761, "step": 100 }, { "epoch": 0.02046349823502328, "eval_loss": 3.370993137359619, "eval_runtime": 47.2126, "eval_samples_per_second": 174.318, "eval_steps_per_second": 43.59, "step": 100 }, { "epoch": 0.02066813321737351, "grad_norm": 11.214513778686523, "learning_rate": 9.31367192988896e-05, "loss": 14.0912, "step": 101 }, { "epoch": 0.020872768199723744, "grad_norm": 10.00228214263916, "learning_rate": 9.297032057507264e-05, "loss": 13.9889, "step": 102 }, { "epoch": 0.021077403182073975, "grad_norm": 10.31716251373291, "learning_rate": 9.280208114573859e-05, "loss": 14.1193, "step": 103 }, { "epoch": 0.02128203816442421, "grad_norm": 9.314844131469727, "learning_rate": 9.263200821770461e-05, "loss": 13.7396, "step": 104 }, { "epoch": 0.02148667314677444, "grad_norm": 7.567698001861572, "learning_rate": 9.246010907632895e-05, "loss": 13.7879, "step": 105 }, { "epoch": 0.021691308129124672, "grad_norm": 6.531108856201172, "learning_rate": 9.228639108519868e-05, "loss": 13.5185, "step": 106 }, { "epoch": 0.021895943111474907, "grad_norm": 5.202017307281494, "learning_rate": 9.211086168581433e-05, "loss": 13.3491, "step": 107 }, { "epoch": 0.022100578093825138, "grad_norm": 4.530038356781006, "learning_rate": 9.193352839727121e-05, "loss": 13.2004, "step": 108 }, { "epoch": 0.022305213076175372, "grad_norm": 4.831387996673584, "learning_rate": 9.175439881593716e-05, "loss": 13.4205, "step": 109 }, { "epoch": 0.022509848058525603, "grad_norm": 4.692884922027588, "learning_rate": 9.157348061512727e-05, "loss": 13.4912, "step": 110 }, { "epoch": 0.022714483040875838, "grad_norm": 5.204988479614258, "learning_rate": 9.139078154477512e-05, "loss": 13.1214, "step": 111 }, { "epoch": 0.02291911802322607, "grad_norm": 4.781569004058838, "learning_rate": 9.120630943110077e-05, "loss": 12.6118, "step": 112 }, { "epoch": 0.023123753005576304, "grad_norm": 4.754026412963867, "learning_rate": 9.102007217627568e-05, "loss": 13.186, "step": 113 }, { "epoch": 0.023328387987926535, "grad_norm": 5.035665035247803, "learning_rate": 9.083207775808396e-05, "loss": 12.7322, "step": 114 }, { "epoch": 0.02353302297027677, "grad_norm": 5.12575626373291, "learning_rate": 9.064233422958077e-05, "loss": 13.0182, "step": 115 }, { "epoch": 0.023737657952627, "grad_norm": 5.39860200881958, "learning_rate": 9.045084971874738e-05, "loss": 13.4676, "step": 116 }, { "epoch": 0.023942292934977235, "grad_norm": 5.005839824676514, "learning_rate": 9.025763242814291e-05, "loss": 13.0532, "step": 117 }, { "epoch": 0.024146927917327466, "grad_norm": 5.046457290649414, "learning_rate": 9.006269063455304e-05, "loss": 13.336, "step": 118 }, { "epoch": 0.0243515628996777, "grad_norm": 4.951815128326416, "learning_rate": 8.986603268863536e-05, "loss": 13.1308, "step": 119 }, { "epoch": 0.02455619788202793, "grad_norm": 5.16800594329834, "learning_rate": 8.966766701456177e-05, "loss": 12.7553, "step": 120 }, { "epoch": 0.024760832864378166, "grad_norm": 5.190509796142578, "learning_rate": 8.94676021096575e-05, "loss": 13.2239, "step": 121 }, { "epoch": 0.024965467846728397, "grad_norm": 5.662418365478516, "learning_rate": 8.926584654403724e-05, "loss": 13.2593, "step": 122 }, { "epoch": 0.025170102829078632, "grad_norm": 5.604646682739258, "learning_rate": 8.906240896023794e-05, "loss": 13.2693, "step": 123 }, { "epoch": 0.025374737811428863, "grad_norm": 5.807793140411377, "learning_rate": 8.885729807284856e-05, "loss": 13.465, "step": 124 }, { "epoch": 0.025579372793779098, "grad_norm": 6.032169818878174, "learning_rate": 8.865052266813685e-05, "loss": 13.0197, "step": 125 }, { "epoch": 0.02578400777612933, "grad_norm": 5.969254970550537, "learning_rate": 8.844209160367299e-05, "loss": 12.91, "step": 126 }, { "epoch": 0.025988642758479563, "grad_norm": 5.627323627471924, "learning_rate": 8.823201380795001e-05, "loss": 12.9693, "step": 127 }, { "epoch": 0.026193277740829794, "grad_norm": 5.775904655456543, "learning_rate": 8.802029828000156e-05, "loss": 13.3716, "step": 128 }, { "epoch": 0.02639791272318003, "grad_norm": 6.050631999969482, "learning_rate": 8.780695408901613e-05, "loss": 12.9946, "step": 129 }, { "epoch": 0.02660254770553026, "grad_norm": 6.608086109161377, "learning_rate": 8.759199037394887e-05, "loss": 12.7268, "step": 130 }, { "epoch": 0.026807182687880494, "grad_norm": 6.4099202156066895, "learning_rate": 8.737541634312985e-05, "loss": 13.3797, "step": 131 }, { "epoch": 0.027011817670230726, "grad_norm": 6.958422660827637, "learning_rate": 8.715724127386972e-05, "loss": 13.2627, "step": 132 }, { "epoch": 0.02721645265258096, "grad_norm": 6.657001495361328, "learning_rate": 8.693747451206232e-05, "loss": 13.1662, "step": 133 }, { "epoch": 0.02742108763493119, "grad_norm": 6.775047302246094, "learning_rate": 8.671612547178428e-05, "loss": 12.8757, "step": 134 }, { "epoch": 0.027625722617281426, "grad_norm": 6.7623419761657715, "learning_rate": 8.649320363489179e-05, "loss": 12.5799, "step": 135 }, { "epoch": 0.027830357599631657, "grad_norm": 7.408362865447998, "learning_rate": 8.626871855061438e-05, "loss": 13.8727, "step": 136 }, { "epoch": 0.02803499258198189, "grad_norm": 6.984137535095215, "learning_rate": 8.604267983514594e-05, "loss": 12.6957, "step": 137 }, { "epoch": 0.028239627564332122, "grad_norm": 7.494143486022949, "learning_rate": 8.581509717123273e-05, "loss": 13.5292, "step": 138 }, { "epoch": 0.028444262546682357, "grad_norm": 7.043254375457764, "learning_rate": 8.558598030775857e-05, "loss": 12.5103, "step": 139 }, { "epoch": 0.028648897529032588, "grad_norm": 7.2675957679748535, "learning_rate": 8.535533905932738e-05, "loss": 12.8951, "step": 140 }, { "epoch": 0.02885353251138282, "grad_norm": 7.874957084655762, "learning_rate": 8.51231833058426e-05, "loss": 12.9737, "step": 141 }, { "epoch": 0.029058167493733054, "grad_norm": 8.002019882202148, "learning_rate": 8.488952299208401e-05, "loss": 12.8148, "step": 142 }, { "epoch": 0.029262802476083285, "grad_norm": 8.36933422088623, "learning_rate": 8.46543681272818e-05, "loss": 12.4946, "step": 143 }, { "epoch": 0.02946743745843352, "grad_norm": 9.498835563659668, "learning_rate": 8.44177287846877e-05, "loss": 13.271, "step": 144 }, { "epoch": 0.02967207244078375, "grad_norm": 8.976995468139648, "learning_rate": 8.417961510114356e-05, "loss": 12.5241, "step": 145 }, { "epoch": 0.029876707423133985, "grad_norm": 9.178775787353516, "learning_rate": 8.39400372766471e-05, "loss": 12.4166, "step": 146 }, { "epoch": 0.030081342405484216, "grad_norm": 10.875651359558105, "learning_rate": 8.36990055739149e-05, "loss": 12.7323, "step": 147 }, { "epoch": 0.03028597738783445, "grad_norm": 11.843050003051758, "learning_rate": 8.345653031794292e-05, "loss": 12.6294, "step": 148 }, { "epoch": 0.030490612370184682, "grad_norm": 12.797874450683594, "learning_rate": 8.321262189556409e-05, "loss": 11.9468, "step": 149 }, { "epoch": 0.030695247352534916, "grad_norm": 21.556180953979492, "learning_rate": 8.296729075500344e-05, "loss": 14.375, "step": 150 }, { "epoch": 0.030899882334885147, "grad_norm": 5.878223419189453, "learning_rate": 8.272054740543052e-05, "loss": 13.2625, "step": 151 }, { "epoch": 0.031104517317235382, "grad_norm": 6.683862209320068, "learning_rate": 8.247240241650918e-05, "loss": 13.4469, "step": 152 }, { "epoch": 0.03130915229958561, "grad_norm": 6.695138931274414, "learning_rate": 8.222286641794488e-05, "loss": 13.8935, "step": 153 }, { "epoch": 0.031513787281935844, "grad_norm": 6.529450416564941, "learning_rate": 8.197195009902924e-05, "loss": 13.2827, "step": 154 }, { "epoch": 0.03171842226428608, "grad_norm": 5.889492034912109, "learning_rate": 8.171966420818228e-05, "loss": 13.2815, "step": 155 }, { "epoch": 0.03192305724663631, "grad_norm": 5.005529403686523, "learning_rate": 8.146601955249188e-05, "loss": 13.1348, "step": 156 }, { "epoch": 0.032127692228986544, "grad_norm": 4.527781009674072, "learning_rate": 8.121102699725089e-05, "loss": 12.9616, "step": 157 }, { "epoch": 0.032332327211336775, "grad_norm": 3.992450714111328, "learning_rate": 8.095469746549172e-05, "loss": 13.2171, "step": 158 }, { "epoch": 0.032536962193687013, "grad_norm": 3.9536304473876953, "learning_rate": 8.069704193751832e-05, "loss": 13.5083, "step": 159 }, { "epoch": 0.032741597176037245, "grad_norm": 4.0044264793396, "learning_rate": 8.043807145043604e-05, "loss": 13.5044, "step": 160 }, { "epoch": 0.032946232158387476, "grad_norm": 4.166686058044434, "learning_rate": 8.017779709767858e-05, "loss": 12.9416, "step": 161 }, { "epoch": 0.03315086714073771, "grad_norm": 4.292598724365234, "learning_rate": 7.991623002853296e-05, "loss": 12.932, "step": 162 }, { "epoch": 0.033355502123087945, "grad_norm": 4.622048377990723, "learning_rate": 7.965338144766186e-05, "loss": 13.1667, "step": 163 }, { "epoch": 0.033560137105438176, "grad_norm": 4.218106746673584, "learning_rate": 7.938926261462366e-05, "loss": 12.9429, "step": 164 }, { "epoch": 0.03376477208778841, "grad_norm": 4.615002155303955, "learning_rate": 7.912388484339012e-05, "loss": 13.4383, "step": 165 }, { "epoch": 0.03396940707013864, "grad_norm": 4.371853828430176, "learning_rate": 7.88572595018617e-05, "loss": 12.6596, "step": 166 }, { "epoch": 0.034174042052488876, "grad_norm": 4.507296562194824, "learning_rate": 7.858939801138061e-05, "loss": 13.0297, "step": 167 }, { "epoch": 0.03437867703483911, "grad_norm": 4.6610941886901855, "learning_rate": 7.832031184624164e-05, "loss": 12.6801, "step": 168 }, { "epoch": 0.03458331201718934, "grad_norm": 4.3974714279174805, "learning_rate": 7.80500125332005e-05, "loss": 12.6394, "step": 169 }, { "epoch": 0.03478794699953957, "grad_norm": 4.65360689163208, "learning_rate": 7.777851165098012e-05, "loss": 13.2642, "step": 170 }, { "epoch": 0.03499258198188981, "grad_norm": 4.651695251464844, "learning_rate": 7.750582082977467e-05, "loss": 13.3055, "step": 171 }, { "epoch": 0.03519721696424004, "grad_norm": 5.114010810852051, "learning_rate": 7.723195175075136e-05, "loss": 13.0045, "step": 172 }, { "epoch": 0.03540185194659027, "grad_norm": 5.113755702972412, "learning_rate": 7.695691614555003e-05, "loss": 12.9366, "step": 173 }, { "epoch": 0.0356064869289405, "grad_norm": 5.089533805847168, "learning_rate": 7.668072579578058e-05, "loss": 12.959, "step": 174 }, { "epoch": 0.03581112191129073, "grad_norm": 5.559483051300049, "learning_rate": 7.64033925325184e-05, "loss": 13.2842, "step": 175 }, { "epoch": 0.03601575689364097, "grad_norm": 5.3359761238098145, "learning_rate": 7.612492823579745e-05, "loss": 13.0262, "step": 176 }, { "epoch": 0.0362203918759912, "grad_norm": 5.409842014312744, "learning_rate": 7.584534483410137e-05, "loss": 13.0076, "step": 177 }, { "epoch": 0.03642502685834143, "grad_norm": 5.253081321716309, "learning_rate": 7.55646543038526e-05, "loss": 11.9703, "step": 178 }, { "epoch": 0.03662966184069166, "grad_norm": 5.482647895812988, "learning_rate": 7.528286866889924e-05, "loss": 12.6692, "step": 179 }, { "epoch": 0.0368342968230419, "grad_norm": 5.659306049346924, "learning_rate": 7.500000000000001e-05, "loss": 13.0874, "step": 180 }, { "epoch": 0.03703893180539213, "grad_norm": 5.71022891998291, "learning_rate": 7.471606041430723e-05, "loss": 12.9602, "step": 181 }, { "epoch": 0.03724356678774236, "grad_norm": 6.031240940093994, "learning_rate": 7.443106207484776e-05, "loss": 12.8276, "step": 182 }, { "epoch": 0.037448201770092594, "grad_norm": 5.916280746459961, "learning_rate": 7.414501719000187e-05, "loss": 12.7036, "step": 183 }, { "epoch": 0.03765283675244283, "grad_norm": 6.090421676635742, "learning_rate": 7.385793801298042e-05, "loss": 12.5362, "step": 184 }, { "epoch": 0.03785747173479306, "grad_norm": 6.97968053817749, "learning_rate": 7.35698368412999e-05, "loss": 13.1994, "step": 185 }, { "epoch": 0.038062106717143294, "grad_norm": 6.6946587562561035, "learning_rate": 7.328072601625557e-05, "loss": 12.9428, "step": 186 }, { "epoch": 0.038266741699493526, "grad_norm": 6.86458158493042, "learning_rate": 7.2990617922393e-05, "loss": 13.5336, "step": 187 }, { "epoch": 0.038471376681843764, "grad_norm": 7.41053581237793, "learning_rate": 7.269952498697734e-05, "loss": 13.1752, "step": 188 }, { "epoch": 0.038676011664193995, "grad_norm": 6.769413948059082, "learning_rate": 7.240745967946113e-05, "loss": 12.3618, "step": 189 }, { "epoch": 0.038880646646544226, "grad_norm": 8.171807289123535, "learning_rate": 7.211443451095007e-05, "loss": 13.4612, "step": 190 }, { "epoch": 0.03908528162889446, "grad_norm": 7.6870598793029785, "learning_rate": 7.18204620336671e-05, "loss": 12.8721, "step": 191 }, { "epoch": 0.039289916611244695, "grad_norm": 7.984126567840576, "learning_rate": 7.152555484041476e-05, "loss": 12.5025, "step": 192 }, { "epoch": 0.039494551593594926, "grad_norm": 8.748424530029297, "learning_rate": 7.122972556403567e-05, "loss": 12.5803, "step": 193 }, { "epoch": 0.03969918657594516, "grad_norm": 8.19789981842041, "learning_rate": 7.09329868768714e-05, "loss": 13.0793, "step": 194 }, { "epoch": 0.03990382155829539, "grad_norm": 8.25755786895752, "learning_rate": 7.063535149021973e-05, "loss": 13.2436, "step": 195 }, { "epoch": 0.040108456540645626, "grad_norm": 10.084080696105957, "learning_rate": 7.033683215379002e-05, "loss": 12.4769, "step": 196 }, { "epoch": 0.04031309152299586, "grad_norm": 11.04244327545166, "learning_rate": 7.003744165515705e-05, "loss": 13.3229, "step": 197 }, { "epoch": 0.04051772650534609, "grad_norm": 10.718149185180664, "learning_rate": 6.973719281921335e-05, "loss": 13.0458, "step": 198 }, { "epoch": 0.04072236148769632, "grad_norm": 12.596996307373047, "learning_rate": 6.943609850761979e-05, "loss": 13.2156, "step": 199 }, { "epoch": 0.04092699647004656, "grad_norm": 16.626497268676758, "learning_rate": 6.91341716182545e-05, "loss": 12.1366, "step": 200 }, { "epoch": 0.04092699647004656, "eval_loss": 3.208616256713867, "eval_runtime": 47.3335, "eval_samples_per_second": 173.872, "eval_steps_per_second": 43.479, "step": 200 }, { "epoch": 0.04113163145239679, "grad_norm": 3.9187700748443604, "learning_rate": 6.883142508466054e-05, "loss": 13.2198, "step": 201 }, { "epoch": 0.04133626643474702, "grad_norm": 5.088418006896973, "learning_rate": 6.852787187549182e-05, "loss": 12.934, "step": 202 }, { "epoch": 0.04154090141709725, "grad_norm": 4.74566125869751, "learning_rate": 6.82235249939575e-05, "loss": 13.3393, "step": 203 }, { "epoch": 0.04174553639944749, "grad_norm": 4.898460865020752, "learning_rate": 6.7918397477265e-05, "loss": 13.5989, "step": 204 }, { "epoch": 0.04195017138179772, "grad_norm": 4.619757652282715, "learning_rate": 6.761250239606169e-05, "loss": 13.0342, "step": 205 }, { "epoch": 0.04215480636414795, "grad_norm": 4.482340335845947, "learning_rate": 6.730585285387465e-05, "loss": 13.0489, "step": 206 }, { "epoch": 0.04235944134649818, "grad_norm": 4.1753644943237305, "learning_rate": 6.699846198654971e-05, "loss": 13.2165, "step": 207 }, { "epoch": 0.04256407632884842, "grad_norm": 3.9566304683685303, "learning_rate": 6.669034296168855e-05, "loss": 13.2601, "step": 208 }, { "epoch": 0.04276871131119865, "grad_norm": 4.045615196228027, "learning_rate": 6.638150897808468e-05, "loss": 13.0854, "step": 209 }, { "epoch": 0.04297334629354888, "grad_norm": 3.9672138690948486, "learning_rate": 6.607197326515808e-05, "loss": 13.5277, "step": 210 }, { "epoch": 0.04317798127589911, "grad_norm": 3.8964602947235107, "learning_rate": 6.57617490823885e-05, "loss": 13.3445, "step": 211 }, { "epoch": 0.043382616258249344, "grad_norm": 3.9119648933410645, "learning_rate": 6.545084971874738e-05, "loss": 12.763, "step": 212 }, { "epoch": 0.04358725124059958, "grad_norm": 4.09339714050293, "learning_rate": 6.513928849212873e-05, "loss": 13.1653, "step": 213 }, { "epoch": 0.043791886222949813, "grad_norm": 4.33394193649292, "learning_rate": 6.482707874877854e-05, "loss": 13.1689, "step": 214 }, { "epoch": 0.043996521205300045, "grad_norm": 4.071203231811523, "learning_rate": 6.451423386272312e-05, "loss": 12.9157, "step": 215 }, { "epoch": 0.044201156187650276, "grad_norm": 4.155096054077148, "learning_rate": 6.420076723519614e-05, "loss": 12.9944, "step": 216 }, { "epoch": 0.044405791170000514, "grad_norm": 4.474510669708252, "learning_rate": 6.388669229406462e-05, "loss": 12.9211, "step": 217 }, { "epoch": 0.044610426152350745, "grad_norm": 4.203741550445557, "learning_rate": 6.357202249325371e-05, "loss": 12.5732, "step": 218 }, { "epoch": 0.044815061134700976, "grad_norm": 4.361083984375, "learning_rate": 6.32567713121704e-05, "loss": 13.1363, "step": 219 }, { "epoch": 0.04501969611705121, "grad_norm": 4.626219749450684, "learning_rate": 6.294095225512603e-05, "loss": 13.1044, "step": 220 }, { "epoch": 0.045224331099401445, "grad_norm": 4.69849967956543, "learning_rate": 6.26245788507579e-05, "loss": 12.7933, "step": 221 }, { "epoch": 0.045428966081751676, "grad_norm": 4.679666996002197, "learning_rate": 6.230766465144967e-05, "loss": 13.1581, "step": 222 }, { "epoch": 0.04563360106410191, "grad_norm": 4.953638553619385, "learning_rate": 6.199022323275083e-05, "loss": 13.0212, "step": 223 }, { "epoch": 0.04583823604645214, "grad_norm": 4.850236415863037, "learning_rate": 6.167226819279528e-05, "loss": 13.0019, "step": 224 }, { "epoch": 0.046042871028802376, "grad_norm": 4.989190578460693, "learning_rate": 6.135381315171867e-05, "loss": 12.2553, "step": 225 }, { "epoch": 0.04624750601115261, "grad_norm": 4.897017478942871, "learning_rate": 6.103487175107507e-05, "loss": 13.0096, "step": 226 }, { "epoch": 0.04645214099350284, "grad_norm": 4.998581886291504, "learning_rate": 6.071545765325254e-05, "loss": 12.4746, "step": 227 }, { "epoch": 0.04665677597585307, "grad_norm": 5.277119159698486, "learning_rate": 6.0395584540887963e-05, "loss": 12.6303, "step": 228 }, { "epoch": 0.04686141095820331, "grad_norm": 5.548853874206543, "learning_rate": 6.007526611628086e-05, "loss": 12.8215, "step": 229 }, { "epoch": 0.04706604594055354, "grad_norm": 5.378997325897217, "learning_rate": 5.9754516100806423e-05, "loss": 13.1972, "step": 230 }, { "epoch": 0.04727068092290377, "grad_norm": 5.815462589263916, "learning_rate": 5.9433348234327765e-05, "loss": 13.2821, "step": 231 }, { "epoch": 0.047475315905254, "grad_norm": 5.872306823730469, "learning_rate": 5.911177627460739e-05, "loss": 13.199, "step": 232 }, { "epoch": 0.04767995088760424, "grad_norm": 5.859600067138672, "learning_rate": 5.8789813996717736e-05, "loss": 12.6879, "step": 233 }, { "epoch": 0.04788458586995447, "grad_norm": 6.172786712646484, "learning_rate": 5.8467475192451226e-05, "loss": 13.2259, "step": 234 }, { "epoch": 0.0480892208523047, "grad_norm": 6.31078577041626, "learning_rate": 5.814477366972945e-05, "loss": 12.3852, "step": 235 }, { "epoch": 0.04829385583465493, "grad_norm": 6.534313201904297, "learning_rate": 5.782172325201155e-05, "loss": 13.3609, "step": 236 }, { "epoch": 0.04849849081700517, "grad_norm": 6.585941314697266, "learning_rate": 5.749833777770225e-05, "loss": 12.9351, "step": 237 }, { "epoch": 0.0487031257993554, "grad_norm": 6.7909159660339355, "learning_rate": 5.717463109955896e-05, "loss": 12.2785, "step": 238 }, { "epoch": 0.04890776078170563, "grad_norm": 7.011481761932373, "learning_rate": 5.685061708409841e-05, "loss": 13.0815, "step": 239 }, { "epoch": 0.04911239576405586, "grad_norm": 7.400633335113525, "learning_rate": 5.6526309611002594e-05, "loss": 12.5917, "step": 240 }, { "epoch": 0.0493170307464061, "grad_norm": 6.944792747497559, "learning_rate": 5.6201722572524275e-05, "loss": 12.8307, "step": 241 }, { "epoch": 0.04952166572875633, "grad_norm": 9.408550262451172, "learning_rate": 5.587686987289189e-05, "loss": 13.0507, "step": 242 }, { "epoch": 0.049726300711106564, "grad_norm": 8.005476951599121, "learning_rate": 5.5551765427713884e-05, "loss": 12.5769, "step": 243 }, { "epoch": 0.049930935693456795, "grad_norm": 8.190591812133789, "learning_rate": 5.522642316338268e-05, "loss": 12.2517, "step": 244 }, { "epoch": 0.050135570675807026, "grad_norm": 9.257965087890625, "learning_rate": 5.490085701647805e-05, "loss": 13.0658, "step": 245 }, { "epoch": 0.050340205658157264, "grad_norm": 9.173686027526855, "learning_rate": 5.457508093317013e-05, "loss": 12.6359, "step": 246 }, { "epoch": 0.050544840640507495, "grad_norm": 9.59607982635498, "learning_rate": 5.4249108868622086e-05, "loss": 13.0733, "step": 247 }, { "epoch": 0.050749475622857726, "grad_norm": 10.842556953430176, "learning_rate": 5.392295478639225e-05, "loss": 12.6057, "step": 248 }, { "epoch": 0.05095411060520796, "grad_norm": 11.467256546020508, "learning_rate": 5.359663265783598e-05, "loss": 12.7772, "step": 249 }, { "epoch": 0.051158745587558195, "grad_norm": 16.473846435546875, "learning_rate": 5.327015646150716e-05, "loss": 12.2446, "step": 250 }, { "epoch": 0.051363380569908426, "grad_norm": 2.952035903930664, "learning_rate": 5.294354018255945e-05, "loss": 13.1735, "step": 251 }, { "epoch": 0.05156801555225866, "grad_norm": 3.22007417678833, "learning_rate": 5.26167978121472e-05, "loss": 13.0792, "step": 252 }, { "epoch": 0.05177265053460889, "grad_norm": 3.549088716506958, "learning_rate": 5.228994334682604e-05, "loss": 12.7948, "step": 253 }, { "epoch": 0.051977285516959126, "grad_norm": 3.728848695755005, "learning_rate": 5.196299078795344e-05, "loss": 13.0125, "step": 254 }, { "epoch": 0.05218192049930936, "grad_norm": 3.749281167984009, "learning_rate": 5.1635954141088813e-05, "loss": 13.2622, "step": 255 }, { "epoch": 0.05238655548165959, "grad_norm": 3.824709415435791, "learning_rate": 5.1308847415393666e-05, "loss": 12.9234, "step": 256 }, { "epoch": 0.05259119046400982, "grad_norm": 3.764427661895752, "learning_rate": 5.0981684623031415e-05, "loss": 13.2274, "step": 257 }, { "epoch": 0.05279582544636006, "grad_norm": 3.7923285961151123, "learning_rate": 5.0654479778567223e-05, "loss": 13.1988, "step": 258 }, { "epoch": 0.05300046042871029, "grad_norm": 3.733365535736084, "learning_rate": 5.0327246898367597e-05, "loss": 12.5384, "step": 259 }, { "epoch": 0.05320509541106052, "grad_norm": 3.8030307292938232, "learning_rate": 5e-05, "loss": 12.991, "step": 260 }, { "epoch": 0.05340973039341075, "grad_norm": 3.7069780826568604, "learning_rate": 4.9672753101632415e-05, "loss": 12.7534, "step": 261 }, { "epoch": 0.05361436537576099, "grad_norm": 3.764336109161377, "learning_rate": 4.934552022143279e-05, "loss": 12.8254, "step": 262 }, { "epoch": 0.05381900035811122, "grad_norm": 3.753891944885254, "learning_rate": 4.901831537696859e-05, "loss": 12.6035, "step": 263 }, { "epoch": 0.05402363534046145, "grad_norm": 3.9714443683624268, "learning_rate": 4.869115258460635e-05, "loss": 12.6499, "step": 264 }, { "epoch": 0.05422827032281168, "grad_norm": 3.999743938446045, "learning_rate": 4.83640458589112e-05, "loss": 13.0991, "step": 265 }, { "epoch": 0.05443290530516192, "grad_norm": 3.9809932708740234, "learning_rate": 4.8037009212046586e-05, "loss": 12.6208, "step": 266 }, { "epoch": 0.05463754028751215, "grad_norm": 4.165307521820068, "learning_rate": 4.7710056653173976e-05, "loss": 13.0348, "step": 267 }, { "epoch": 0.05484217526986238, "grad_norm": 4.428051948547363, "learning_rate": 4.738320218785281e-05, "loss": 13.4851, "step": 268 }, { "epoch": 0.055046810252212613, "grad_norm": 4.276752948760986, "learning_rate": 4.7056459817440544e-05, "loss": 12.8883, "step": 269 }, { "epoch": 0.05525144523456285, "grad_norm": 4.741238594055176, "learning_rate": 4.6729843538492847e-05, "loss": 13.3597, "step": 270 }, { "epoch": 0.05545608021691308, "grad_norm": 4.348086833953857, "learning_rate": 4.640336734216403e-05, "loss": 12.9206, "step": 271 }, { "epoch": 0.055660715199263314, "grad_norm": 4.487641334533691, "learning_rate": 4.607704521360776e-05, "loss": 12.7532, "step": 272 }, { "epoch": 0.055865350181613545, "grad_norm": 4.975533485412598, "learning_rate": 4.575089113137792e-05, "loss": 12.5924, "step": 273 }, { "epoch": 0.05606998516396378, "grad_norm": 4.721080303192139, "learning_rate": 4.542491906682989e-05, "loss": 12.9426, "step": 274 }, { "epoch": 0.056274620146314014, "grad_norm": 4.957543849945068, "learning_rate": 4.509914298352197e-05, "loss": 12.5506, "step": 275 }, { "epoch": 0.056479255128664245, "grad_norm": 4.958243370056152, "learning_rate": 4.477357683661734e-05, "loss": 12.7699, "step": 276 }, { "epoch": 0.056683890111014476, "grad_norm": 5.35684061050415, "learning_rate": 4.444823457228612e-05, "loss": 12.8455, "step": 277 }, { "epoch": 0.056888525093364714, "grad_norm": 5.440086364746094, "learning_rate": 4.412313012710813e-05, "loss": 13.7743, "step": 278 }, { "epoch": 0.057093160075714945, "grad_norm": 5.20829439163208, "learning_rate": 4.379827742747575e-05, "loss": 13.318, "step": 279 }, { "epoch": 0.057297795058065176, "grad_norm": 5.2258405685424805, "learning_rate": 4.347369038899744e-05, "loss": 13.0874, "step": 280 }, { "epoch": 0.05750243004041541, "grad_norm": 5.654691219329834, "learning_rate": 4.3149382915901606e-05, "loss": 12.4869, "step": 281 }, { "epoch": 0.05770706502276564, "grad_norm": 5.957024097442627, "learning_rate": 4.282536890044104e-05, "loss": 12.8174, "step": 282 }, { "epoch": 0.057911700005115876, "grad_norm": 6.341736316680908, "learning_rate": 4.250166222229774e-05, "loss": 12.6841, "step": 283 }, { "epoch": 0.05811633498746611, "grad_norm": 6.56013822555542, "learning_rate": 4.2178276747988446e-05, "loss": 13.0789, "step": 284 }, { "epoch": 0.05832096996981634, "grad_norm": 6.450329780578613, "learning_rate": 4.185522633027057e-05, "loss": 12.6028, "step": 285 }, { "epoch": 0.05852560495216657, "grad_norm": 6.356710433959961, "learning_rate": 4.153252480754877e-05, "loss": 13.0871, "step": 286 }, { "epoch": 0.05873023993451681, "grad_norm": 6.647814750671387, "learning_rate": 4.1210186003282275e-05, "loss": 12.6458, "step": 287 }, { "epoch": 0.05893487491686704, "grad_norm": 6.441559314727783, "learning_rate": 4.088822372539263e-05, "loss": 12.2483, "step": 288 }, { "epoch": 0.05913950989921727, "grad_norm": 8.019023895263672, "learning_rate": 4.0566651765672246e-05, "loss": 12.7241, "step": 289 }, { "epoch": 0.0593441448815675, "grad_norm": 7.507869720458984, "learning_rate": 4.0245483899193595e-05, "loss": 13.3737, "step": 290 }, { "epoch": 0.05954877986391774, "grad_norm": 7.296957015991211, "learning_rate": 3.992473388371915e-05, "loss": 12.6952, "step": 291 }, { "epoch": 0.05975341484626797, "grad_norm": 8.110812187194824, "learning_rate": 3.960441545911204e-05, "loss": 12.2771, "step": 292 }, { "epoch": 0.0599580498286182, "grad_norm": 8.923057556152344, "learning_rate": 3.928454234674747e-05, "loss": 12.7108, "step": 293 }, { "epoch": 0.06016268481096843, "grad_norm": 10.090682983398438, "learning_rate": 3.896512824892495e-05, "loss": 13.1477, "step": 294 }, { "epoch": 0.06036731979331867, "grad_norm": 8.957847595214844, "learning_rate": 3.864618684828134e-05, "loss": 11.7159, "step": 295 }, { "epoch": 0.0605719547756689, "grad_norm": 10.131745338439941, "learning_rate": 3.832773180720475e-05, "loss": 12.191, "step": 296 }, { "epoch": 0.06077658975801913, "grad_norm": 10.587480545043945, "learning_rate": 3.800977676724919e-05, "loss": 12.6343, "step": 297 }, { "epoch": 0.060981224740369364, "grad_norm": 11.739582061767578, "learning_rate": 3.769233534855035e-05, "loss": 12.5622, "step": 298 }, { "epoch": 0.0611858597227196, "grad_norm": 14.421760559082031, "learning_rate": 3.73754211492421e-05, "loss": 12.8323, "step": 299 }, { "epoch": 0.06139049470506983, "grad_norm": 17.0710391998291, "learning_rate": 3.705904774487396e-05, "loss": 11.9398, "step": 300 }, { "epoch": 0.06139049470506983, "eval_loss": 3.1531736850738525, "eval_runtime": 47.3782, "eval_samples_per_second": 173.709, "eval_steps_per_second": 43.438, "step": 300 }, { "epoch": 0.061595129687420064, "grad_norm": 2.744182825088501, "learning_rate": 3.6743228687829595e-05, "loss": 12.8172, "step": 301 }, { "epoch": 0.061799764669770295, "grad_norm": 3.1303441524505615, "learning_rate": 3.642797750674629e-05, "loss": 12.6422, "step": 302 }, { "epoch": 0.06200439965212053, "grad_norm": 3.3844480514526367, "learning_rate": 3.6113307705935396e-05, "loss": 13.0422, "step": 303 }, { "epoch": 0.062209034634470764, "grad_norm": 3.383885383605957, "learning_rate": 3.579923276480387e-05, "loss": 12.6751, "step": 304 }, { "epoch": 0.062413669616820995, "grad_norm": 3.3772575855255127, "learning_rate": 3.5485766137276894e-05, "loss": 13.055, "step": 305 }, { "epoch": 0.06261830459917123, "grad_norm": 3.6094918251037598, "learning_rate": 3.5172921251221455e-05, "loss": 13.1467, "step": 306 }, { "epoch": 0.06282293958152146, "grad_norm": 3.674668073654175, "learning_rate": 3.486071150787128e-05, "loss": 12.7998, "step": 307 }, { "epoch": 0.06302757456387169, "grad_norm": 3.914242744445801, "learning_rate": 3.4549150281252636e-05, "loss": 13.6751, "step": 308 }, { "epoch": 0.06323220954622193, "grad_norm": 3.7367589473724365, "learning_rate": 3.423825091761153e-05, "loss": 13.0127, "step": 309 }, { "epoch": 0.06343684452857216, "grad_norm": 3.7376673221588135, "learning_rate": 3.392802673484193e-05, "loss": 13.3006, "step": 310 }, { "epoch": 0.06364147951092239, "grad_norm": 3.9828696250915527, "learning_rate": 3.361849102191533e-05, "loss": 13.0535, "step": 311 }, { "epoch": 0.06384611449327263, "grad_norm": 3.7258951663970947, "learning_rate": 3.330965703831146e-05, "loss": 12.8206, "step": 312 }, { "epoch": 0.06405074947562286, "grad_norm": 3.842252254486084, "learning_rate": 3.300153801345028e-05, "loss": 13.0683, "step": 313 }, { "epoch": 0.06425538445797309, "grad_norm": 3.9074199199676514, "learning_rate": 3.2694147146125345e-05, "loss": 12.9179, "step": 314 }, { "epoch": 0.06446001944032333, "grad_norm": 3.9515974521636963, "learning_rate": 3.2387497603938326e-05, "loss": 13.3211, "step": 315 }, { "epoch": 0.06466465442267355, "grad_norm": 4.135197162628174, "learning_rate": 3.2081602522734986e-05, "loss": 13.1106, "step": 316 }, { "epoch": 0.06486928940502379, "grad_norm": 4.115512371063232, "learning_rate": 3.177647500604252e-05, "loss": 12.8488, "step": 317 }, { "epoch": 0.06507392438737403, "grad_norm": 4.200262069702148, "learning_rate": 3.147212812450819e-05, "loss": 12.7581, "step": 318 }, { "epoch": 0.06527855936972425, "grad_norm": 4.211337089538574, "learning_rate": 3.116857491533947e-05, "loss": 12.8883, "step": 319 }, { "epoch": 0.06548319435207449, "grad_norm": 4.417909622192383, "learning_rate": 3.086582838174551e-05, "loss": 12.9176, "step": 320 }, { "epoch": 0.06568782933442471, "grad_norm": 4.327807903289795, "learning_rate": 3.056390149238022e-05, "loss": 12.5289, "step": 321 }, { "epoch": 0.06589246431677495, "grad_norm": 4.564601898193359, "learning_rate": 3.0262807180786647e-05, "loss": 12.8324, "step": 322 }, { "epoch": 0.06609709929912519, "grad_norm": 4.540707111358643, "learning_rate": 2.996255834484296e-05, "loss": 12.1696, "step": 323 }, { "epoch": 0.06630173428147541, "grad_norm": 4.797798156738281, "learning_rate": 2.9663167846209998e-05, "loss": 12.73, "step": 324 }, { "epoch": 0.06650636926382565, "grad_norm": 4.712722301483154, "learning_rate": 2.936464850978027e-05, "loss": 12.439, "step": 325 }, { "epoch": 0.06671100424617589, "grad_norm": 4.9245781898498535, "learning_rate": 2.9067013123128613e-05, "loss": 12.8693, "step": 326 }, { "epoch": 0.06691563922852611, "grad_norm": 5.3487467765808105, "learning_rate": 2.8770274435964355e-05, "loss": 12.7175, "step": 327 }, { "epoch": 0.06712027421087635, "grad_norm": 5.03184175491333, "learning_rate": 2.8474445159585235e-05, "loss": 12.3854, "step": 328 }, { "epoch": 0.06732490919322658, "grad_norm": 4.980981349945068, "learning_rate": 2.8179537966332887e-05, "loss": 13.0615, "step": 329 }, { "epoch": 0.06752954417557681, "grad_norm": 5.728270053863525, "learning_rate": 2.7885565489049946e-05, "loss": 13.4382, "step": 330 }, { "epoch": 0.06773417915792705, "grad_norm": 5.375972270965576, "learning_rate": 2.759254032053888e-05, "loss": 12.7093, "step": 331 }, { "epoch": 0.06793881414027728, "grad_norm": 6.161487579345703, "learning_rate": 2.7300475013022663e-05, "loss": 13.2182, "step": 332 }, { "epoch": 0.06814344912262751, "grad_norm": 5.7192254066467285, "learning_rate": 2.700938207760701e-05, "loss": 13.2617, "step": 333 }, { "epoch": 0.06834808410497775, "grad_norm": 5.916773796081543, "learning_rate": 2.671927398374443e-05, "loss": 13.1881, "step": 334 }, { "epoch": 0.06855271908732798, "grad_norm": 6.228060722351074, "learning_rate": 2.6430163158700115e-05, "loss": 12.904, "step": 335 }, { "epoch": 0.06875735406967821, "grad_norm": 7.778335094451904, "learning_rate": 2.6142061987019577e-05, "loss": 13.1079, "step": 336 }, { "epoch": 0.06896198905202844, "grad_norm": 6.622939109802246, "learning_rate": 2.5854982809998153e-05, "loss": 12.9644, "step": 337 }, { "epoch": 0.06916662403437868, "grad_norm": 6.916367053985596, "learning_rate": 2.556893792515227e-05, "loss": 13.016, "step": 338 }, { "epoch": 0.06937125901672891, "grad_norm": 6.418735980987549, "learning_rate": 2.5283939585692783e-05, "loss": 12.5322, "step": 339 }, { "epoch": 0.06957589399907914, "grad_norm": 7.215633392333984, "learning_rate": 2.500000000000001e-05, "loss": 12.5916, "step": 340 }, { "epoch": 0.06978052898142938, "grad_norm": 7.442222595214844, "learning_rate": 2.471713133110078e-05, "loss": 13.7867, "step": 341 }, { "epoch": 0.06998516396377961, "grad_norm": 8.242687225341797, "learning_rate": 2.4435345696147403e-05, "loss": 12.5457, "step": 342 }, { "epoch": 0.07018979894612984, "grad_norm": 8.024588584899902, "learning_rate": 2.4154655165898627e-05, "loss": 13.2987, "step": 343 }, { "epoch": 0.07039443392848008, "grad_norm": 8.097381591796875, "learning_rate": 2.3875071764202563e-05, "loss": 12.2318, "step": 344 }, { "epoch": 0.0705990689108303, "grad_norm": 8.730584144592285, "learning_rate": 2.3596607467481603e-05, "loss": 12.7858, "step": 345 }, { "epoch": 0.07080370389318054, "grad_norm": 10.03264045715332, "learning_rate": 2.3319274204219428e-05, "loss": 12.4164, "step": 346 }, { "epoch": 0.07100833887553078, "grad_norm": 10.419230461120605, "learning_rate": 2.3043083854449988e-05, "loss": 11.9891, "step": 347 }, { "epoch": 0.071212973857881, "grad_norm": 10.628999710083008, "learning_rate": 2.2768048249248648e-05, "loss": 13.0758, "step": 348 }, { "epoch": 0.07141760884023124, "grad_norm": 12.918512344360352, "learning_rate": 2.2494179170225333e-05, "loss": 12.2184, "step": 349 }, { "epoch": 0.07162224382258146, "grad_norm": 17.08102798461914, "learning_rate": 2.2221488349019903e-05, "loss": 13.2989, "step": 350 }, { "epoch": 0.0718268788049317, "grad_norm": 2.621025800704956, "learning_rate": 2.194998746679952e-05, "loss": 13.1545, "step": 351 }, { "epoch": 0.07203151378728194, "grad_norm": 2.733877182006836, "learning_rate": 2.167968815375837e-05, "loss": 12.7721, "step": 352 }, { "epoch": 0.07223614876963216, "grad_norm": 2.7492332458496094, "learning_rate": 2.1410601988619394e-05, "loss": 12.9304, "step": 353 }, { "epoch": 0.0724407837519824, "grad_norm": 3.0048248767852783, "learning_rate": 2.1142740498138324e-05, "loss": 12.5243, "step": 354 }, { "epoch": 0.07264541873433264, "grad_norm": 3.2829930782318115, "learning_rate": 2.08761151566099e-05, "loss": 12.8763, "step": 355 }, { "epoch": 0.07285005371668286, "grad_norm": 3.4896061420440674, "learning_rate": 2.061073738537635e-05, "loss": 12.6124, "step": 356 }, { "epoch": 0.0730546886990331, "grad_norm": 3.766759157180786, "learning_rate": 2.034661855233815e-05, "loss": 12.8122, "step": 357 }, { "epoch": 0.07325932368138333, "grad_norm": 3.564368486404419, "learning_rate": 2.008376997146705e-05, "loss": 12.5289, "step": 358 }, { "epoch": 0.07346395866373356, "grad_norm": 3.5410869121551514, "learning_rate": 1.982220290232143e-05, "loss": 12.9323, "step": 359 }, { "epoch": 0.0736685936460838, "grad_norm": 3.591470718383789, "learning_rate": 1.9561928549563968e-05, "loss": 12.6307, "step": 360 }, { "epoch": 0.07387322862843403, "grad_norm": 3.747335910797119, "learning_rate": 1.9302958062481673e-05, "loss": 12.2941, "step": 361 }, { "epoch": 0.07407786361078426, "grad_norm": 3.776078224182129, "learning_rate": 1.9045302534508297e-05, "loss": 12.6067, "step": 362 }, { "epoch": 0.0742824985931345, "grad_norm": 4.040799617767334, "learning_rate": 1.8788973002749112e-05, "loss": 12.9593, "step": 363 }, { "epoch": 0.07448713357548473, "grad_norm": 3.7775509357452393, "learning_rate": 1.8533980447508137e-05, "loss": 12.3113, "step": 364 }, { "epoch": 0.07469176855783496, "grad_norm": 3.858264446258545, "learning_rate": 1.8280335791817733e-05, "loss": 12.9729, "step": 365 }, { "epoch": 0.07489640354018519, "grad_norm": 4.055905342102051, "learning_rate": 1.8028049900970767e-05, "loss": 12.5952, "step": 366 }, { "epoch": 0.07510103852253543, "grad_norm": 4.153656959533691, "learning_rate": 1.777713358205514e-05, "loss": 13.1257, "step": 367 }, { "epoch": 0.07530567350488566, "grad_norm": 4.324829578399658, "learning_rate": 1.7527597583490822e-05, "loss": 12.7625, "step": 368 }, { "epoch": 0.07551030848723589, "grad_norm": 4.624112129211426, "learning_rate": 1.7279452594569483e-05, "loss": 12.7958, "step": 369 }, { "epoch": 0.07571494346958613, "grad_norm": 4.383573055267334, "learning_rate": 1.703270924499656e-05, "loss": 12.8363, "step": 370 }, { "epoch": 0.07591957845193636, "grad_norm": 4.42855167388916, "learning_rate": 1.678737810443593e-05, "loss": 12.8649, "step": 371 }, { "epoch": 0.07612421343428659, "grad_norm": 4.5845947265625, "learning_rate": 1.6543469682057106e-05, "loss": 12.901, "step": 372 }, { "epoch": 0.07632884841663683, "grad_norm": 4.834083080291748, "learning_rate": 1.6300994426085103e-05, "loss": 13.347, "step": 373 }, { "epoch": 0.07653348339898705, "grad_norm": 4.795494079589844, "learning_rate": 1.605996272335291e-05, "loss": 12.8918, "step": 374 }, { "epoch": 0.07673811838133729, "grad_norm": 4.895383358001709, "learning_rate": 1.5820384898856434e-05, "loss": 13.162, "step": 375 }, { "epoch": 0.07694275336368753, "grad_norm": 4.894996166229248, "learning_rate": 1.5582271215312294e-05, "loss": 12.701, "step": 376 }, { "epoch": 0.07714738834603775, "grad_norm": 5.065547943115234, "learning_rate": 1.5345631872718214e-05, "loss": 12.8654, "step": 377 }, { "epoch": 0.07735202332838799, "grad_norm": 5.112913608551025, "learning_rate": 1.5110477007916001e-05, "loss": 12.766, "step": 378 }, { "epoch": 0.07755665831073823, "grad_norm": 5.340709209442139, "learning_rate": 1.4876816694157419e-05, "loss": 12.9361, "step": 379 }, { "epoch": 0.07776129329308845, "grad_norm": 5.618555068969727, "learning_rate": 1.4644660940672627e-05, "loss": 13.0778, "step": 380 }, { "epoch": 0.07796592827543869, "grad_norm": 5.697518348693848, "learning_rate": 1.4414019692241437e-05, "loss": 12.582, "step": 381 }, { "epoch": 0.07817056325778891, "grad_norm": 5.6424241065979, "learning_rate": 1.4184902828767287e-05, "loss": 12.7521, "step": 382 }, { "epoch": 0.07837519824013915, "grad_norm": 6.131405830383301, "learning_rate": 1.3957320164854059e-05, "loss": 13.2849, "step": 383 }, { "epoch": 0.07857983322248939, "grad_norm": 5.919434547424316, "learning_rate": 1.373128144938563e-05, "loss": 13.0198, "step": 384 }, { "epoch": 0.07878446820483961, "grad_norm": 6.204239368438721, "learning_rate": 1.3506796365108232e-05, "loss": 13.2723, "step": 385 }, { "epoch": 0.07898910318718985, "grad_norm": 6.187657833099365, "learning_rate": 1.3283874528215733e-05, "loss": 12.9363, "step": 386 }, { "epoch": 0.07919373816954008, "grad_norm": 6.771162509918213, "learning_rate": 1.3062525487937699e-05, "loss": 12.9462, "step": 387 }, { "epoch": 0.07939837315189031, "grad_norm": 7.12640380859375, "learning_rate": 1.2842758726130283e-05, "loss": 13.2795, "step": 388 }, { "epoch": 0.07960300813424055, "grad_norm": 6.75380802154541, "learning_rate": 1.2624583656870154e-05, "loss": 12.9841, "step": 389 }, { "epoch": 0.07980764311659078, "grad_norm": 7.025509357452393, "learning_rate": 1.2408009626051137e-05, "loss": 12.963, "step": 390 }, { "epoch": 0.08001227809894101, "grad_norm": 7.841353893280029, "learning_rate": 1.2193045910983863e-05, "loss": 12.8249, "step": 391 }, { "epoch": 0.08021691308129125, "grad_norm": 7.905152320861816, "learning_rate": 1.1979701719998453e-05, "loss": 12.7228, "step": 392 }, { "epoch": 0.08042154806364148, "grad_norm": 7.849075794219971, "learning_rate": 1.1767986192049984e-05, "loss": 12.9185, "step": 393 }, { "epoch": 0.08062618304599171, "grad_norm": 9.404616355895996, "learning_rate": 1.1557908396327028e-05, "loss": 12.9398, "step": 394 }, { "epoch": 0.08083081802834194, "grad_norm": 9.282879829406738, "learning_rate": 1.134947733186315e-05, "loss": 12.8917, "step": 395 }, { "epoch": 0.08103545301069218, "grad_norm": 10.77182388305664, "learning_rate": 1.1142701927151456e-05, "loss": 13.7875, "step": 396 }, { "epoch": 0.08124008799304241, "grad_norm": 11.910287857055664, "learning_rate": 1.0937591039762085e-05, "loss": 13.1089, "step": 397 }, { "epoch": 0.08144472297539264, "grad_norm": 11.270553588867188, "learning_rate": 1.0734153455962765e-05, "loss": 12.87, "step": 398 }, { "epoch": 0.08164935795774288, "grad_norm": 12.47913646697998, "learning_rate": 1.0532397890342505e-05, "loss": 12.6082, "step": 399 }, { "epoch": 0.08185399294009311, "grad_norm": 19.859079360961914, "learning_rate": 1.0332332985438248e-05, "loss": 12.4463, "step": 400 }, { "epoch": 0.08185399294009311, "eval_loss": 3.1334729194641113, "eval_runtime": 47.3815, "eval_samples_per_second": 173.697, "eval_steps_per_second": 43.435, "step": 400 }, { "epoch": 0.08205862792244334, "grad_norm": 2.140819549560547, "learning_rate": 1.013396731136465e-05, "loss": 12.6548, "step": 401 }, { "epoch": 0.08226326290479358, "grad_norm": 2.7390999794006348, "learning_rate": 9.937309365446973e-06, "loss": 12.7752, "step": 402 }, { "epoch": 0.0824678978871438, "grad_norm": 2.9735829830169678, "learning_rate": 9.742367571857091e-06, "loss": 13.2948, "step": 403 }, { "epoch": 0.08267253286949404, "grad_norm": 2.9806323051452637, "learning_rate": 9.549150281252633e-06, "loss": 12.9468, "step": 404 }, { "epoch": 0.08287716785184428, "grad_norm": 2.9686119556427, "learning_rate": 9.357665770419244e-06, "loss": 12.8879, "step": 405 }, { "epoch": 0.0830818028341945, "grad_norm": 3.127723217010498, "learning_rate": 9.167922241916055e-06, "loss": 12.6892, "step": 406 }, { "epoch": 0.08328643781654474, "grad_norm": 3.3917040824890137, "learning_rate": 8.97992782372432e-06, "loss": 13.2438, "step": 407 }, { "epoch": 0.08349107279889498, "grad_norm": 3.28285551071167, "learning_rate": 8.793690568899216e-06, "loss": 12.7453, "step": 408 }, { "epoch": 0.0836957077812452, "grad_norm": 3.364295482635498, "learning_rate": 8.609218455224893e-06, "loss": 13.0133, "step": 409 }, { "epoch": 0.08390034276359544, "grad_norm": 3.374210834503174, "learning_rate": 8.426519384872733e-06, "loss": 12.79, "step": 410 }, { "epoch": 0.08410497774594566, "grad_norm": 3.6572799682617188, "learning_rate": 8.245601184062852e-06, "loss": 12.6433, "step": 411 }, { "epoch": 0.0843096127282959, "grad_norm": 3.5379762649536133, "learning_rate": 8.066471602728803e-06, "loss": 12.6151, "step": 412 }, { "epoch": 0.08451424771064614, "grad_norm": 3.6328377723693848, "learning_rate": 7.889138314185678e-06, "loss": 13.0676, "step": 413 }, { "epoch": 0.08471888269299636, "grad_norm": 4.000665664672852, "learning_rate": 7.71360891480134e-06, "loss": 13.2594, "step": 414 }, { "epoch": 0.0849235176753466, "grad_norm": 4.031978130340576, "learning_rate": 7.539890923671062e-06, "loss": 12.738, "step": 415 }, { "epoch": 0.08512815265769684, "grad_norm": 3.724813461303711, "learning_rate": 7.367991782295391e-06, "loss": 12.2267, "step": 416 }, { "epoch": 0.08533278764004706, "grad_norm": 4.113027095794678, "learning_rate": 7.197918854261432e-06, "loss": 12.7985, "step": 417 }, { "epoch": 0.0855374226223973, "grad_norm": 4.147072792053223, "learning_rate": 7.029679424927365e-06, "loss": 13.3496, "step": 418 }, { "epoch": 0.08574205760474753, "grad_norm": 4.0234527587890625, "learning_rate": 6.863280701110408e-06, "loss": 13.023, "step": 419 }, { "epoch": 0.08594669258709776, "grad_norm": 4.267332077026367, "learning_rate": 6.698729810778065e-06, "loss": 13.0938, "step": 420 }, { "epoch": 0.086151327569448, "grad_norm": 4.35993766784668, "learning_rate": 6.536033802742813e-06, "loss": 12.7928, "step": 421 }, { "epoch": 0.08635596255179823, "grad_norm": 4.47703218460083, "learning_rate": 6.375199646360142e-06, "loss": 12.9274, "step": 422 }, { "epoch": 0.08656059753414846, "grad_norm": 4.716027736663818, "learning_rate": 6.216234231230012e-06, "loss": 13.2614, "step": 423 }, { "epoch": 0.08676523251649869, "grad_norm": 4.96610164642334, "learning_rate": 6.059144366901736e-06, "loss": 12.2165, "step": 424 }, { "epoch": 0.08696986749884893, "grad_norm": 4.611530780792236, "learning_rate": 5.903936782582253e-06, "loss": 12.7755, "step": 425 }, { "epoch": 0.08717450248119916, "grad_norm": 4.9565300941467285, "learning_rate": 5.750618126847912e-06, "loss": 13.0633, "step": 426 }, { "epoch": 0.08737913746354939, "grad_norm": 5.047351837158203, "learning_rate": 5.599194967359639e-06, "loss": 12.7034, "step": 427 }, { "epoch": 0.08758377244589963, "grad_norm": 4.904860019683838, "learning_rate": 5.449673790581611e-06, "loss": 12.5122, "step": 428 }, { "epoch": 0.08778840742824986, "grad_norm": 5.229506015777588, "learning_rate": 5.302061001503394e-06, "loss": 13.0031, "step": 429 }, { "epoch": 0.08799304241060009, "grad_norm": 5.458662509918213, "learning_rate": 5.156362923365588e-06, "loss": 12.8912, "step": 430 }, { "epoch": 0.08819767739295033, "grad_norm": 5.457494735717773, "learning_rate": 5.012585797388936e-06, "loss": 13.3934, "step": 431 }, { "epoch": 0.08840231237530055, "grad_norm": 5.259174823760986, "learning_rate": 4.87073578250698e-06, "loss": 12.2529, "step": 432 }, { "epoch": 0.08860694735765079, "grad_norm": 5.968120574951172, "learning_rate": 4.730818955102234e-06, "loss": 12.54, "step": 433 }, { "epoch": 0.08881158234000103, "grad_norm": 6.257950782775879, "learning_rate": 4.592841308745932e-06, "loss": 12.6848, "step": 434 }, { "epoch": 0.08901621732235125, "grad_norm": 6.009523868560791, "learning_rate": 4.456808753941205e-06, "loss": 13.0703, "step": 435 }, { "epoch": 0.08922085230470149, "grad_norm": 6.272739887237549, "learning_rate": 4.322727117869951e-06, "loss": 12.3788, "step": 436 }, { "epoch": 0.08942548728705173, "grad_norm": 6.675440788269043, "learning_rate": 4.190602144143207e-06, "loss": 13.0829, "step": 437 }, { "epoch": 0.08963012226940195, "grad_norm": 6.91643762588501, "learning_rate": 4.06043949255509e-06, "loss": 13.3085, "step": 438 }, { "epoch": 0.08983475725175219, "grad_norm": 7.155692100524902, "learning_rate": 3.932244738840379e-06, "loss": 12.7579, "step": 439 }, { "epoch": 0.09003939223410241, "grad_norm": 6.8719940185546875, "learning_rate": 3.8060233744356633e-06, "loss": 12.96, "step": 440 }, { "epoch": 0.09024402721645265, "grad_norm": 7.757195949554443, "learning_rate": 3.681780806244095e-06, "loss": 12.8733, "step": 441 }, { "epoch": 0.09044866219880289, "grad_norm": 7.0664215087890625, "learning_rate": 3.5595223564037884e-06, "loss": 12.6288, "step": 442 }, { "epoch": 0.09065329718115311, "grad_norm": 8.440971374511719, "learning_rate": 3.4392532620598216e-06, "loss": 13.3932, "step": 443 }, { "epoch": 0.09085793216350335, "grad_norm": 8.004168510437012, "learning_rate": 3.3209786751399187e-06, "loss": 12.6128, "step": 444 }, { "epoch": 0.09106256714585359, "grad_norm": 8.411291122436523, "learning_rate": 3.2047036621337236e-06, "loss": 13.4134, "step": 445 }, { "epoch": 0.09126720212820381, "grad_norm": 9.257599830627441, "learning_rate": 3.0904332038757977e-06, "loss": 12.8954, "step": 446 }, { "epoch": 0.09147183711055405, "grad_norm": 9.609465599060059, "learning_rate": 2.978172195332263e-06, "loss": 13.3849, "step": 447 }, { "epoch": 0.09167647209290428, "grad_norm": 11.43130111694336, "learning_rate": 2.8679254453910785e-06, "loss": 12.0155, "step": 448 }, { "epoch": 0.09188110707525451, "grad_norm": 12.757843971252441, "learning_rate": 2.759697676656098e-06, "loss": 12.183, "step": 449 }, { "epoch": 0.09208574205760475, "grad_norm": 17.703031539916992, "learning_rate": 2.653493525244721e-06, "loss": 13.8012, "step": 450 }, { "epoch": 0.09229037703995498, "grad_norm": 2.267075300216675, "learning_rate": 2.549317540589308e-06, "loss": 12.4868, "step": 451 }, { "epoch": 0.09249501202230521, "grad_norm": 2.5427181720733643, "learning_rate": 2.4471741852423237e-06, "loss": 12.5128, "step": 452 }, { "epoch": 0.09269964700465545, "grad_norm": 2.599229097366333, "learning_rate": 2.3470678346851518e-06, "loss": 12.9201, "step": 453 }, { "epoch": 0.09290428198700568, "grad_norm": 2.6054906845092773, "learning_rate": 2.2490027771406687e-06, "loss": 12.6195, "step": 454 }, { "epoch": 0.09310891696935591, "grad_norm": 2.9035027027130127, "learning_rate": 2.152983213389559e-06, "loss": 12.7726, "step": 455 }, { "epoch": 0.09331355195170614, "grad_norm": 3.060668468475342, "learning_rate": 2.0590132565903476e-06, "loss": 13.0542, "step": 456 }, { "epoch": 0.09351818693405638, "grad_norm": 3.1342105865478516, "learning_rate": 1.9670969321032407e-06, "loss": 12.9032, "step": 457 }, { "epoch": 0.09372282191640662, "grad_norm": 3.196115493774414, "learning_rate": 1.8772381773176417e-06, "loss": 13.1882, "step": 458 }, { "epoch": 0.09392745689875684, "grad_norm": 3.232515335083008, "learning_rate": 1.7894408414835362e-06, "loss": 12.8594, "step": 459 }, { "epoch": 0.09413209188110708, "grad_norm": 3.358297348022461, "learning_rate": 1.70370868554659e-06, "loss": 12.909, "step": 460 }, { "epoch": 0.0943367268634573, "grad_norm": 3.49599027633667, "learning_rate": 1.620045381987012e-06, "loss": 12.7092, "step": 461 }, { "epoch": 0.09454136184580754, "grad_norm": 3.4245402812957764, "learning_rate": 1.5384545146622852e-06, "loss": 12.656, "step": 462 }, { "epoch": 0.09474599682815778, "grad_norm": 3.7484071254730225, "learning_rate": 1.4589395786535953e-06, "loss": 12.6428, "step": 463 }, { "epoch": 0.094950631810508, "grad_norm": 3.6963798999786377, "learning_rate": 1.3815039801161721e-06, "loss": 12.75, "step": 464 }, { "epoch": 0.09515526679285824, "grad_norm": 3.9733211994171143, "learning_rate": 1.3061510361333185e-06, "loss": 12.9671, "step": 465 }, { "epoch": 0.09535990177520848, "grad_norm": 3.8950541019439697, "learning_rate": 1.232883974574367e-06, "loss": 12.7971, "step": 466 }, { "epoch": 0.0955645367575587, "grad_norm": 4.091780185699463, "learning_rate": 1.1617059339563807e-06, "loss": 12.4321, "step": 467 }, { "epoch": 0.09576917173990894, "grad_norm": 4.282808303833008, "learning_rate": 1.0926199633097157e-06, "loss": 13.4677, "step": 468 }, { "epoch": 0.09597380672225916, "grad_norm": 4.246225357055664, "learning_rate": 1.0256290220474307e-06, "loss": 12.7789, "step": 469 }, { "epoch": 0.0961784417046094, "grad_norm": 4.31233549118042, "learning_rate": 9.607359798384785e-07, "loss": 12.7813, "step": 470 }, { "epoch": 0.09638307668695964, "grad_norm": 4.715573787689209, "learning_rate": 8.979436164848088e-07, "loss": 13.1816, "step": 471 }, { "epoch": 0.09658771166930986, "grad_norm": 4.476752281188965, "learning_rate": 8.372546218022747e-07, "loss": 13.1693, "step": 472 }, { "epoch": 0.0967923466516601, "grad_norm": 4.613335132598877, "learning_rate": 7.786715955054203e-07, "loss": 12.6692, "step": 473 }, { "epoch": 0.09699698163401034, "grad_norm": 4.564615249633789, "learning_rate": 7.221970470961125e-07, "loss": 12.6351, "step": 474 }, { "epoch": 0.09720161661636056, "grad_norm": 4.650303840637207, "learning_rate": 6.678333957560512e-07, "loss": 13.1352, "step": 475 }, { "epoch": 0.0974062515987108, "grad_norm": 4.742563724517822, "learning_rate": 6.15582970243117e-07, "loss": 12.8478, "step": 476 }, { "epoch": 0.09761088658106103, "grad_norm": 5.02756929397583, "learning_rate": 5.654480087916303e-07, "loss": 12.4434, "step": 477 }, { "epoch": 0.09781552156341126, "grad_norm": 5.11809778213501, "learning_rate": 5.174306590164879e-07, "loss": 12.5754, "step": 478 }, { "epoch": 0.0980201565457615, "grad_norm": 5.026023864746094, "learning_rate": 4.715329778211375e-07, "loss": 12.7589, "step": 479 }, { "epoch": 0.09822479152811173, "grad_norm": 5.174000263214111, "learning_rate": 4.277569313094809e-07, "loss": 12.4781, "step": 480 }, { "epoch": 0.09842942651046196, "grad_norm": 5.73392391204834, "learning_rate": 3.8610439470164737e-07, "loss": 13.2662, "step": 481 }, { "epoch": 0.0986340614928122, "grad_norm": 5.5650224685668945, "learning_rate": 3.465771522536854e-07, "loss": 12.698, "step": 482 }, { "epoch": 0.09883869647516243, "grad_norm": 5.415269374847412, "learning_rate": 3.09176897181096e-07, "loss": 12.4908, "step": 483 }, { "epoch": 0.09904333145751266, "grad_norm": 5.8570170402526855, "learning_rate": 2.7390523158633554e-07, "loss": 12.5447, "step": 484 }, { "epoch": 0.09924796643986289, "grad_norm": 6.1962361335754395, "learning_rate": 2.407636663901591e-07, "loss": 13.4094, "step": 485 }, { "epoch": 0.09945260142221313, "grad_norm": 6.143553256988525, "learning_rate": 2.0975362126691712e-07, "loss": 12.9071, "step": 486 }, { "epoch": 0.09965723640456337, "grad_norm": 6.178109645843506, "learning_rate": 1.8087642458373134e-07, "loss": 12.7015, "step": 487 }, { "epoch": 0.09986187138691359, "grad_norm": 6.477899551391602, "learning_rate": 1.5413331334360182e-07, "loss": 12.9638, "step": 488 }, { "epoch": 0.10006650636926383, "grad_norm": 6.375460624694824, "learning_rate": 1.2952543313240472e-07, "loss": 12.3259, "step": 489 }, { "epoch": 0.10027114135161405, "grad_norm": 7.033268928527832, "learning_rate": 1.0705383806982606e-07, "loss": 12.8778, "step": 490 }, { "epoch": 0.10047577633396429, "grad_norm": 6.597715377807617, "learning_rate": 8.671949076420882e-08, "loss": 12.0536, "step": 491 }, { "epoch": 0.10068041131631453, "grad_norm": 7.840211868286133, "learning_rate": 6.852326227130834e-08, "loss": 13.5376, "step": 492 }, { "epoch": 0.10088504629866475, "grad_norm": 7.423345565795898, "learning_rate": 5.246593205699424e-08, "loss": 12.8476, "step": 493 }, { "epoch": 0.10108968128101499, "grad_norm": 8.046225547790527, "learning_rate": 3.8548187963854956e-08, "loss": 12.5719, "step": 494 }, { "epoch": 0.10129431626336523, "grad_norm": 8.249234199523926, "learning_rate": 2.6770626181715773e-08, "loss": 12.1992, "step": 495 }, { "epoch": 0.10149895124571545, "grad_norm": 9.472099304199219, "learning_rate": 1.7133751222137007e-08, "loss": 12.8579, "step": 496 }, { "epoch": 0.10170358622806569, "grad_norm": 9.204952239990234, "learning_rate": 9.637975896759077e-09, "loss": 12.0369, "step": 497 }, { "epoch": 0.10190822121041591, "grad_norm": 11.747493743896484, "learning_rate": 4.2836212996499865e-09, "loss": 13.0472, "step": 498 }, { "epoch": 0.10211285619276615, "grad_norm": 12.529196739196777, "learning_rate": 1.0709167935385455e-09, "loss": 12.7862, "step": 499 }, { "epoch": 0.10231749117511639, "grad_norm": 19.44767951965332, "learning_rate": 0.0, "loss": 13.4285, "step": 500 }, { "epoch": 0.10231749117511639, "eval_loss": 3.130511999130249, "eval_runtime": 47.3503, "eval_samples_per_second": 173.811, "eval_steps_per_second": 43.463, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9442642165235712.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }