{ "best_metric": 11.728843688964844, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.0657327143502732, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000164331785875683, "grad_norm": 0.042701028287410736, "learning_rate": 1e-05, "loss": 11.7654, "step": 1 }, { "epoch": 0.000164331785875683, "eval_loss": 11.765250205993652, "eval_runtime": 48.2849, "eval_samples_per_second": 212.261, "eval_steps_per_second": 53.081, "step": 1 }, { "epoch": 0.000328663571751366, "grad_norm": 0.04443906247615814, "learning_rate": 2e-05, "loss": 11.7649, "step": 2 }, { "epoch": 0.000492995357627049, "grad_norm": 0.05119364336133003, "learning_rate": 3e-05, "loss": 11.7635, "step": 3 }, { "epoch": 0.000657327143502732, "grad_norm": 0.04371592029929161, "learning_rate": 4e-05, "loss": 11.7631, "step": 4 }, { "epoch": 0.000821658929378415, "grad_norm": 0.050174057483673096, "learning_rate": 5e-05, "loss": 11.7675, "step": 5 }, { "epoch": 0.000985990715254098, "grad_norm": 0.05489945784211159, "learning_rate": 6e-05, "loss": 11.7681, "step": 6 }, { "epoch": 0.001150322501129781, "grad_norm": 0.055287834256887436, "learning_rate": 7e-05, "loss": 11.7656, "step": 7 }, { "epoch": 0.001314654287005464, "grad_norm": 0.05079254135489464, "learning_rate": 8e-05, "loss": 11.7637, "step": 8 }, { "epoch": 0.0014789860728811471, "grad_norm": 0.038517486304044724, "learning_rate": 9e-05, "loss": 11.7672, "step": 9 }, { "epoch": 0.00164331785875683, "grad_norm": 0.06290938705205917, "learning_rate": 0.0001, "loss": 11.7617, "step": 10 }, { "epoch": 0.001807649644632513, "grad_norm": 0.05406084656715393, "learning_rate": 9.99983777858264e-05, "loss": 11.7618, "step": 11 }, { "epoch": 0.001971981430508196, "grad_norm": 0.07149634510278702, "learning_rate": 9.999351124856874e-05, "loss": 11.7615, "step": 12 }, { "epoch": 0.002136313216383879, "grad_norm": 0.06020309031009674, "learning_rate": 9.998540070400966e-05, "loss": 11.7648, "step": 13 }, { "epoch": 0.002300645002259562, "grad_norm": 0.08084205538034439, "learning_rate": 9.997404667843075e-05, "loss": 11.7612, "step": 14 }, { "epoch": 0.0024649767881352452, "grad_norm": 0.06595081835985184, "learning_rate": 9.995944990857849e-05, "loss": 11.7615, "step": 15 }, { "epoch": 0.002629308574010928, "grad_norm": 0.0957484170794487, "learning_rate": 9.994161134161634e-05, "loss": 11.7614, "step": 16 }, { "epoch": 0.002793640359886611, "grad_norm": 0.09352919459342957, "learning_rate": 9.992053213506334e-05, "loss": 11.7624, "step": 17 }, { "epoch": 0.0029579721457622943, "grad_norm": 0.1028677374124527, "learning_rate": 9.989621365671902e-05, "loss": 11.7618, "step": 18 }, { "epoch": 0.003122303931637977, "grad_norm": 0.12186416238546371, "learning_rate": 9.986865748457457e-05, "loss": 11.7574, "step": 19 }, { "epoch": 0.00328663571751366, "grad_norm": 0.08856413513422012, "learning_rate": 9.983786540671051e-05, "loss": 11.758, "step": 20 }, { "epoch": 0.003450967503389343, "grad_norm": 0.08863840252161026, "learning_rate": 9.980383942118066e-05, "loss": 11.7614, "step": 21 }, { "epoch": 0.003615299289265026, "grad_norm": 0.13004614412784576, "learning_rate": 9.976658173588244e-05, "loss": 11.7599, "step": 22 }, { "epoch": 0.003779631075140709, "grad_norm": 0.12614966928958893, "learning_rate": 9.972609476841367e-05, "loss": 11.76, "step": 23 }, { "epoch": 0.003943962861016392, "grad_norm": 0.16564443707466125, "learning_rate": 9.968238114591566e-05, "loss": 11.7531, "step": 24 }, { "epoch": 0.004108294646892075, "grad_norm": 0.12352258712053299, "learning_rate": 9.96354437049027e-05, "loss": 11.7589, "step": 25 }, { "epoch": 0.004272626432767758, "grad_norm": 0.19589397311210632, "learning_rate": 9.95852854910781e-05, "loss": 11.755, "step": 26 }, { "epoch": 0.004436958218643441, "grad_norm": 0.1362888664007187, "learning_rate": 9.953190975913647e-05, "loss": 11.7526, "step": 27 }, { "epoch": 0.004601290004519124, "grad_norm": 0.16259880363941193, "learning_rate": 9.947531997255256e-05, "loss": 11.7523, "step": 28 }, { "epoch": 0.004765621790394807, "grad_norm": 0.16003204882144928, "learning_rate": 9.941551980335652e-05, "loss": 11.7501, "step": 29 }, { "epoch": 0.0049299535762704905, "grad_norm": 0.15248532593250275, "learning_rate": 9.935251313189564e-05, "loss": 11.7525, "step": 30 }, { "epoch": 0.005094285362146173, "grad_norm": 0.13439106941223145, "learning_rate": 9.928630404658255e-05, "loss": 11.7477, "step": 31 }, { "epoch": 0.005258617148021856, "grad_norm": 0.19816677272319794, "learning_rate": 9.921689684362989e-05, "loss": 11.7489, "step": 32 }, { "epoch": 0.0054229489338975395, "grad_norm": 0.14417704939842224, "learning_rate": 9.914429602677162e-05, "loss": 11.7435, "step": 33 }, { "epoch": 0.005587280719773222, "grad_norm": 0.1288655698299408, "learning_rate": 9.906850630697068e-05, "loss": 11.7456, "step": 34 }, { "epoch": 0.005751612505648905, "grad_norm": 0.1479674130678177, "learning_rate": 9.898953260211338e-05, "loss": 11.7448, "step": 35 }, { "epoch": 0.0059159442915245886, "grad_norm": 0.11588233709335327, "learning_rate": 9.890738003669029e-05, "loss": 11.7495, "step": 36 }, { "epoch": 0.006080276077400271, "grad_norm": 0.11137348413467407, "learning_rate": 9.882205394146361e-05, "loss": 11.743, "step": 37 }, { "epoch": 0.006244607863275954, "grad_norm": 0.11102559417486191, "learning_rate": 9.87335598531214e-05, "loss": 11.7388, "step": 38 }, { "epoch": 0.006408939649151637, "grad_norm": 0.10916115343570709, "learning_rate": 9.864190351391822e-05, "loss": 11.7415, "step": 39 }, { "epoch": 0.00657327143502732, "grad_norm": 0.10234220325946808, "learning_rate": 9.85470908713026e-05, "loss": 11.7427, "step": 40 }, { "epoch": 0.006737603220903003, "grad_norm": 0.1283135861158371, "learning_rate": 9.844912807753104e-05, "loss": 11.7424, "step": 41 }, { "epoch": 0.006901935006778686, "grad_norm": 0.11565033346414566, "learning_rate": 9.834802148926882e-05, "loss": 11.7331, "step": 42 }, { "epoch": 0.007066266792654369, "grad_norm": 0.11571874469518661, "learning_rate": 9.824377766717759e-05, "loss": 11.7374, "step": 43 }, { "epoch": 0.007230598578530052, "grad_norm": 0.10002451390028, "learning_rate": 9.813640337548954e-05, "loss": 11.7423, "step": 44 }, { "epoch": 0.007394930364405735, "grad_norm": 0.09244619309902191, "learning_rate": 9.802590558156862e-05, "loss": 11.7401, "step": 45 }, { "epoch": 0.007559262150281418, "grad_norm": 0.08112873882055283, "learning_rate": 9.791229145545831e-05, "loss": 11.7388, "step": 46 }, { "epoch": 0.0077235939361571014, "grad_norm": 0.10474348813295364, "learning_rate": 9.779556836941645e-05, "loss": 11.7399, "step": 47 }, { "epoch": 0.007887925722032785, "grad_norm": 0.11409953236579895, "learning_rate": 9.767574389743682e-05, "loss": 11.7385, "step": 48 }, { "epoch": 0.008052257507908467, "grad_norm": 0.2044820338487625, "learning_rate": 9.755282581475769e-05, "loss": 11.7294, "step": 49 }, { "epoch": 0.00821658929378415, "grad_norm": 0.2515033483505249, "learning_rate": 9.742682209735727e-05, "loss": 11.7291, "step": 50 }, { "epoch": 0.00821658929378415, "eval_loss": 11.74016284942627, "eval_runtime": 48.5939, "eval_samples_per_second": 210.911, "eval_steps_per_second": 52.743, "step": 50 }, { "epoch": 0.008380921079659834, "grad_norm": 0.10825006663799286, "learning_rate": 9.729774092143627e-05, "loss": 11.7462, "step": 51 }, { "epoch": 0.008545252865535516, "grad_norm": 0.09035345911979675, "learning_rate": 9.716559066288715e-05, "loss": 11.7452, "step": 52 }, { "epoch": 0.008709584651411199, "grad_norm": 0.109395332634449, "learning_rate": 9.703037989675087e-05, "loss": 11.7461, "step": 53 }, { "epoch": 0.008873916437286883, "grad_norm": 0.09128301590681076, "learning_rate": 9.689211739666023e-05, "loss": 11.7422, "step": 54 }, { "epoch": 0.009038248223162565, "grad_norm": 0.09604194760322571, "learning_rate": 9.675081213427076e-05, "loss": 11.741, "step": 55 }, { "epoch": 0.009202580009038248, "grad_norm": 0.09056773781776428, "learning_rate": 9.66064732786784e-05, "loss": 11.7421, "step": 56 }, { "epoch": 0.009366911794913932, "grad_norm": 0.08328656107187271, "learning_rate": 9.645911019582467e-05, "loss": 11.7412, "step": 57 }, { "epoch": 0.009531243580789614, "grad_norm": 0.09039179235696793, "learning_rate": 9.630873244788883e-05, "loss": 11.7383, "step": 58 }, { "epoch": 0.009695575366665297, "grad_norm": 0.07337184250354767, "learning_rate": 9.615534979266745e-05, "loss": 11.7419, "step": 59 }, { "epoch": 0.009859907152540981, "grad_norm": 0.07708930224180222, "learning_rate": 9.599897218294122e-05, "loss": 11.7387, "step": 60 }, { "epoch": 0.010024238938416663, "grad_norm": 0.08723892271518707, "learning_rate": 9.583960976582913e-05, "loss": 11.7393, "step": 61 }, { "epoch": 0.010188570724292346, "grad_norm": 0.08119846135377884, "learning_rate": 9.567727288213005e-05, "loss": 11.7412, "step": 62 }, { "epoch": 0.01035290251016803, "grad_norm": 0.0847805067896843, "learning_rate": 9.551197206565173e-05, "loss": 11.743, "step": 63 }, { "epoch": 0.010517234296043712, "grad_norm": 0.07967144250869751, "learning_rate": 9.534371804252728e-05, "loss": 11.7405, "step": 64 }, { "epoch": 0.010681566081919395, "grad_norm": 0.046992260962724686, "learning_rate": 9.517252173051911e-05, "loss": 11.7404, "step": 65 }, { "epoch": 0.010845897867795079, "grad_norm": 0.054778508841991425, "learning_rate": 9.49983942383106e-05, "loss": 11.7398, "step": 66 }, { "epoch": 0.011010229653670761, "grad_norm": 0.05465163663029671, "learning_rate": 9.482134686478519e-05, "loss": 11.7414, "step": 67 }, { "epoch": 0.011174561439546444, "grad_norm": 0.06919052451848984, "learning_rate": 9.464139109829321e-05, "loss": 11.7432, "step": 68 }, { "epoch": 0.011338893225422128, "grad_norm": 0.04978770390152931, "learning_rate": 9.445853861590647e-05, "loss": 11.7406, "step": 69 }, { "epoch": 0.01150322501129781, "grad_norm": 0.05154619738459587, "learning_rate": 9.42728012826605e-05, "loss": 11.7389, "step": 70 }, { "epoch": 0.011667556797173493, "grad_norm": 0.058466438204050064, "learning_rate": 9.408419115078471e-05, "loss": 11.7376, "step": 71 }, { "epoch": 0.011831888583049177, "grad_norm": 0.07136223465204239, "learning_rate": 9.389272045892024e-05, "loss": 11.738, "step": 72 }, { "epoch": 0.01199622036892486, "grad_norm": 0.0563649944961071, "learning_rate": 9.36984016313259e-05, "loss": 11.738, "step": 73 }, { "epoch": 0.012160552154800542, "grad_norm": 0.062396273016929626, "learning_rate": 9.350124727707197e-05, "loss": 11.7354, "step": 74 }, { "epoch": 0.012324883940676224, "grad_norm": 0.05039061978459358, "learning_rate": 9.330127018922194e-05, "loss": 11.7388, "step": 75 }, { "epoch": 0.012489215726551909, "grad_norm": 0.058604415506124496, "learning_rate": 9.309848334400246e-05, "loss": 11.7386, "step": 76 }, { "epoch": 0.012653547512427591, "grad_norm": 0.08999053388834, "learning_rate": 9.289289989996133e-05, "loss": 11.7338, "step": 77 }, { "epoch": 0.012817879298303273, "grad_norm": 0.05026502162218094, "learning_rate": 9.268453319711363e-05, "loss": 11.7371, "step": 78 }, { "epoch": 0.012982211084178958, "grad_norm": 0.06558049470186234, "learning_rate": 9.247339675607605e-05, "loss": 11.7358, "step": 79 }, { "epoch": 0.01314654287005464, "grad_norm": 0.06828798353672028, "learning_rate": 9.225950427718975e-05, "loss": 11.7333, "step": 80 }, { "epoch": 0.013310874655930323, "grad_norm": 0.05039716139435768, "learning_rate": 9.204286963963111e-05, "loss": 11.7367, "step": 81 }, { "epoch": 0.013475206441806007, "grad_norm": 0.09033340215682983, "learning_rate": 9.182350690051133e-05, "loss": 11.737, "step": 82 }, { "epoch": 0.01363953822768169, "grad_norm": 0.04329407215118408, "learning_rate": 9.160143029396422e-05, "loss": 11.7365, "step": 83 }, { "epoch": 0.013803870013557372, "grad_norm": 0.05856756493449211, "learning_rate": 9.13766542302225e-05, "loss": 11.7351, "step": 84 }, { "epoch": 0.013968201799433056, "grad_norm": 0.06545337289571762, "learning_rate": 9.114919329468282e-05, "loss": 11.7321, "step": 85 }, { "epoch": 0.014132533585308738, "grad_norm": 0.06385994702577591, "learning_rate": 9.091906224695935e-05, "loss": 11.734, "step": 86 }, { "epoch": 0.01429686537118442, "grad_norm": 0.0672338604927063, "learning_rate": 9.068627601992598e-05, "loss": 11.7357, "step": 87 }, { "epoch": 0.014461197157060105, "grad_norm": 0.06891307979822159, "learning_rate": 9.045084971874738e-05, "loss": 11.7329, "step": 88 }, { "epoch": 0.014625528942935787, "grad_norm": 0.05319250375032425, "learning_rate": 9.021279861989885e-05, "loss": 11.7367, "step": 89 }, { "epoch": 0.01478986072881147, "grad_norm": 0.07806988060474396, "learning_rate": 8.997213817017507e-05, "loss": 11.7364, "step": 90 }, { "epoch": 0.014954192514687154, "grad_norm": 0.07189088314771652, "learning_rate": 8.972888398568772e-05, "loss": 11.7356, "step": 91 }, { "epoch": 0.015118524300562836, "grad_norm": 0.07356931269168854, "learning_rate": 8.948305185085225e-05, "loss": 11.7331, "step": 92 }, { "epoch": 0.015282856086438519, "grad_norm": 0.07599375396966934, "learning_rate": 8.92346577173636e-05, "loss": 11.7298, "step": 93 }, { "epoch": 0.015447187872314203, "grad_norm": 0.08366679400205612, "learning_rate": 8.898371770316111e-05, "loss": 11.728, "step": 94 }, { "epoch": 0.015611519658189885, "grad_norm": 0.08361729979515076, "learning_rate": 8.873024809138272e-05, "loss": 11.7304, "step": 95 }, { "epoch": 0.01577585144406557, "grad_norm": 0.07593885809183121, "learning_rate": 8.847426532930831e-05, "loss": 11.7272, "step": 96 }, { "epoch": 0.01594018322994125, "grad_norm": 0.09948297590017319, "learning_rate": 8.821578602729242e-05, "loss": 11.7321, "step": 97 }, { "epoch": 0.016104515015816934, "grad_norm": 0.13416649401187897, "learning_rate": 8.795482695768658e-05, "loss": 11.7273, "step": 98 }, { "epoch": 0.01626884680169262, "grad_norm": 0.12499107420444489, "learning_rate": 8.769140505375085e-05, "loss": 11.7205, "step": 99 }, { "epoch": 0.0164331785875683, "grad_norm": 0.2499997466802597, "learning_rate": 8.742553740855506e-05, "loss": 11.7202, "step": 100 }, { "epoch": 0.0164331785875683, "eval_loss": 11.734024047851562, "eval_runtime": 48.7146, "eval_samples_per_second": 210.389, "eval_steps_per_second": 52.613, "step": 100 }, { "epoch": 0.016597510373443983, "grad_norm": 0.111448734998703, "learning_rate": 8.715724127386972e-05, "loss": 11.7424, "step": 101 }, { "epoch": 0.016761842159319668, "grad_norm": 0.0742286741733551, "learning_rate": 8.688653405904652e-05, "loss": 11.7416, "step": 102 }, { "epoch": 0.01692617394519535, "grad_norm": 0.08018925040960312, "learning_rate": 8.661343332988869e-05, "loss": 11.7371, "step": 103 }, { "epoch": 0.017090505731071032, "grad_norm": 0.08447378873825073, "learning_rate": 8.633795680751116e-05, "loss": 11.7374, "step": 104 }, { "epoch": 0.017254837516946717, "grad_norm": 0.08231153339147568, "learning_rate": 8.606012236719073e-05, "loss": 11.7377, "step": 105 }, { "epoch": 0.017419169302822397, "grad_norm": 0.07651390880346298, "learning_rate": 8.577994803720606e-05, "loss": 11.7391, "step": 106 }, { "epoch": 0.01758350108869808, "grad_norm": 0.08170609176158905, "learning_rate": 8.549745199766792e-05, "loss": 11.7371, "step": 107 }, { "epoch": 0.017747832874573766, "grad_norm": 0.05312446877360344, "learning_rate": 8.521265257933948e-05, "loss": 11.7405, "step": 108 }, { "epoch": 0.017912164660449446, "grad_norm": 0.05158795416355133, "learning_rate": 8.492556826244687e-05, "loss": 11.7399, "step": 109 }, { "epoch": 0.01807649644632513, "grad_norm": 0.060141801834106445, "learning_rate": 8.463621767547998e-05, "loss": 11.7336, "step": 110 }, { "epoch": 0.018240828232200815, "grad_norm": 0.07077991217374802, "learning_rate": 8.434461959398376e-05, "loss": 11.7364, "step": 111 }, { "epoch": 0.018405160018076495, "grad_norm": 0.05313685163855553, "learning_rate": 8.405079293933986e-05, "loss": 11.7386, "step": 112 }, { "epoch": 0.01856949180395218, "grad_norm": 0.056390322744846344, "learning_rate": 8.375475677753881e-05, "loss": 11.7361, "step": 113 }, { "epoch": 0.018733823589827864, "grad_norm": 0.06804954260587692, "learning_rate": 8.345653031794292e-05, "loss": 11.7351, "step": 114 }, { "epoch": 0.018898155375703544, "grad_norm": 0.06049950420856476, "learning_rate": 8.315613291203976e-05, "loss": 11.7321, "step": 115 }, { "epoch": 0.01906248716157923, "grad_norm": 0.062231432646512985, "learning_rate": 8.285358405218655e-05, "loss": 11.7345, "step": 116 }, { "epoch": 0.019226818947454913, "grad_norm": 0.07354050874710083, "learning_rate": 8.25489033703452e-05, "loss": 11.7314, "step": 117 }, { "epoch": 0.019391150733330594, "grad_norm": 0.05630939453840256, "learning_rate": 8.224211063680853e-05, "loss": 11.7317, "step": 118 }, { "epoch": 0.019555482519206278, "grad_norm": 0.06644688546657562, "learning_rate": 8.19332257589174e-05, "loss": 11.7328, "step": 119 }, { "epoch": 0.019719814305081962, "grad_norm": 0.07483275234699249, "learning_rate": 8.162226877976887e-05, "loss": 11.7338, "step": 120 }, { "epoch": 0.019884146090957643, "grad_norm": 0.06993226706981659, "learning_rate": 8.130925987691569e-05, "loss": 11.7337, "step": 121 }, { "epoch": 0.020048477876833327, "grad_norm": 0.06282593309879303, "learning_rate": 8.099421936105702e-05, "loss": 11.7343, "step": 122 }, { "epoch": 0.02021280966270901, "grad_norm": 0.04913927987217903, "learning_rate": 8.067716767472045e-05, "loss": 11.7289, "step": 123 }, { "epoch": 0.02037714144858469, "grad_norm": 0.06600144505500793, "learning_rate": 8.035812539093557e-05, "loss": 11.7353, "step": 124 }, { "epoch": 0.020541473234460376, "grad_norm": 0.05454692989587784, "learning_rate": 8.003711321189895e-05, "loss": 11.7329, "step": 125 }, { "epoch": 0.02070580502033606, "grad_norm": 0.05435941740870476, "learning_rate": 7.971415196763088e-05, "loss": 11.7338, "step": 126 }, { "epoch": 0.02087013680621174, "grad_norm": 0.05516434460878372, "learning_rate": 7.938926261462366e-05, "loss": 11.7278, "step": 127 }, { "epoch": 0.021034468592087425, "grad_norm": 0.05917780473828316, "learning_rate": 7.906246623448183e-05, "loss": 11.7353, "step": 128 }, { "epoch": 0.02119880037796311, "grad_norm": 0.05429854616522789, "learning_rate": 7.873378403255419e-05, "loss": 11.7334, "step": 129 }, { "epoch": 0.02136313216383879, "grad_norm": 0.059085000306367874, "learning_rate": 7.840323733655778e-05, "loss": 11.7326, "step": 130 }, { "epoch": 0.021527463949714474, "grad_norm": 0.06832055002450943, "learning_rate": 7.807084759519405e-05, "loss": 11.7345, "step": 131 }, { "epoch": 0.021691795735590158, "grad_norm": 0.06523392349481583, "learning_rate": 7.773663637675694e-05, "loss": 11.7315, "step": 132 }, { "epoch": 0.02185612752146584, "grad_norm": 0.05600828677415848, "learning_rate": 7.740062536773352e-05, "loss": 11.7334, "step": 133 }, { "epoch": 0.022020459307341523, "grad_norm": 0.05971834808588028, "learning_rate": 7.706283637139658e-05, "loss": 11.7314, "step": 134 }, { "epoch": 0.022184791093217207, "grad_norm": 0.06425528228282928, "learning_rate": 7.672329130639005e-05, "loss": 11.7293, "step": 135 }, { "epoch": 0.022349122879092888, "grad_norm": 0.048994455486536026, "learning_rate": 7.638201220530665e-05, "loss": 11.7309, "step": 136 }, { "epoch": 0.022513454664968572, "grad_norm": 0.05090711638331413, "learning_rate": 7.603902121325813e-05, "loss": 11.726, "step": 137 }, { "epoch": 0.022677786450844256, "grad_norm": 0.0425398163497448, "learning_rate": 7.569434058643844e-05, "loss": 11.7301, "step": 138 }, { "epoch": 0.022842118236719937, "grad_norm": 0.08484908938407898, "learning_rate": 7.534799269067953e-05, "loss": 11.7318, "step": 139 }, { "epoch": 0.02300645002259562, "grad_norm": 0.0685524120926857, "learning_rate": 7.500000000000001e-05, "loss": 11.7285, "step": 140 }, { "epoch": 0.023170781808471305, "grad_norm": 0.05587577447295189, "learning_rate": 7.465038509514688e-05, "loss": 11.7249, "step": 141 }, { "epoch": 0.023335113594346986, "grad_norm": 0.078126922249794, "learning_rate": 7.42991706621303e-05, "loss": 11.7242, "step": 142 }, { "epoch": 0.02349944538022267, "grad_norm": 0.07973043620586395, "learning_rate": 7.394637949075154e-05, "loss": 11.7341, "step": 143 }, { "epoch": 0.023663777166098354, "grad_norm": 0.08124510943889618, "learning_rate": 7.35920344731241e-05, "loss": 11.7245, "step": 144 }, { "epoch": 0.023828108951974035, "grad_norm": 0.06348835676908493, "learning_rate": 7.323615860218843e-05, "loss": 11.724, "step": 145 }, { "epoch": 0.02399244073784972, "grad_norm": 0.10301569104194641, "learning_rate": 7.287877497021978e-05, "loss": 11.718, "step": 146 }, { "epoch": 0.024156772523725403, "grad_norm": 0.08800891041755676, "learning_rate": 7.251990676732984e-05, "loss": 11.7249, "step": 147 }, { "epoch": 0.024321104309601084, "grad_norm": 0.1236993744969368, "learning_rate": 7.215957727996207e-05, "loss": 11.7283, "step": 148 }, { "epoch": 0.024485436095476768, "grad_norm": 0.13688653707504272, "learning_rate": 7.179780988938051e-05, "loss": 11.7278, "step": 149 }, { "epoch": 0.02464976788135245, "grad_norm": 0.2236781120300293, "learning_rate": 7.143462807015271e-05, "loss": 11.716, "step": 150 }, { "epoch": 0.02464976788135245, "eval_loss": 11.731405258178711, "eval_runtime": 48.5824, "eval_samples_per_second": 210.961, "eval_steps_per_second": 52.756, "step": 150 }, { "epoch": 0.024814099667228133, "grad_norm": 0.097035713493824, "learning_rate": 7.107005538862646e-05, "loss": 11.7423, "step": 151 }, { "epoch": 0.024978431453103817, "grad_norm": 0.08533385396003723, "learning_rate": 7.07041155014006e-05, "loss": 11.7377, "step": 152 }, { "epoch": 0.025142763238979498, "grad_norm": 0.04586503654718399, "learning_rate": 7.033683215379002e-05, "loss": 11.7377, "step": 153 }, { "epoch": 0.025307095024855182, "grad_norm": 0.08356229960918427, "learning_rate": 6.996822917828477e-05, "loss": 11.7346, "step": 154 }, { "epoch": 0.025471426810730866, "grad_norm": 0.06619363278150558, "learning_rate": 6.959833049300377e-05, "loss": 11.7376, "step": 155 }, { "epoch": 0.025635758596606547, "grad_norm": 0.07668127864599228, "learning_rate": 6.922716010014255e-05, "loss": 11.7374, "step": 156 }, { "epoch": 0.02580009038248223, "grad_norm": 0.056997958570718765, "learning_rate": 6.885474208441603e-05, "loss": 11.7321, "step": 157 }, { "epoch": 0.025964422168357915, "grad_norm": 0.07596030831336975, "learning_rate": 6.848110061149556e-05, "loss": 11.7388, "step": 158 }, { "epoch": 0.026128753954233596, "grad_norm": 0.06272446364164352, "learning_rate": 6.810625992644085e-05, "loss": 11.7357, "step": 159 }, { "epoch": 0.02629308574010928, "grad_norm": 0.06435201317071915, "learning_rate": 6.773024435212678e-05, "loss": 11.7327, "step": 160 }, { "epoch": 0.026457417525984964, "grad_norm": 0.0636703222990036, "learning_rate": 6.735307828766515e-05, "loss": 11.7331, "step": 161 }, { "epoch": 0.026621749311860645, "grad_norm": 0.043630119413137436, "learning_rate": 6.697478620682137e-05, "loss": 11.7361, "step": 162 }, { "epoch": 0.02678608109773633, "grad_norm": 0.06924209743738174, "learning_rate": 6.659539265642643e-05, "loss": 11.7336, "step": 163 }, { "epoch": 0.026950412883612013, "grad_norm": 0.04856458678841591, "learning_rate": 6.621492225478414e-05, "loss": 11.7375, "step": 164 }, { "epoch": 0.027114744669487694, "grad_norm": 0.05992913618683815, "learning_rate": 6.583339969007363e-05, "loss": 11.732, "step": 165 }, { "epoch": 0.02727907645536338, "grad_norm": 0.04800641909241676, "learning_rate": 6.545084971874738e-05, "loss": 11.7306, "step": 166 }, { "epoch": 0.027443408241239062, "grad_norm": 0.07342834025621414, "learning_rate": 6.506729716392481e-05, "loss": 11.732, "step": 167 }, { "epoch": 0.027607740027114743, "grad_norm": 0.055936265736818314, "learning_rate": 6.468276691378155e-05, "loss": 11.7306, "step": 168 }, { "epoch": 0.027772071812990427, "grad_norm": 0.06329957395792007, "learning_rate": 6.429728391993446e-05, "loss": 11.7333, "step": 169 }, { "epoch": 0.02793640359886611, "grad_norm": 0.05157172307372093, "learning_rate": 6.391087319582264e-05, "loss": 11.7335, "step": 170 }, { "epoch": 0.028100735384741792, "grad_norm": 0.04706769809126854, "learning_rate": 6.35235598150842e-05, "loss": 11.73, "step": 171 }, { "epoch": 0.028265067170617476, "grad_norm": 0.05760306119918823, "learning_rate": 6.313536890992935e-05, "loss": 11.7345, "step": 172 }, { "epoch": 0.02842939895649316, "grad_norm": 0.049483705312013626, "learning_rate": 6.274632566950967e-05, "loss": 11.728, "step": 173 }, { "epoch": 0.02859373074236884, "grad_norm": 0.047165218740701675, "learning_rate": 6.235645533828349e-05, "loss": 11.7328, "step": 174 }, { "epoch": 0.028758062528244525, "grad_norm": 0.05180773138999939, "learning_rate": 6.19657832143779e-05, "loss": 11.7366, "step": 175 }, { "epoch": 0.02892239431412021, "grad_norm": 0.04299752414226532, "learning_rate": 6.157433464794716e-05, "loss": 11.7347, "step": 176 }, { "epoch": 0.02908672609999589, "grad_norm": 0.05092500150203705, "learning_rate": 6.118213503952779e-05, "loss": 11.7291, "step": 177 }, { "epoch": 0.029251057885871574, "grad_norm": 0.037994395941495895, "learning_rate": 6.078920983839031e-05, "loss": 11.7313, "step": 178 }, { "epoch": 0.02941538967174726, "grad_norm": 0.06346435844898224, "learning_rate": 6.0395584540887963e-05, "loss": 11.7298, "step": 179 }, { "epoch": 0.02957972145762294, "grad_norm": 0.06775764375925064, "learning_rate": 6.0001284688802226e-05, "loss": 11.7279, "step": 180 }, { "epoch": 0.029744053243498624, "grad_norm": 0.05034025013446808, "learning_rate": 5.960633586768543e-05, "loss": 11.7286, "step": 181 }, { "epoch": 0.029908385029374308, "grad_norm": 0.05579889938235283, "learning_rate": 5.921076370520058e-05, "loss": 11.7258, "step": 182 }, { "epoch": 0.03007271681524999, "grad_norm": 0.04144003987312317, "learning_rate": 5.8814593869458455e-05, "loss": 11.73, "step": 183 }, { "epoch": 0.030237048601125673, "grad_norm": 0.053334783762693405, "learning_rate": 5.841785206735192e-05, "loss": 11.7293, "step": 184 }, { "epoch": 0.030401380387001357, "grad_norm": 0.05581595003604889, "learning_rate": 5.8020564042888015e-05, "loss": 11.7283, "step": 185 }, { "epoch": 0.030565712172877037, "grad_norm": 0.0592731349170208, "learning_rate": 5.762275557551727e-05, "loss": 11.7291, "step": 186 }, { "epoch": 0.03073004395875272, "grad_norm": 0.05535244196653366, "learning_rate": 5.7224452478461064e-05, "loss": 11.7224, "step": 187 }, { "epoch": 0.030894375744628406, "grad_norm": 0.04812576249241829, "learning_rate": 5.682568059703659e-05, "loss": 11.7276, "step": 188 }, { "epoch": 0.031058707530504086, "grad_norm": 0.045011844485998154, "learning_rate": 5.642646580697973e-05, "loss": 11.7279, "step": 189 }, { "epoch": 0.03122303931637977, "grad_norm": 0.05568164959549904, "learning_rate": 5.602683401276615e-05, "loss": 11.7236, "step": 190 }, { "epoch": 0.03138737110225545, "grad_norm": 0.07187269628047943, "learning_rate": 5.562681114593028e-05, "loss": 11.7256, "step": 191 }, { "epoch": 0.03155170288813114, "grad_norm": 0.06249718740582466, "learning_rate": 5.522642316338268e-05, "loss": 11.721, "step": 192 }, { "epoch": 0.03171603467400682, "grad_norm": 0.06274542212486267, "learning_rate": 5.482569604572576e-05, "loss": 11.723, "step": 193 }, { "epoch": 0.0318803664598825, "grad_norm": 0.05509130284190178, "learning_rate": 5.442465579556793e-05, "loss": 11.7282, "step": 194 }, { "epoch": 0.03204469824575819, "grad_norm": 0.07715385407209396, "learning_rate": 5.402332843583631e-05, "loss": 11.7274, "step": 195 }, { "epoch": 0.03220903003163387, "grad_norm": 0.06635577231645584, "learning_rate": 5.3621740008088126e-05, "loss": 11.7224, "step": 196 }, { "epoch": 0.03237336181750955, "grad_norm": 0.08863984048366547, "learning_rate": 5.321991657082097e-05, "loss": 11.7221, "step": 197 }, { "epoch": 0.03253769360338524, "grad_norm": 0.07797987759113312, "learning_rate": 5.281788419778187e-05, "loss": 11.7189, "step": 198 }, { "epoch": 0.03270202538926092, "grad_norm": 0.10929285734891891, "learning_rate": 5.2415668976275355e-05, "loss": 11.7152, "step": 199 }, { "epoch": 0.0328663571751366, "grad_norm": 0.19686374068260193, "learning_rate": 5.201329700547076e-05, "loss": 11.7165, "step": 200 }, { "epoch": 0.0328663571751366, "eval_loss": 11.729823112487793, "eval_runtime": 48.4047, "eval_samples_per_second": 211.735, "eval_steps_per_second": 52.949, "step": 200 }, { "epoch": 0.033030688961012286, "grad_norm": 0.08583119511604309, "learning_rate": 5.161079439470866e-05, "loss": 11.7399, "step": 201 }, { "epoch": 0.03319502074688797, "grad_norm": 0.061848536133766174, "learning_rate": 5.1208187261806615e-05, "loss": 11.7356, "step": 202 }, { "epoch": 0.03335935253276365, "grad_norm": 0.05972637981176376, "learning_rate": 5.080550173136457e-05, "loss": 11.7329, "step": 203 }, { "epoch": 0.033523684318639335, "grad_norm": 0.05366925522685051, "learning_rate": 5.0402763933069496e-05, "loss": 11.7334, "step": 204 }, { "epoch": 0.033688016104515016, "grad_norm": 0.05121481791138649, "learning_rate": 5e-05, "loss": 11.735, "step": 205 }, { "epoch": 0.0338523478903907, "grad_norm": 0.04870014265179634, "learning_rate": 4.9597236066930516e-05, "loss": 11.7347, "step": 206 }, { "epoch": 0.034016679676266384, "grad_norm": 0.05482671409845352, "learning_rate": 4.919449826863544e-05, "loss": 11.7389, "step": 207 }, { "epoch": 0.034181011462142065, "grad_norm": 0.0395878441631794, "learning_rate": 4.87918127381934e-05, "loss": 11.7373, "step": 208 }, { "epoch": 0.034345343248017746, "grad_norm": 0.06222689151763916, "learning_rate": 4.8389205605291365e-05, "loss": 11.7367, "step": 209 }, { "epoch": 0.03450967503389343, "grad_norm": 0.04613790661096573, "learning_rate": 4.798670299452926e-05, "loss": 11.7344, "step": 210 }, { "epoch": 0.034674006819769114, "grad_norm": 0.03873327374458313, "learning_rate": 4.758433102372466e-05, "loss": 11.7324, "step": 211 }, { "epoch": 0.034838338605644795, "grad_norm": 0.06393284350633621, "learning_rate": 4.7182115802218126e-05, "loss": 11.7293, "step": 212 }, { "epoch": 0.03500267039152048, "grad_norm": 0.054058074951171875, "learning_rate": 4.678008342917903e-05, "loss": 11.7288, "step": 213 }, { "epoch": 0.03516700217739616, "grad_norm": 0.04814436286687851, "learning_rate": 4.6378259991911886e-05, "loss": 11.7329, "step": 214 }, { "epoch": 0.035331333963271844, "grad_norm": 0.047144170850515366, "learning_rate": 4.597667156416371e-05, "loss": 11.7317, "step": 215 }, { "epoch": 0.03549566574914753, "grad_norm": 0.04114987701177597, "learning_rate": 4.5575344204432084e-05, "loss": 11.7334, "step": 216 }, { "epoch": 0.03565999753502321, "grad_norm": 0.05130726471543312, "learning_rate": 4.5174303954274244e-05, "loss": 11.7313, "step": 217 }, { "epoch": 0.03582432932089889, "grad_norm": 0.04466979578137398, "learning_rate": 4.477357683661734e-05, "loss": 11.7299, "step": 218 }, { "epoch": 0.03598866110677458, "grad_norm": 0.06053079292178154, "learning_rate": 4.437318885406973e-05, "loss": 11.7288, "step": 219 }, { "epoch": 0.03615299289265026, "grad_norm": 0.055950433015823364, "learning_rate": 4.397316598723385e-05, "loss": 11.7317, "step": 220 }, { "epoch": 0.03631732467852594, "grad_norm": 0.05433724448084831, "learning_rate": 4.3573534193020274e-05, "loss": 11.7314, "step": 221 }, { "epoch": 0.03648165646440163, "grad_norm": 0.059027209877967834, "learning_rate": 4.317431940296343e-05, "loss": 11.7279, "step": 222 }, { "epoch": 0.03664598825027731, "grad_norm": 0.04436887800693512, "learning_rate": 4.277554752153895e-05, "loss": 11.7289, "step": 223 }, { "epoch": 0.03681032003615299, "grad_norm": 0.05126571282744408, "learning_rate": 4.237724442448273e-05, "loss": 11.7301, "step": 224 }, { "epoch": 0.03697465182202868, "grad_norm": 0.04554610699415207, "learning_rate": 4.197943595711198e-05, "loss": 11.7306, "step": 225 }, { "epoch": 0.03713898360790436, "grad_norm": 0.06180788576602936, "learning_rate": 4.1582147932648074e-05, "loss": 11.725, "step": 226 }, { "epoch": 0.03730331539378004, "grad_norm": 0.06206175312399864, "learning_rate": 4.118540613054156e-05, "loss": 11.7316, "step": 227 }, { "epoch": 0.03746764717965573, "grad_norm": 0.06412436813116074, "learning_rate": 4.078923629479943e-05, "loss": 11.7288, "step": 228 }, { "epoch": 0.03763197896553141, "grad_norm": 0.059079430997371674, "learning_rate": 4.039366413231458e-05, "loss": 11.7303, "step": 229 }, { "epoch": 0.03779631075140709, "grad_norm": 0.05567527934908867, "learning_rate": 3.9998715311197785e-05, "loss": 11.729, "step": 230 }, { "epoch": 0.03796064253728278, "grad_norm": 0.06493416428565979, "learning_rate": 3.960441545911204e-05, "loss": 11.7304, "step": 231 }, { "epoch": 0.03812497432315846, "grad_norm": 0.048941027373075485, "learning_rate": 3.92107901616097e-05, "loss": 11.731, "step": 232 }, { "epoch": 0.03828930610903414, "grad_norm": 0.041554663330316544, "learning_rate": 3.8817864960472236e-05, "loss": 11.7341, "step": 233 }, { "epoch": 0.038453637894909826, "grad_norm": 0.042613379657268524, "learning_rate": 3.842566535205286e-05, "loss": 11.7274, "step": 234 }, { "epoch": 0.038617969680785506, "grad_norm": 0.051606450229883194, "learning_rate": 3.803421678562213e-05, "loss": 11.7251, "step": 235 }, { "epoch": 0.03878230146666119, "grad_norm": 0.050569821149110794, "learning_rate": 3.764354466171652e-05, "loss": 11.7301, "step": 236 }, { "epoch": 0.038946633252536875, "grad_norm": 0.0642523244023323, "learning_rate": 3.725367433049033e-05, "loss": 11.7294, "step": 237 }, { "epoch": 0.039110965038412555, "grad_norm": 0.05185132101178169, "learning_rate": 3.6864631090070655e-05, "loss": 11.7267, "step": 238 }, { "epoch": 0.039275296824288236, "grad_norm": 0.04526262730360031, "learning_rate": 3.6476440184915815e-05, "loss": 11.7257, "step": 239 }, { "epoch": 0.039439628610163924, "grad_norm": 0.06386663764715195, "learning_rate": 3.608912680417737e-05, "loss": 11.7228, "step": 240 }, { "epoch": 0.039603960396039604, "grad_norm": 0.0651041641831398, "learning_rate": 3.570271608006555e-05, "loss": 11.7286, "step": 241 }, { "epoch": 0.039768292181915285, "grad_norm": 0.05235549435019493, "learning_rate": 3.531723308621847e-05, "loss": 11.7248, "step": 242 }, { "epoch": 0.03993262396779097, "grad_norm": 0.06960996240377426, "learning_rate": 3.493270283607522e-05, "loss": 11.7285, "step": 243 }, { "epoch": 0.040096955753666653, "grad_norm": 0.05223159119486809, "learning_rate": 3.4549150281252636e-05, "loss": 11.7229, "step": 244 }, { "epoch": 0.040261287539542334, "grad_norm": 0.0649554505944252, "learning_rate": 3.4166600309926387e-05, "loss": 11.7265, "step": 245 }, { "epoch": 0.04042561932541802, "grad_norm": 0.058237750083208084, "learning_rate": 3.3785077745215873e-05, "loss": 11.7186, "step": 246 }, { "epoch": 0.0405899511112937, "grad_norm": 0.0660976842045784, "learning_rate": 3.340460734357359e-05, "loss": 11.7216, "step": 247 }, { "epoch": 0.04075428289716938, "grad_norm": 0.08093632757663727, "learning_rate": 3.3025213793178646e-05, "loss": 11.7193, "step": 248 }, { "epoch": 0.04091861468304507, "grad_norm": 0.11029274761676788, "learning_rate": 3.264692171233485e-05, "loss": 11.7199, "step": 249 }, { "epoch": 0.04108294646892075, "grad_norm": 0.16421598196029663, "learning_rate": 3.226975564787322e-05, "loss": 11.7199, "step": 250 }, { "epoch": 0.04108294646892075, "eval_loss": 11.729228019714355, "eval_runtime": 48.4124, "eval_samples_per_second": 211.702, "eval_steps_per_second": 52.941, "step": 250 }, { "epoch": 0.04124727825479643, "grad_norm": 0.08251149952411652, "learning_rate": 3.189374007355917e-05, "loss": 11.738, "step": 251 }, { "epoch": 0.04141161004067212, "grad_norm": 0.07333094626665115, "learning_rate": 3.151889938850445e-05, "loss": 11.7412, "step": 252 }, { "epoch": 0.0415759418265478, "grad_norm": 0.0722256600856781, "learning_rate": 3.114525791558398e-05, "loss": 11.7378, "step": 253 }, { "epoch": 0.04174027361242348, "grad_norm": 0.06710943579673767, "learning_rate": 3.0772839899857464e-05, "loss": 11.7372, "step": 254 }, { "epoch": 0.04190460539829917, "grad_norm": 0.059952352195978165, "learning_rate": 3.0401669506996256e-05, "loss": 11.7369, "step": 255 }, { "epoch": 0.04206893718417485, "grad_norm": 0.051765549927949905, "learning_rate": 3.003177082171523e-05, "loss": 11.7356, "step": 256 }, { "epoch": 0.04223326897005053, "grad_norm": 0.060351401567459106, "learning_rate": 2.9663167846209998e-05, "loss": 11.7339, "step": 257 }, { "epoch": 0.04239760075592622, "grad_norm": 0.04224325716495514, "learning_rate": 2.9295884498599414e-05, "loss": 11.7363, "step": 258 }, { "epoch": 0.0425619325418019, "grad_norm": 0.045162707567214966, "learning_rate": 2.8929944611373554e-05, "loss": 11.7312, "step": 259 }, { "epoch": 0.04272626432767758, "grad_norm": 0.050782881677150726, "learning_rate": 2.8565371929847284e-05, "loss": 11.7289, "step": 260 }, { "epoch": 0.04289059611355327, "grad_norm": 0.06226112321019173, "learning_rate": 2.8202190110619493e-05, "loss": 11.7317, "step": 261 }, { "epoch": 0.04305492789942895, "grad_norm": 0.06022179499268532, "learning_rate": 2.784042272003794e-05, "loss": 11.7308, "step": 262 }, { "epoch": 0.04321925968530463, "grad_norm": 0.055496398359537125, "learning_rate": 2.7480093232670158e-05, "loss": 11.7287, "step": 263 }, { "epoch": 0.043383591471180316, "grad_norm": 0.04589381441473961, "learning_rate": 2.712122502978024e-05, "loss": 11.733, "step": 264 }, { "epoch": 0.043547923257056, "grad_norm": 0.05781928077340126, "learning_rate": 2.6763841397811573e-05, "loss": 11.7306, "step": 265 }, { "epoch": 0.04371225504293168, "grad_norm": 0.04345298931002617, "learning_rate": 2.64079655268759e-05, "loss": 11.73, "step": 266 }, { "epoch": 0.043876586828807365, "grad_norm": 0.053832121193408966, "learning_rate": 2.605362050924848e-05, "loss": 11.7299, "step": 267 }, { "epoch": 0.044040918614683046, "grad_norm": 0.061959072947502136, "learning_rate": 2.57008293378697e-05, "loss": 11.7283, "step": 268 }, { "epoch": 0.04420525040055873, "grad_norm": 0.04928471893072128, "learning_rate": 2.534961490485313e-05, "loss": 11.7287, "step": 269 }, { "epoch": 0.044369582186434414, "grad_norm": 0.04960151016712189, "learning_rate": 2.500000000000001e-05, "loss": 11.7284, "step": 270 }, { "epoch": 0.044533913972310095, "grad_norm": 0.05671358108520508, "learning_rate": 2.4652007309320498e-05, "loss": 11.7317, "step": 271 }, { "epoch": 0.044698245758185776, "grad_norm": 0.04368621110916138, "learning_rate": 2.430565941356157e-05, "loss": 11.7326, "step": 272 }, { "epoch": 0.04486257754406146, "grad_norm": 0.046738844364881516, "learning_rate": 2.3960978786741877e-05, "loss": 11.7268, "step": 273 }, { "epoch": 0.045026909329937144, "grad_norm": 0.0573868490755558, "learning_rate": 2.361798779469336e-05, "loss": 11.7263, "step": 274 }, { "epoch": 0.045191241115812825, "grad_norm": 0.047170136123895645, "learning_rate": 2.3276708693609943e-05, "loss": 11.7294, "step": 275 }, { "epoch": 0.04535557290168851, "grad_norm": 0.04488043114542961, "learning_rate": 2.2937163628603435e-05, "loss": 11.7306, "step": 276 }, { "epoch": 0.04551990468756419, "grad_norm": 0.04915056750178337, "learning_rate": 2.259937463226651e-05, "loss": 11.732, "step": 277 }, { "epoch": 0.045684236473439874, "grad_norm": 0.045971404761075974, "learning_rate": 2.2263363623243054e-05, "loss": 11.7338, "step": 278 }, { "epoch": 0.04584856825931556, "grad_norm": 0.05000988021492958, "learning_rate": 2.192915240480596e-05, "loss": 11.7311, "step": 279 }, { "epoch": 0.04601290004519124, "grad_norm": 0.038665771484375, "learning_rate": 2.1596762663442218e-05, "loss": 11.7213, "step": 280 }, { "epoch": 0.04617723183106692, "grad_norm": 0.052090033888816833, "learning_rate": 2.1266215967445824e-05, "loss": 11.732, "step": 281 }, { "epoch": 0.04634156361694261, "grad_norm": 0.04800020158290863, "learning_rate": 2.0937533765518187e-05, "loss": 11.7283, "step": 282 }, { "epoch": 0.04650589540281829, "grad_norm": 0.044077180325984955, "learning_rate": 2.061073738537635e-05, "loss": 11.7275, "step": 283 }, { "epoch": 0.04667022718869397, "grad_norm": 0.06349783390760422, "learning_rate": 2.0285848032369137e-05, "loss": 11.7311, "step": 284 }, { "epoch": 0.04683455897456966, "grad_norm": 0.051240213215351105, "learning_rate": 1.996288678810105e-05, "loss": 11.723, "step": 285 }, { "epoch": 0.04699889076044534, "grad_norm": 0.05683488771319389, "learning_rate": 1.9641874609064443e-05, "loss": 11.7307, "step": 286 }, { "epoch": 0.04716322254632102, "grad_norm": 0.06246829405426979, "learning_rate": 1.932283232527956e-05, "loss": 11.7296, "step": 287 }, { "epoch": 0.04732755433219671, "grad_norm": 0.05560317263007164, "learning_rate": 1.9005780638942982e-05, "loss": 11.7269, "step": 288 }, { "epoch": 0.04749188611807239, "grad_norm": 0.0536818653345108, "learning_rate": 1.8690740123084316e-05, "loss": 11.7256, "step": 289 }, { "epoch": 0.04765621790394807, "grad_norm": 0.05386808514595032, "learning_rate": 1.837773122023114e-05, "loss": 11.7246, "step": 290 }, { "epoch": 0.04782054968982376, "grad_norm": 0.050899688154459, "learning_rate": 1.8066774241082612e-05, "loss": 11.7288, "step": 291 }, { "epoch": 0.04798488147569944, "grad_norm": 0.05641918256878853, "learning_rate": 1.7757889363191483e-05, "loss": 11.7243, "step": 292 }, { "epoch": 0.04814921326157512, "grad_norm": 0.06967247277498245, "learning_rate": 1.745109662965481e-05, "loss": 11.7252, "step": 293 }, { "epoch": 0.04831354504745081, "grad_norm": 0.06244378536939621, "learning_rate": 1.714641594781347e-05, "loss": 11.726, "step": 294 }, { "epoch": 0.04847787683332649, "grad_norm": 0.050528816878795624, "learning_rate": 1.684386708796025e-05, "loss": 11.7232, "step": 295 }, { "epoch": 0.04864220861920217, "grad_norm": 0.051517076790332794, "learning_rate": 1.6543469682057106e-05, "loss": 11.7235, "step": 296 }, { "epoch": 0.048806540405077856, "grad_norm": 0.07252132147550583, "learning_rate": 1.62452432224612e-05, "loss": 11.7191, "step": 297 }, { "epoch": 0.048970872190953536, "grad_norm": 0.11321794241666794, "learning_rate": 1.5949207060660138e-05, "loss": 11.7252, "step": 298 }, { "epoch": 0.04913520397682922, "grad_norm": 0.11245723068714142, "learning_rate": 1.5655380406016235e-05, "loss": 11.7133, "step": 299 }, { "epoch": 0.0492995357627049, "grad_norm": 0.2027348428964615, "learning_rate": 1.536378232452003e-05, "loss": 11.708, "step": 300 }, { "epoch": 0.0492995357627049, "eval_loss": 11.728962898254395, "eval_runtime": 48.8448, "eval_samples_per_second": 209.828, "eval_steps_per_second": 52.472, "step": 300 }, { "epoch": 0.049463867548580585, "grad_norm": 0.07305345684289932, "learning_rate": 1.5074431737553157e-05, "loss": 11.7376, "step": 301 }, { "epoch": 0.049628199334456266, "grad_norm": 0.060168150812387466, "learning_rate": 1.4787347420660541e-05, "loss": 11.7355, "step": 302 }, { "epoch": 0.04979253112033195, "grad_norm": 0.05955510959029198, "learning_rate": 1.4502548002332088e-05, "loss": 11.7376, "step": 303 }, { "epoch": 0.049956862906207634, "grad_norm": 0.051338743418455124, "learning_rate": 1.422005196279395e-05, "loss": 11.7334, "step": 304 }, { "epoch": 0.050121194692083315, "grad_norm": 0.040862489491701126, "learning_rate": 1.3939877632809278e-05, "loss": 11.7356, "step": 305 }, { "epoch": 0.050285526477958996, "grad_norm": 0.05390212684869766, "learning_rate": 1.3662043192488849e-05, "loss": 11.7341, "step": 306 }, { "epoch": 0.050449858263834683, "grad_norm": 0.049856122583150864, "learning_rate": 1.338656667011134e-05, "loss": 11.7368, "step": 307 }, { "epoch": 0.050614190049710364, "grad_norm": 0.05018840357661247, "learning_rate": 1.3113465940953495e-05, "loss": 11.7294, "step": 308 }, { "epoch": 0.050778521835586045, "grad_norm": 0.041660480201244354, "learning_rate": 1.2842758726130283e-05, "loss": 11.7319, "step": 309 }, { "epoch": 0.05094285362146173, "grad_norm": 0.034591760486364365, "learning_rate": 1.257446259144494e-05, "loss": 11.7332, "step": 310 }, { "epoch": 0.05110718540733741, "grad_norm": 0.04666655510663986, "learning_rate": 1.2308594946249163e-05, "loss": 11.7297, "step": 311 }, { "epoch": 0.051271517193213094, "grad_norm": 0.050986479967832565, "learning_rate": 1.204517304231343e-05, "loss": 11.7347, "step": 312 }, { "epoch": 0.05143584897908878, "grad_norm": 0.05234253406524658, "learning_rate": 1.178421397270758e-05, "loss": 11.7342, "step": 313 }, { "epoch": 0.05160018076496446, "grad_norm": 0.05376161262392998, "learning_rate": 1.1525734670691701e-05, "loss": 11.7288, "step": 314 }, { "epoch": 0.05176451255084014, "grad_norm": 0.043203748762607574, "learning_rate": 1.1269751908617277e-05, "loss": 11.728, "step": 315 }, { "epoch": 0.05192884433671583, "grad_norm": 0.040804892778396606, "learning_rate": 1.1016282296838887e-05, "loss": 11.7346, "step": 316 }, { "epoch": 0.05209317612259151, "grad_norm": 0.05142979696393013, "learning_rate": 1.0765342282636416e-05, "loss": 11.7333, "step": 317 }, { "epoch": 0.05225750790846719, "grad_norm": 0.059128034859895706, "learning_rate": 1.0516948149147754e-05, "loss": 11.7294, "step": 318 }, { "epoch": 0.05242183969434288, "grad_norm": 0.05118521302938461, "learning_rate": 1.0271116014312293e-05, "loss": 11.7303, "step": 319 }, { "epoch": 0.05258617148021856, "grad_norm": 0.049545012414455414, "learning_rate": 1.0027861829824952e-05, "loss": 11.7304, "step": 320 }, { "epoch": 0.05275050326609424, "grad_norm": 0.044770702719688416, "learning_rate": 9.787201380101157e-06, "loss": 11.7317, "step": 321 }, { "epoch": 0.05291483505196993, "grad_norm": 0.05450345575809479, "learning_rate": 9.549150281252633e-06, "loss": 11.7286, "step": 322 }, { "epoch": 0.05307916683784561, "grad_norm": 0.04360397160053253, "learning_rate": 9.313723980074018e-06, "loss": 11.7299, "step": 323 }, { "epoch": 0.05324349862372129, "grad_norm": 0.06776392459869385, "learning_rate": 9.080937753040646e-06, "loss": 11.7297, "step": 324 }, { "epoch": 0.05340783040959698, "grad_norm": 0.047481000423431396, "learning_rate": 8.850806705317183e-06, "loss": 11.7295, "step": 325 }, { "epoch": 0.05357216219547266, "grad_norm": 0.04181436076760292, "learning_rate": 8.623345769777514e-06, "loss": 11.7312, "step": 326 }, { "epoch": 0.05373649398134834, "grad_norm": 0.056467074900865555, "learning_rate": 8.398569706035792e-06, "loss": 11.7301, "step": 327 }, { "epoch": 0.05390082576722403, "grad_norm": 0.047711823135614395, "learning_rate": 8.176493099488663e-06, "loss": 11.733, "step": 328 }, { "epoch": 0.05406515755309971, "grad_norm": 0.05818518251180649, "learning_rate": 7.957130360368898e-06, "loss": 11.7269, "step": 329 }, { "epoch": 0.05422948933897539, "grad_norm": 0.058100625872612, "learning_rate": 7.740495722810271e-06, "loss": 11.7265, "step": 330 }, { "epoch": 0.054393821124851076, "grad_norm": 0.046868883073329926, "learning_rate": 7.526603243923957e-06, "loss": 11.7262, "step": 331 }, { "epoch": 0.05455815291072676, "grad_norm": 0.04526424780488014, "learning_rate": 7.315466802886401e-06, "loss": 11.7275, "step": 332 }, { "epoch": 0.05472248469660244, "grad_norm": 0.06406047940254211, "learning_rate": 7.107100100038671e-06, "loss": 11.7313, "step": 333 }, { "epoch": 0.054886816482478125, "grad_norm": 0.04994950816035271, "learning_rate": 6.901516655997536e-06, "loss": 11.7246, "step": 334 }, { "epoch": 0.055051148268353806, "grad_norm": 0.054884202778339386, "learning_rate": 6.698729810778065e-06, "loss": 11.7282, "step": 335 }, { "epoch": 0.055215480054229486, "grad_norm": 0.05075803026556969, "learning_rate": 6.498752722928042e-06, "loss": 11.7321, "step": 336 }, { "epoch": 0.055379811840105174, "grad_norm": 0.04193416237831116, "learning_rate": 6.301598368674105e-06, "loss": 11.7267, "step": 337 }, { "epoch": 0.055544143625980855, "grad_norm": 0.04416705667972565, "learning_rate": 6.107279541079769e-06, "loss": 11.7181, "step": 338 }, { "epoch": 0.055708475411856535, "grad_norm": 0.052217379212379456, "learning_rate": 5.915808849215304e-06, "loss": 11.7238, "step": 339 }, { "epoch": 0.05587280719773222, "grad_norm": 0.050664108246564865, "learning_rate": 5.727198717339511e-06, "loss": 11.7246, "step": 340 }, { "epoch": 0.056037138983607904, "grad_norm": 0.04814159497618675, "learning_rate": 5.54146138409355e-06, "loss": 11.7278, "step": 341 }, { "epoch": 0.056201470769483584, "grad_norm": 0.05136456713080406, "learning_rate": 5.358608901706802e-06, "loss": 11.7274, "step": 342 }, { "epoch": 0.05636580255535927, "grad_norm": 0.048106636852025986, "learning_rate": 5.178653135214812e-06, "loss": 11.7243, "step": 343 }, { "epoch": 0.05653013434123495, "grad_norm": 0.058061935007572174, "learning_rate": 5.001605761689398e-06, "loss": 11.7241, "step": 344 }, { "epoch": 0.05669446612711063, "grad_norm": 0.06779101490974426, "learning_rate": 4.827478269480895e-06, "loss": 11.7196, "step": 345 }, { "epoch": 0.05685879791298632, "grad_norm": 0.05850118398666382, "learning_rate": 4.65628195747273e-06, "loss": 11.7234, "step": 346 }, { "epoch": 0.057023129698862, "grad_norm": 0.08406161516904831, "learning_rate": 4.488027934348271e-06, "loss": 11.7165, "step": 347 }, { "epoch": 0.05718746148473768, "grad_norm": 0.08055326342582703, "learning_rate": 4.322727117869951e-06, "loss": 11.7168, "step": 348 }, { "epoch": 0.05735179327061337, "grad_norm": 0.11211990565061569, "learning_rate": 4.16039023417088e-06, "loss": 11.7213, "step": 349 }, { "epoch": 0.05751612505648905, "grad_norm": 0.202422633767128, "learning_rate": 4.001027817058789e-06, "loss": 11.705, "step": 350 }, { "epoch": 0.05751612505648905, "eval_loss": 11.728860855102539, "eval_runtime": 48.8743, "eval_samples_per_second": 209.701, "eval_steps_per_second": 52.441, "step": 350 }, { "epoch": 0.05768045684236473, "grad_norm": 0.08208485692739487, "learning_rate": 3.844650207332562e-06, "loss": 11.7381, "step": 351 }, { "epoch": 0.05784478862824042, "grad_norm": 0.07359649240970612, "learning_rate": 3.691267552111183e-06, "loss": 11.7384, "step": 352 }, { "epoch": 0.0580091204141161, "grad_norm": 0.06210344284772873, "learning_rate": 3.54088980417534e-06, "loss": 11.7368, "step": 353 }, { "epoch": 0.05817345219999178, "grad_norm": 0.055169302970170975, "learning_rate": 3.393526721321616e-06, "loss": 11.7347, "step": 354 }, { "epoch": 0.05833778398586747, "grad_norm": 0.05137772858142853, "learning_rate": 3.249187865729264e-06, "loss": 11.7372, "step": 355 }, { "epoch": 0.05850211577174315, "grad_norm": 0.05137026309967041, "learning_rate": 3.1078826033397843e-06, "loss": 11.7329, "step": 356 }, { "epoch": 0.05866644755761883, "grad_norm": 0.04226312413811684, "learning_rate": 2.9696201032491434e-06, "loss": 11.7324, "step": 357 }, { "epoch": 0.05883077934349452, "grad_norm": 0.03976316750049591, "learning_rate": 2.8344093371128424e-06, "loss": 11.7344, "step": 358 }, { "epoch": 0.0589951111293702, "grad_norm": 0.04595496505498886, "learning_rate": 2.70225907856374e-06, "loss": 11.7316, "step": 359 }, { "epoch": 0.05915944291524588, "grad_norm": 0.051731742918491364, "learning_rate": 2.573177902642726e-06, "loss": 11.7312, "step": 360 }, { "epoch": 0.059323774701121566, "grad_norm": 0.04073851928114891, "learning_rate": 2.4471741852423237e-06, "loss": 11.7325, "step": 361 }, { "epoch": 0.05948810648699725, "grad_norm": 0.038902051746845245, "learning_rate": 2.324256102563188e-06, "loss": 11.7311, "step": 362 }, { "epoch": 0.05965243827287293, "grad_norm": 0.0553940124809742, "learning_rate": 2.204431630583548e-06, "loss": 11.7336, "step": 363 }, { "epoch": 0.059816770058748615, "grad_norm": 0.05747877433896065, "learning_rate": 2.087708544541689e-06, "loss": 11.7305, "step": 364 }, { "epoch": 0.059981101844624296, "grad_norm": 0.05037953704595566, "learning_rate": 1.974094418431388e-06, "loss": 11.7329, "step": 365 }, { "epoch": 0.06014543363049998, "grad_norm": 0.047859761863946915, "learning_rate": 1.8635966245104664e-06, "loss": 11.7308, "step": 366 }, { "epoch": 0.060309765416375664, "grad_norm": 0.04152829200029373, "learning_rate": 1.7562223328224325e-06, "loss": 11.7284, "step": 367 }, { "epoch": 0.060474097202251345, "grad_norm": 0.042301371693611145, "learning_rate": 1.6519785107311891e-06, "loss": 11.732, "step": 368 }, { "epoch": 0.060638428988127026, "grad_norm": 0.057195208966732025, "learning_rate": 1.5508719224689717e-06, "loss": 11.7318, "step": 369 }, { "epoch": 0.06080276077400271, "grad_norm": 0.052724309265613556, "learning_rate": 1.4529091286973995e-06, "loss": 11.7308, "step": 370 }, { "epoch": 0.060967092559878394, "grad_norm": 0.04090241715312004, "learning_rate": 1.358096486081778e-06, "loss": 11.733, "step": 371 }, { "epoch": 0.061131424345754075, "grad_norm": 0.043258052319288254, "learning_rate": 1.2664401468786114e-06, "loss": 11.731, "step": 372 }, { "epoch": 0.06129575613162976, "grad_norm": 0.04883791506290436, "learning_rate": 1.1779460585363944e-06, "loss": 11.7319, "step": 373 }, { "epoch": 0.06146008791750544, "grad_norm": 0.05210534855723381, "learning_rate": 1.0926199633097157e-06, "loss": 11.7308, "step": 374 }, { "epoch": 0.061624419703381124, "grad_norm": 0.05686335638165474, "learning_rate": 1.0104673978866164e-06, "loss": 11.7305, "step": 375 }, { "epoch": 0.06178875148925681, "grad_norm": 0.08011097460985184, "learning_rate": 9.314936930293283e-07, "loss": 11.7287, "step": 376 }, { "epoch": 0.06195308327513249, "grad_norm": 0.059680283069610596, "learning_rate": 8.557039732283944e-07, "loss": 11.7319, "step": 377 }, { "epoch": 0.06211741506100817, "grad_norm": 0.07159244269132614, "learning_rate": 7.83103156370113e-07, "loss": 11.7292, "step": 378 }, { "epoch": 0.06228174684688386, "grad_norm": 0.04805372655391693, "learning_rate": 7.136959534174592e-07, "loss": 11.7277, "step": 379 }, { "epoch": 0.06244607863275954, "grad_norm": 0.03767850250005722, "learning_rate": 6.474868681043578e-07, "loss": 11.7282, "step": 380 }, { "epoch": 0.06261041041863523, "grad_norm": 0.04085582122206688, "learning_rate": 5.844801966434832e-07, "loss": 11.7256, "step": 381 }, { "epoch": 0.0627747422045109, "grad_norm": 0.04595988988876343, "learning_rate": 5.246800274474439e-07, "loss": 11.7294, "step": 382 }, { "epoch": 0.06293907399038659, "grad_norm": 0.040494050830602646, "learning_rate": 4.680902408635335e-07, "loss": 11.7289, "step": 383 }, { "epoch": 0.06310340577626228, "grad_norm": 0.04093782976269722, "learning_rate": 4.1471450892189846e-07, "loss": 11.7281, "step": 384 }, { "epoch": 0.06326773756213795, "grad_norm": 0.05765712633728981, "learning_rate": 3.6455629509730136e-07, "loss": 11.7256, "step": 385 }, { "epoch": 0.06343206934801364, "grad_norm": 0.045128606259822845, "learning_rate": 3.1761885408435054e-07, "loss": 11.7282, "step": 386 }, { "epoch": 0.06359640113388933, "grad_norm": 0.05226225033402443, "learning_rate": 2.7390523158633554e-07, "loss": 11.7289, "step": 387 }, { "epoch": 0.063760732919765, "grad_norm": 0.046483978629112244, "learning_rate": 2.334182641175686e-07, "loss": 11.7242, "step": 388 }, { "epoch": 0.06392506470564069, "grad_norm": 0.062402740120887756, "learning_rate": 1.9616057881935436e-07, "loss": 11.7259, "step": 389 }, { "epoch": 0.06408939649151638, "grad_norm": 0.04270698502659798, "learning_rate": 1.6213459328950352e-07, "loss": 11.7308, "step": 390 }, { "epoch": 0.06425372827739205, "grad_norm": 0.06238847225904465, "learning_rate": 1.3134251542544774e-07, "loss": 11.7215, "step": 391 }, { "epoch": 0.06441806006326774, "grad_norm": 0.04868527129292488, "learning_rate": 1.0378634328099269e-07, "loss": 11.7282, "step": 392 }, { "epoch": 0.06458239184914343, "grad_norm": 0.05947161093354225, "learning_rate": 7.946786493666647e-08, "loss": 11.7183, "step": 393 }, { "epoch": 0.0647467236350191, "grad_norm": 0.04062836617231369, "learning_rate": 5.838865838366792e-08, "loss": 11.728, "step": 394 }, { "epoch": 0.06491105542089479, "grad_norm": 0.06852062791585922, "learning_rate": 4.055009142152067e-08, "loss": 11.7225, "step": 395 }, { "epoch": 0.06507538720677047, "grad_norm": 0.07569117099046707, "learning_rate": 2.595332156925534e-08, "loss": 11.723, "step": 396 }, { "epoch": 0.06523971899264615, "grad_norm": 0.08385792374610901, "learning_rate": 1.4599295990352924e-08, "loss": 11.7212, "step": 397 }, { "epoch": 0.06540405077852184, "grad_norm": 0.08387003093957901, "learning_rate": 6.488751431266149e-09, "loss": 11.7231, "step": 398 }, { "epoch": 0.06556838256439752, "grad_norm": 0.10819617658853531, "learning_rate": 1.622214173602199e-09, "loss": 11.7179, "step": 399 }, { "epoch": 0.0657327143502732, "grad_norm": 0.1437562257051468, "learning_rate": 0.0, "loss": 11.7059, "step": 400 }, { "epoch": 0.0657327143502732, "eval_loss": 11.728843688964844, "eval_runtime": 48.2858, "eval_samples_per_second": 212.257, "eval_steps_per_second": 53.08, "step": 400 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 165738341990400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }