{ "best_metric": 1.8335492610931396, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.7215007215007215, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001443001443001443, "grad_norm": 0.21467295289039612, "learning_rate": 5e-06, "loss": 2.0025, "step": 1 }, { "epoch": 0.001443001443001443, "eval_loss": 2.499330997467041, "eval_runtime": 155.4624, "eval_samples_per_second": 7.507, "eval_steps_per_second": 1.878, "step": 1 }, { "epoch": 0.002886002886002886, "grad_norm": 0.31256306171417236, "learning_rate": 1e-05, "loss": 1.821, "step": 2 }, { "epoch": 0.004329004329004329, "grad_norm": 0.3174012303352356, "learning_rate": 1.5e-05, "loss": 1.9903, "step": 3 }, { "epoch": 0.005772005772005772, "grad_norm": 0.3661087155342102, "learning_rate": 2e-05, "loss": 2.0412, "step": 4 }, { "epoch": 0.007215007215007215, "grad_norm": 0.34538447856903076, "learning_rate": 2.5e-05, "loss": 1.9811, "step": 5 }, { "epoch": 0.008658008658008658, "grad_norm": 0.3496937155723572, "learning_rate": 3e-05, "loss": 2.1532, "step": 6 }, { "epoch": 0.010101010101010102, "grad_norm": 0.3897201418876648, "learning_rate": 3.5e-05, "loss": 2.0584, "step": 7 }, { "epoch": 0.011544011544011544, "grad_norm": 0.359721839427948, "learning_rate": 4e-05, "loss": 2.0975, "step": 8 }, { "epoch": 0.012987012987012988, "grad_norm": 0.368932843208313, "learning_rate": 4.5e-05, "loss": 2.0563, "step": 9 }, { "epoch": 0.01443001443001443, "grad_norm": 0.46016350388526917, "learning_rate": 5e-05, "loss": 1.9935, "step": 10 }, { "epoch": 0.015873015873015872, "grad_norm": 0.49919214844703674, "learning_rate": 5.500000000000001e-05, "loss": 2.0946, "step": 11 }, { "epoch": 0.017316017316017316, "grad_norm": 0.4046800136566162, "learning_rate": 6e-05, "loss": 2.2127, "step": 12 }, { "epoch": 0.01875901875901876, "grad_norm": 0.40110981464385986, "learning_rate": 6.500000000000001e-05, "loss": 2.0404, "step": 13 }, { "epoch": 0.020202020202020204, "grad_norm": 0.4125487506389618, "learning_rate": 7e-05, "loss": 2.0546, "step": 14 }, { "epoch": 0.021645021645021644, "grad_norm": 0.3947140872478485, "learning_rate": 7.500000000000001e-05, "loss": 2.164, "step": 15 }, { "epoch": 0.023088023088023088, "grad_norm": 0.4024007022380829, "learning_rate": 8e-05, "loss": 2.225, "step": 16 }, { "epoch": 0.024531024531024532, "grad_norm": 0.4380939304828644, "learning_rate": 8.5e-05, "loss": 2.128, "step": 17 }, { "epoch": 0.025974025974025976, "grad_norm": 0.409138947725296, "learning_rate": 9e-05, "loss": 2.0402, "step": 18 }, { "epoch": 0.027417027417027416, "grad_norm": 0.5016234517097473, "learning_rate": 9.5e-05, "loss": 2.0481, "step": 19 }, { "epoch": 0.02886002886002886, "grad_norm": 0.4870470464229584, "learning_rate": 0.0001, "loss": 2.1388, "step": 20 }, { "epoch": 0.030303030303030304, "grad_norm": 0.46186015009880066, "learning_rate": 9.999892908320647e-05, "loss": 1.8981, "step": 21 }, { "epoch": 0.031746031746031744, "grad_norm": 0.4449887275695801, "learning_rate": 9.999571637870036e-05, "loss": 1.959, "step": 22 }, { "epoch": 0.03318903318903319, "grad_norm": 0.4438563585281372, "learning_rate": 9.999036202410325e-05, "loss": 2.0846, "step": 23 }, { "epoch": 0.03463203463203463, "grad_norm": 0.48415109515190125, "learning_rate": 9.998286624877786e-05, "loss": 1.6974, "step": 24 }, { "epoch": 0.03607503607503607, "grad_norm": 0.44993868470191956, "learning_rate": 9.997322937381829e-05, "loss": 2.21, "step": 25 }, { "epoch": 0.03751803751803752, "grad_norm": 0.49077942967414856, "learning_rate": 9.996145181203615e-05, "loss": 1.9039, "step": 26 }, { "epoch": 0.03896103896103896, "grad_norm": 0.4759511947631836, "learning_rate": 9.994753406794301e-05, "loss": 2.018, "step": 27 }, { "epoch": 0.04040404040404041, "grad_norm": 0.4197351634502411, "learning_rate": 9.99314767377287e-05, "loss": 1.866, "step": 28 }, { "epoch": 0.04184704184704185, "grad_norm": 0.43352821469306946, "learning_rate": 9.991328050923581e-05, "loss": 1.8513, "step": 29 }, { "epoch": 0.04329004329004329, "grad_norm": 0.4750743508338928, "learning_rate": 9.989294616193017e-05, "loss": 2.0907, "step": 30 }, { "epoch": 0.044733044733044736, "grad_norm": 0.48469865322113037, "learning_rate": 9.98704745668676e-05, "loss": 1.833, "step": 31 }, { "epoch": 0.046176046176046176, "grad_norm": 0.47763270139694214, "learning_rate": 9.98458666866564e-05, "loss": 2.1126, "step": 32 }, { "epoch": 0.047619047619047616, "grad_norm": 0.4813288748264313, "learning_rate": 9.981912357541627e-05, "loss": 2.1389, "step": 33 }, { "epoch": 0.049062049062049064, "grad_norm": 0.5742578506469727, "learning_rate": 9.97902463787331e-05, "loss": 1.9601, "step": 34 }, { "epoch": 0.050505050505050504, "grad_norm": 0.5102812051773071, "learning_rate": 9.975923633360985e-05, "loss": 2.0476, "step": 35 }, { "epoch": 0.05194805194805195, "grad_norm": 0.5013228058815002, "learning_rate": 9.972609476841367e-05, "loss": 1.8781, "step": 36 }, { "epoch": 0.05339105339105339, "grad_norm": 0.5211206078529358, "learning_rate": 9.969082310281891e-05, "loss": 2.0521, "step": 37 }, { "epoch": 0.05483405483405483, "grad_norm": 0.6096839904785156, "learning_rate": 9.965342284774632e-05, "loss": 1.7361, "step": 38 }, { "epoch": 0.05627705627705628, "grad_norm": 0.6038843393325806, "learning_rate": 9.961389560529836e-05, "loss": 1.8981, "step": 39 }, { "epoch": 0.05772005772005772, "grad_norm": 0.5440136790275574, "learning_rate": 9.957224306869053e-05, "loss": 1.8275, "step": 40 }, { "epoch": 0.05916305916305916, "grad_norm": 0.8856310844421387, "learning_rate": 9.952846702217886e-05, "loss": 2.063, "step": 41 }, { "epoch": 0.06060606060606061, "grad_norm": 0.8176852464675903, "learning_rate": 9.948256934098352e-05, "loss": 1.6802, "step": 42 }, { "epoch": 0.06204906204906205, "grad_norm": 0.8885481953620911, "learning_rate": 9.943455199120837e-05, "loss": 1.8003, "step": 43 }, { "epoch": 0.06349206349206349, "grad_norm": 0.746778666973114, "learning_rate": 9.938441702975689e-05, "loss": 1.985, "step": 44 }, { "epoch": 0.06493506493506493, "grad_norm": 0.7613317966461182, "learning_rate": 9.933216660424395e-05, "loss": 1.8953, "step": 45 }, { "epoch": 0.06637806637806638, "grad_norm": 0.9400774836540222, "learning_rate": 9.927780295290389e-05, "loss": 1.919, "step": 46 }, { "epoch": 0.06782106782106782, "grad_norm": 0.8416843414306641, "learning_rate": 9.922132840449459e-05, "loss": 1.8115, "step": 47 }, { "epoch": 0.06926406926406926, "grad_norm": 0.8849237561225891, "learning_rate": 9.916274537819775e-05, "loss": 1.9293, "step": 48 }, { "epoch": 0.0707070707070707, "grad_norm": 0.9754048585891724, "learning_rate": 9.91020563835152e-05, "loss": 1.7725, "step": 49 }, { "epoch": 0.07215007215007214, "grad_norm": 1.8871734142303467, "learning_rate": 9.903926402016153e-05, "loss": 1.8307, "step": 50 }, { "epoch": 0.0735930735930736, "grad_norm": 0.7267559766769409, "learning_rate": 9.897437097795257e-05, "loss": 2.0867, "step": 51 }, { "epoch": 0.07503607503607504, "grad_norm": 0.8035733699798584, "learning_rate": 9.890738003669029e-05, "loss": 1.7587, "step": 52 }, { "epoch": 0.07647907647907648, "grad_norm": 0.6840614080429077, "learning_rate": 9.883829406604363e-05, "loss": 1.9629, "step": 53 }, { "epoch": 0.07792207792207792, "grad_norm": 0.49338486790657043, "learning_rate": 9.876711602542563e-05, "loss": 2.0719, "step": 54 }, { "epoch": 0.07936507936507936, "grad_norm": 0.3963480591773987, "learning_rate": 9.869384896386668e-05, "loss": 2.1635, "step": 55 }, { "epoch": 0.08080808080808081, "grad_norm": 0.3595295250415802, "learning_rate": 9.861849601988383e-05, "loss": 1.8126, "step": 56 }, { "epoch": 0.08225108225108226, "grad_norm": 0.3469742238521576, "learning_rate": 9.854106042134641e-05, "loss": 2.0279, "step": 57 }, { "epoch": 0.0836940836940837, "grad_norm": 0.33953097462654114, "learning_rate": 9.846154548533773e-05, "loss": 1.9088, "step": 58 }, { "epoch": 0.08513708513708514, "grad_norm": 0.305743932723999, "learning_rate": 9.837995461801299e-05, "loss": 2.0343, "step": 59 }, { "epoch": 0.08658008658008658, "grad_norm": 0.3304813802242279, "learning_rate": 9.829629131445342e-05, "loss": 2.0815, "step": 60 }, { "epoch": 0.08802308802308802, "grad_norm": 0.34086668491363525, "learning_rate": 9.821055915851647e-05, "loss": 2.1491, "step": 61 }, { "epoch": 0.08946608946608947, "grad_norm": 0.38692232966423035, "learning_rate": 9.812276182268236e-05, "loss": 1.9965, "step": 62 }, { "epoch": 0.09090909090909091, "grad_norm": 0.3496730625629425, "learning_rate": 9.803290306789676e-05, "loss": 2.0397, "step": 63 }, { "epoch": 0.09235209235209235, "grad_norm": 0.384067565202713, "learning_rate": 9.794098674340965e-05, "loss": 2.0058, "step": 64 }, { "epoch": 0.09379509379509379, "grad_norm": 0.3466632068157196, "learning_rate": 9.784701678661045e-05, "loss": 2.066, "step": 65 }, { "epoch": 0.09523809523809523, "grad_norm": 0.31923907995224, "learning_rate": 9.775099722285935e-05, "loss": 1.8307, "step": 66 }, { "epoch": 0.09668109668109669, "grad_norm": 0.39498981833457947, "learning_rate": 9.765293216531486e-05, "loss": 2.0947, "step": 67 }, { "epoch": 0.09812409812409813, "grad_norm": 0.36418530344963074, "learning_rate": 9.755282581475769e-05, "loss": 2.1766, "step": 68 }, { "epoch": 0.09956709956709957, "grad_norm": 0.3260754346847534, "learning_rate": 9.74506824594107e-05, "loss": 1.9282, "step": 69 }, { "epoch": 0.10101010101010101, "grad_norm": 0.40958088636398315, "learning_rate": 9.73465064747553e-05, "loss": 1.7713, "step": 70 }, { "epoch": 0.10245310245310245, "grad_norm": 0.4162747859954834, "learning_rate": 9.724030232334391e-05, "loss": 2.0031, "step": 71 }, { "epoch": 0.1038961038961039, "grad_norm": 0.41455021500587463, "learning_rate": 9.713207455460894e-05, "loss": 2.1278, "step": 72 }, { "epoch": 0.10533910533910534, "grad_norm": 0.40532439947128296, "learning_rate": 9.702182780466775e-05, "loss": 2.0676, "step": 73 }, { "epoch": 0.10678210678210678, "grad_norm": 0.3800431787967682, "learning_rate": 9.690956679612421e-05, "loss": 1.9015, "step": 74 }, { "epoch": 0.10822510822510822, "grad_norm": 0.4204113185405731, "learning_rate": 9.67952963378663e-05, "loss": 1.9664, "step": 75 }, { "epoch": 0.10966810966810966, "grad_norm": 0.4439535140991211, "learning_rate": 9.667902132486009e-05, "loss": 1.9959, "step": 76 }, { "epoch": 0.1111111111111111, "grad_norm": 0.4069725573062897, "learning_rate": 9.656074673794018e-05, "loss": 1.9064, "step": 77 }, { "epoch": 0.11255411255411256, "grad_norm": 0.42558419704437256, "learning_rate": 9.644047764359622e-05, "loss": 1.9374, "step": 78 }, { "epoch": 0.113997113997114, "grad_norm": 0.46812930703163147, "learning_rate": 9.631821919375591e-05, "loss": 1.8003, "step": 79 }, { "epoch": 0.11544011544011544, "grad_norm": 0.4569174349308014, "learning_rate": 9.619397662556435e-05, "loss": 1.7006, "step": 80 }, { "epoch": 0.11688311688311688, "grad_norm": 0.4715164601802826, "learning_rate": 9.606775526115963e-05, "loss": 1.9712, "step": 81 }, { "epoch": 0.11832611832611832, "grad_norm": 0.4455204904079437, "learning_rate": 9.593956050744492e-05, "loss": 1.8968, "step": 82 }, { "epoch": 0.11976911976911978, "grad_norm": 0.5169462561607361, "learning_rate": 9.580939785585681e-05, "loss": 1.963, "step": 83 }, { "epoch": 0.12121212121212122, "grad_norm": 0.4874756932258606, "learning_rate": 9.567727288213005e-05, "loss": 1.943, "step": 84 }, { "epoch": 0.12265512265512266, "grad_norm": 0.4800645112991333, "learning_rate": 9.554319124605879e-05, "loss": 1.8902, "step": 85 }, { "epoch": 0.1240981240981241, "grad_norm": 0.48956722021102905, "learning_rate": 9.540715869125407e-05, "loss": 1.7206, "step": 86 }, { "epoch": 0.12554112554112554, "grad_norm": 0.5349124670028687, "learning_rate": 9.526918104489777e-05, "loss": 1.8946, "step": 87 }, { "epoch": 0.12698412698412698, "grad_norm": 0.5947434306144714, "learning_rate": 9.512926421749304e-05, "loss": 1.8111, "step": 88 }, { "epoch": 0.12842712842712842, "grad_norm": 0.5332505702972412, "learning_rate": 9.498741420261108e-05, "loss": 1.8397, "step": 89 }, { "epoch": 0.12987012987012986, "grad_norm": 0.5390715599060059, "learning_rate": 9.484363707663442e-05, "loss": 1.847, "step": 90 }, { "epoch": 0.13131313131313133, "grad_norm": 0.5333109498023987, "learning_rate": 9.469793899849661e-05, "loss": 1.912, "step": 91 }, { "epoch": 0.13275613275613277, "grad_norm": 0.5515038967132568, "learning_rate": 9.45503262094184e-05, "loss": 1.8217, "step": 92 }, { "epoch": 0.1341991341991342, "grad_norm": 0.615770161151886, "learning_rate": 9.440080503264037e-05, "loss": 1.854, "step": 93 }, { "epoch": 0.13564213564213565, "grad_norm": 0.6496592164039612, "learning_rate": 9.42493818731521e-05, "loss": 1.7574, "step": 94 }, { "epoch": 0.1370851370851371, "grad_norm": 0.7257400155067444, "learning_rate": 9.409606321741775e-05, "loss": 1.8909, "step": 95 }, { "epoch": 0.13852813852813853, "grad_norm": 0.7065343856811523, "learning_rate": 9.394085563309827e-05, "loss": 1.9357, "step": 96 }, { "epoch": 0.13997113997113997, "grad_norm": 0.7580714225769043, "learning_rate": 9.378376576876999e-05, "loss": 1.9127, "step": 97 }, { "epoch": 0.1414141414141414, "grad_norm": 0.8716478943824768, "learning_rate": 9.362480035363986e-05, "loss": 1.7791, "step": 98 }, { "epoch": 0.14285714285714285, "grad_norm": 1.0269742012023926, "learning_rate": 9.34639661972572e-05, "loss": 1.8904, "step": 99 }, { "epoch": 0.1443001443001443, "grad_norm": 1.5355191230773926, "learning_rate": 9.330127018922194e-05, "loss": 1.8833, "step": 100 }, { "epoch": 0.1443001443001443, "eval_loss": 1.993816614151001, "eval_runtime": 157.0351, "eval_samples_per_second": 7.431, "eval_steps_per_second": 1.859, "step": 100 }, { "epoch": 0.14574314574314573, "grad_norm": 0.742597222328186, "learning_rate": 9.31367192988896e-05, "loss": 2.0576, "step": 101 }, { "epoch": 0.1471861471861472, "grad_norm": 0.8394694924354553, "learning_rate": 9.297032057507264e-05, "loss": 1.7602, "step": 102 }, { "epoch": 0.14862914862914864, "grad_norm": 0.838948667049408, "learning_rate": 9.280208114573859e-05, "loss": 1.8816, "step": 103 }, { "epoch": 0.15007215007215008, "grad_norm": 0.7070848941802979, "learning_rate": 9.263200821770461e-05, "loss": 2.0543, "step": 104 }, { "epoch": 0.15151515151515152, "grad_norm": 0.44883328676223755, "learning_rate": 9.246010907632895e-05, "loss": 1.9549, "step": 105 }, { "epoch": 0.15295815295815296, "grad_norm": 0.35736629366874695, "learning_rate": 9.228639108519868e-05, "loss": 2.0421, "step": 106 }, { "epoch": 0.1544011544011544, "grad_norm": 0.3133707642555237, "learning_rate": 9.211086168581433e-05, "loss": 1.8866, "step": 107 }, { "epoch": 0.15584415584415584, "grad_norm": 0.31356483697891235, "learning_rate": 9.193352839727121e-05, "loss": 1.921, "step": 108 }, { "epoch": 0.15728715728715728, "grad_norm": 0.3296065926551819, "learning_rate": 9.175439881593716e-05, "loss": 2.036, "step": 109 }, { "epoch": 0.15873015873015872, "grad_norm": 0.28739431500434875, "learning_rate": 9.157348061512727e-05, "loss": 1.9744, "step": 110 }, { "epoch": 0.16017316017316016, "grad_norm": 0.3230794072151184, "learning_rate": 9.139078154477512e-05, "loss": 2.0474, "step": 111 }, { "epoch": 0.16161616161616163, "grad_norm": 0.31099987030029297, "learning_rate": 9.120630943110077e-05, "loss": 2.1352, "step": 112 }, { "epoch": 0.16305916305916307, "grad_norm": 0.31235870718955994, "learning_rate": 9.102007217627568e-05, "loss": 1.9676, "step": 113 }, { "epoch": 0.1645021645021645, "grad_norm": 0.39123886823654175, "learning_rate": 9.083207775808396e-05, "loss": 1.9695, "step": 114 }, { "epoch": 0.16594516594516595, "grad_norm": 0.3258761167526245, "learning_rate": 9.064233422958077e-05, "loss": 1.747, "step": 115 }, { "epoch": 0.1673881673881674, "grad_norm": 0.35148924589157104, "learning_rate": 9.045084971874738e-05, "loss": 1.983, "step": 116 }, { "epoch": 0.16883116883116883, "grad_norm": 0.3492424488067627, "learning_rate": 9.025763242814291e-05, "loss": 1.9037, "step": 117 }, { "epoch": 0.17027417027417027, "grad_norm": 0.4515904486179352, "learning_rate": 9.006269063455304e-05, "loss": 1.9855, "step": 118 }, { "epoch": 0.1717171717171717, "grad_norm": 0.3563421964645386, "learning_rate": 8.986603268863536e-05, "loss": 1.8676, "step": 119 }, { "epoch": 0.17316017316017315, "grad_norm": 0.4804840385913849, "learning_rate": 8.966766701456177e-05, "loss": 2.0058, "step": 120 }, { "epoch": 0.1746031746031746, "grad_norm": 0.4358946979045868, "learning_rate": 8.94676021096575e-05, "loss": 2.0668, "step": 121 }, { "epoch": 0.17604617604617603, "grad_norm": 0.4688352644443512, "learning_rate": 8.926584654403724e-05, "loss": 2.0926, "step": 122 }, { "epoch": 0.1774891774891775, "grad_norm": 0.38327309489250183, "learning_rate": 8.906240896023794e-05, "loss": 1.9646, "step": 123 }, { "epoch": 0.17893217893217894, "grad_norm": 0.3731294572353363, "learning_rate": 8.885729807284856e-05, "loss": 1.9799, "step": 124 }, { "epoch": 0.18037518037518038, "grad_norm": 0.4308861494064331, "learning_rate": 8.865052266813685e-05, "loss": 2.1767, "step": 125 }, { "epoch": 0.18181818181818182, "grad_norm": 0.41706499457359314, "learning_rate": 8.844209160367299e-05, "loss": 1.9402, "step": 126 }, { "epoch": 0.18326118326118326, "grad_norm": 0.4007774293422699, "learning_rate": 8.823201380795001e-05, "loss": 1.9648, "step": 127 }, { "epoch": 0.1847041847041847, "grad_norm": 0.38914182782173157, "learning_rate": 8.802029828000156e-05, "loss": 1.9749, "step": 128 }, { "epoch": 0.18614718614718614, "grad_norm": 0.4113757312297821, "learning_rate": 8.780695408901613e-05, "loss": 1.8051, "step": 129 }, { "epoch": 0.18759018759018758, "grad_norm": 0.44275519251823425, "learning_rate": 8.759199037394887e-05, "loss": 1.8252, "step": 130 }, { "epoch": 0.18903318903318903, "grad_norm": 0.43470901250839233, "learning_rate": 8.737541634312985e-05, "loss": 1.8967, "step": 131 }, { "epoch": 0.19047619047619047, "grad_norm": 0.48708173632621765, "learning_rate": 8.715724127386972e-05, "loss": 1.9375, "step": 132 }, { "epoch": 0.1919191919191919, "grad_norm": 0.4494212865829468, "learning_rate": 8.693747451206232e-05, "loss": 1.8855, "step": 133 }, { "epoch": 0.19336219336219337, "grad_norm": 0.42587026953697205, "learning_rate": 8.671612547178428e-05, "loss": 1.7339, "step": 134 }, { "epoch": 0.19480519480519481, "grad_norm": 0.5141189694404602, "learning_rate": 8.649320363489179e-05, "loss": 1.8482, "step": 135 }, { "epoch": 0.19624819624819625, "grad_norm": 0.559985876083374, "learning_rate": 8.626871855061438e-05, "loss": 1.9328, "step": 136 }, { "epoch": 0.1976911976911977, "grad_norm": 0.5099179744720459, "learning_rate": 8.604267983514594e-05, "loss": 1.8337, "step": 137 }, { "epoch": 0.19913419913419914, "grad_norm": 0.4592433273792267, "learning_rate": 8.581509717123273e-05, "loss": 1.7073, "step": 138 }, { "epoch": 0.20057720057720058, "grad_norm": 0.5081384181976318, "learning_rate": 8.558598030775857e-05, "loss": 1.8063, "step": 139 }, { "epoch": 0.20202020202020202, "grad_norm": 0.5941727757453918, "learning_rate": 8.535533905932738e-05, "loss": 1.7908, "step": 140 }, { "epoch": 0.20346320346320346, "grad_norm": 0.5501158833503723, "learning_rate": 8.51231833058426e-05, "loss": 1.8062, "step": 141 }, { "epoch": 0.2049062049062049, "grad_norm": 0.5739471316337585, "learning_rate": 8.488952299208401e-05, "loss": 1.9205, "step": 142 }, { "epoch": 0.20634920634920634, "grad_norm": 0.6430457830429077, "learning_rate": 8.46543681272818e-05, "loss": 2.075, "step": 143 }, { "epoch": 0.2077922077922078, "grad_norm": 0.6881362795829773, "learning_rate": 8.44177287846877e-05, "loss": 1.9087, "step": 144 }, { "epoch": 0.20923520923520925, "grad_norm": 0.6772449612617493, "learning_rate": 8.417961510114356e-05, "loss": 1.5338, "step": 145 }, { "epoch": 0.2106782106782107, "grad_norm": 0.727997362613678, "learning_rate": 8.39400372766471e-05, "loss": 1.8159, "step": 146 }, { "epoch": 0.21212121212121213, "grad_norm": 0.7379339337348938, "learning_rate": 8.36990055739149e-05, "loss": 1.6428, "step": 147 }, { "epoch": 0.21356421356421357, "grad_norm": 0.8291444182395935, "learning_rate": 8.345653031794292e-05, "loss": 1.8209, "step": 148 }, { "epoch": 0.215007215007215, "grad_norm": 0.8952499628067017, "learning_rate": 8.321262189556409e-05, "loss": 1.6022, "step": 149 }, { "epoch": 0.21645021645021645, "grad_norm": 1.9928690195083618, "learning_rate": 8.296729075500344e-05, "loss": 1.62, "step": 150 }, { "epoch": 0.2178932178932179, "grad_norm": 0.5600587725639343, "learning_rate": 8.272054740543052e-05, "loss": 1.9747, "step": 151 }, { "epoch": 0.21933621933621933, "grad_norm": 0.8041567802429199, "learning_rate": 8.247240241650918e-05, "loss": 2.0701, "step": 152 }, { "epoch": 0.22077922077922077, "grad_norm": 0.8225897550582886, "learning_rate": 8.222286641794488e-05, "loss": 2.0347, "step": 153 }, { "epoch": 0.2222222222222222, "grad_norm": 0.7567336559295654, "learning_rate": 8.197195009902924e-05, "loss": 1.7514, "step": 154 }, { "epoch": 0.22366522366522368, "grad_norm": 0.6722519993782043, "learning_rate": 8.171966420818228e-05, "loss": 1.9292, "step": 155 }, { "epoch": 0.22510822510822512, "grad_norm": 0.4068787395954132, "learning_rate": 8.146601955249188e-05, "loss": 1.9831, "step": 156 }, { "epoch": 0.22655122655122656, "grad_norm": 0.32383856177330017, "learning_rate": 8.121102699725089e-05, "loss": 1.915, "step": 157 }, { "epoch": 0.227994227994228, "grad_norm": 0.31018707156181335, "learning_rate": 8.095469746549172e-05, "loss": 2.0141, "step": 158 }, { "epoch": 0.22943722943722944, "grad_norm": 0.28106921911239624, "learning_rate": 8.069704193751832e-05, "loss": 1.9433, "step": 159 }, { "epoch": 0.23088023088023088, "grad_norm": 0.30933740735054016, "learning_rate": 8.043807145043604e-05, "loss": 2.11, "step": 160 }, { "epoch": 0.23232323232323232, "grad_norm": 0.3066120445728302, "learning_rate": 8.017779709767858e-05, "loss": 2.0969, "step": 161 }, { "epoch": 0.23376623376623376, "grad_norm": 0.3253624737262726, "learning_rate": 7.991623002853296e-05, "loss": 1.7843, "step": 162 }, { "epoch": 0.2352092352092352, "grad_norm": 0.3151222765445709, "learning_rate": 7.965338144766186e-05, "loss": 1.9197, "step": 163 }, { "epoch": 0.23665223665223664, "grad_norm": 0.38550567626953125, "learning_rate": 7.938926261462366e-05, "loss": 2.0804, "step": 164 }, { "epoch": 0.23809523809523808, "grad_norm": 0.3623102307319641, "learning_rate": 7.912388484339012e-05, "loss": 1.8096, "step": 165 }, { "epoch": 0.23953823953823955, "grad_norm": 0.39857810735702515, "learning_rate": 7.88572595018617e-05, "loss": 2.001, "step": 166 }, { "epoch": 0.240981240981241, "grad_norm": 0.39852413535118103, "learning_rate": 7.858939801138061e-05, "loss": 1.9444, "step": 167 }, { "epoch": 0.24242424242424243, "grad_norm": 0.3307431638240814, "learning_rate": 7.832031184624164e-05, "loss": 1.9379, "step": 168 }, { "epoch": 0.24386724386724387, "grad_norm": 0.3271358609199524, "learning_rate": 7.80500125332005e-05, "loss": 1.902, "step": 169 }, { "epoch": 0.2453102453102453, "grad_norm": 0.41468778252601624, "learning_rate": 7.777851165098012e-05, "loss": 1.9665, "step": 170 }, { "epoch": 0.24675324675324675, "grad_norm": 0.3892781138420105, "learning_rate": 7.750582082977467e-05, "loss": 1.8492, "step": 171 }, { "epoch": 0.2481962481962482, "grad_norm": 0.38811030983924866, "learning_rate": 7.723195175075136e-05, "loss": 1.9571, "step": 172 }, { "epoch": 0.24963924963924963, "grad_norm": 0.43744465708732605, "learning_rate": 7.695691614555003e-05, "loss": 2.0064, "step": 173 }, { "epoch": 0.2510822510822511, "grad_norm": 0.4178030788898468, "learning_rate": 7.668072579578058e-05, "loss": 2.0999, "step": 174 }, { "epoch": 0.25252525252525254, "grad_norm": 0.40274107456207275, "learning_rate": 7.64033925325184e-05, "loss": 1.8941, "step": 175 }, { "epoch": 0.25396825396825395, "grad_norm": 0.4556225836277008, "learning_rate": 7.612492823579745e-05, "loss": 2.0221, "step": 176 }, { "epoch": 0.2554112554112554, "grad_norm": 0.45292896032333374, "learning_rate": 7.584534483410137e-05, "loss": 1.8131, "step": 177 }, { "epoch": 0.25685425685425683, "grad_norm": 0.42167505621910095, "learning_rate": 7.55646543038526e-05, "loss": 2.1179, "step": 178 }, { "epoch": 0.2582972582972583, "grad_norm": 0.42378920316696167, "learning_rate": 7.528286866889924e-05, "loss": 2.1155, "step": 179 }, { "epoch": 0.2597402597402597, "grad_norm": 0.4680819511413574, "learning_rate": 7.500000000000001e-05, "loss": 2.0742, "step": 180 }, { "epoch": 0.2611832611832612, "grad_norm": 0.4424709975719452, "learning_rate": 7.471606041430723e-05, "loss": 2.0885, "step": 181 }, { "epoch": 0.26262626262626265, "grad_norm": 0.531936764717102, "learning_rate": 7.443106207484776e-05, "loss": 1.8219, "step": 182 }, { "epoch": 0.26406926406926406, "grad_norm": 0.39951208233833313, "learning_rate": 7.414501719000187e-05, "loss": 1.8678, "step": 183 }, { "epoch": 0.26551226551226553, "grad_norm": 0.44766274094581604, "learning_rate": 7.385793801298042e-05, "loss": 2.0432, "step": 184 }, { "epoch": 0.26695526695526695, "grad_norm": 0.4364171326160431, "learning_rate": 7.35698368412999e-05, "loss": 1.8403, "step": 185 }, { "epoch": 0.2683982683982684, "grad_norm": 0.3969670534133911, "learning_rate": 7.328072601625557e-05, "loss": 1.8147, "step": 186 }, { "epoch": 0.2698412698412698, "grad_norm": 0.4603928327560425, "learning_rate": 7.2990617922393e-05, "loss": 1.8538, "step": 187 }, { "epoch": 0.2712842712842713, "grad_norm": 0.4237711727619171, "learning_rate": 7.269952498697734e-05, "loss": 1.8051, "step": 188 }, { "epoch": 0.2727272727272727, "grad_norm": 0.5007184147834778, "learning_rate": 7.240745967946113e-05, "loss": 1.8868, "step": 189 }, { "epoch": 0.2741702741702742, "grad_norm": 0.5073036551475525, "learning_rate": 7.211443451095007e-05, "loss": 1.7303, "step": 190 }, { "epoch": 0.2756132756132756, "grad_norm": 0.5766152143478394, "learning_rate": 7.18204620336671e-05, "loss": 1.9035, "step": 191 }, { "epoch": 0.27705627705627706, "grad_norm": 0.5440102815628052, "learning_rate": 7.152555484041476e-05, "loss": 1.8722, "step": 192 }, { "epoch": 0.2784992784992785, "grad_norm": 0.5761400461196899, "learning_rate": 7.122972556403567e-05, "loss": 1.8134, "step": 193 }, { "epoch": 0.27994227994227994, "grad_norm": 0.6331032514572144, "learning_rate": 7.09329868768714e-05, "loss": 1.8894, "step": 194 }, { "epoch": 0.2813852813852814, "grad_norm": 0.6492578983306885, "learning_rate": 7.063535149021973e-05, "loss": 1.8969, "step": 195 }, { "epoch": 0.2828282828282828, "grad_norm": 0.6990649700164795, "learning_rate": 7.033683215379002e-05, "loss": 2.0232, "step": 196 }, { "epoch": 0.2842712842712843, "grad_norm": 0.778406023979187, "learning_rate": 7.003744165515705e-05, "loss": 1.8087, "step": 197 }, { "epoch": 0.2857142857142857, "grad_norm": 0.7730764150619507, "learning_rate": 6.973719281921335e-05, "loss": 1.8531, "step": 198 }, { "epoch": 0.28715728715728717, "grad_norm": 0.8641883730888367, "learning_rate": 6.943609850761979e-05, "loss": 1.9818, "step": 199 }, { "epoch": 0.2886002886002886, "grad_norm": 1.346280813217163, "learning_rate": 6.91341716182545e-05, "loss": 1.7613, "step": 200 }, { "epoch": 0.2886002886002886, "eval_loss": 1.930011510848999, "eval_runtime": 156.8582, "eval_samples_per_second": 7.44, "eval_steps_per_second": 1.862, "step": 200 }, { "epoch": 0.29004329004329005, "grad_norm": 0.33210301399230957, "learning_rate": 6.883142508466054e-05, "loss": 1.8346, "step": 201 }, { "epoch": 0.29148629148629146, "grad_norm": 0.48873820900917053, "learning_rate": 6.852787187549182e-05, "loss": 1.7156, "step": 202 }, { "epoch": 0.29292929292929293, "grad_norm": 0.5536623001098633, "learning_rate": 6.82235249939575e-05, "loss": 1.9248, "step": 203 }, { "epoch": 0.2943722943722944, "grad_norm": 0.5415230393409729, "learning_rate": 6.7918397477265e-05, "loss": 1.8952, "step": 204 }, { "epoch": 0.2958152958152958, "grad_norm": 0.4394938349723816, "learning_rate": 6.761250239606169e-05, "loss": 1.8813, "step": 205 }, { "epoch": 0.2972582972582973, "grad_norm": 0.37631645798683167, "learning_rate": 6.730585285387465e-05, "loss": 1.9335, "step": 206 }, { "epoch": 0.2987012987012987, "grad_norm": 0.3685065507888794, "learning_rate": 6.699846198654971e-05, "loss": 1.878, "step": 207 }, { "epoch": 0.30014430014430016, "grad_norm": 0.2967827618122101, "learning_rate": 6.669034296168855e-05, "loss": 1.9316, "step": 208 }, { "epoch": 0.30158730158730157, "grad_norm": 0.3100146949291229, "learning_rate": 6.638150897808468e-05, "loss": 1.9145, "step": 209 }, { "epoch": 0.30303030303030304, "grad_norm": 0.3605823516845703, "learning_rate": 6.607197326515808e-05, "loss": 1.7016, "step": 210 }, { "epoch": 0.30447330447330445, "grad_norm": 0.364137202501297, "learning_rate": 6.57617490823885e-05, "loss": 1.9892, "step": 211 }, { "epoch": 0.3059163059163059, "grad_norm": 0.29972338676452637, "learning_rate": 6.545084971874738e-05, "loss": 2.0432, "step": 212 }, { "epoch": 0.30735930735930733, "grad_norm": 0.3284769058227539, "learning_rate": 6.513928849212873e-05, "loss": 2.0474, "step": 213 }, { "epoch": 0.3088023088023088, "grad_norm": 0.3190697431564331, "learning_rate": 6.482707874877854e-05, "loss": 2.0264, "step": 214 }, { "epoch": 0.31024531024531027, "grad_norm": 0.32829707860946655, "learning_rate": 6.451423386272312e-05, "loss": 1.845, "step": 215 }, { "epoch": 0.3116883116883117, "grad_norm": 0.38648197054862976, "learning_rate": 6.420076723519614e-05, "loss": 2.068, "step": 216 }, { "epoch": 0.31313131313131315, "grad_norm": 0.3774462640285492, "learning_rate": 6.388669229406462e-05, "loss": 1.7653, "step": 217 }, { "epoch": 0.31457431457431456, "grad_norm": 0.35081520676612854, "learning_rate": 6.357202249325371e-05, "loss": 1.8903, "step": 218 }, { "epoch": 0.31601731601731603, "grad_norm": 0.377341628074646, "learning_rate": 6.32567713121704e-05, "loss": 1.8658, "step": 219 }, { "epoch": 0.31746031746031744, "grad_norm": 0.33360525965690613, "learning_rate": 6.294095225512603e-05, "loss": 1.894, "step": 220 }, { "epoch": 0.3189033189033189, "grad_norm": 0.3708435893058777, "learning_rate": 6.26245788507579e-05, "loss": 1.8171, "step": 221 }, { "epoch": 0.3203463203463203, "grad_norm": 0.4073418378829956, "learning_rate": 6.230766465144967e-05, "loss": 2.0069, "step": 222 }, { "epoch": 0.3217893217893218, "grad_norm": 0.3824368417263031, "learning_rate": 6.199022323275083e-05, "loss": 1.8745, "step": 223 }, { "epoch": 0.32323232323232326, "grad_norm": 0.37881967425346375, "learning_rate": 6.167226819279528e-05, "loss": 2.0335, "step": 224 }, { "epoch": 0.3246753246753247, "grad_norm": 0.3804931938648224, "learning_rate": 6.135381315171867e-05, "loss": 1.9527, "step": 225 }, { "epoch": 0.32611832611832614, "grad_norm": 0.35501721501350403, "learning_rate": 6.103487175107507e-05, "loss": 1.9177, "step": 226 }, { "epoch": 0.32756132756132755, "grad_norm": 0.4466770887374878, "learning_rate": 6.071545765325254e-05, "loss": 2.1138, "step": 227 }, { "epoch": 0.329004329004329, "grad_norm": 0.42904266715049744, "learning_rate": 6.0395584540887963e-05, "loss": 1.9386, "step": 228 }, { "epoch": 0.33044733044733043, "grad_norm": 0.4379250705242157, "learning_rate": 6.007526611628086e-05, "loss": 1.9915, "step": 229 }, { "epoch": 0.3318903318903319, "grad_norm": 0.5087229013442993, "learning_rate": 5.9754516100806423e-05, "loss": 1.8386, "step": 230 }, { "epoch": 0.3333333333333333, "grad_norm": 0.4807024896144867, "learning_rate": 5.9433348234327765e-05, "loss": 1.9612, "step": 231 }, { "epoch": 0.3347763347763348, "grad_norm": 0.387539267539978, "learning_rate": 5.911177627460739e-05, "loss": 1.9333, "step": 232 }, { "epoch": 0.3362193362193362, "grad_norm": 0.4602753221988678, "learning_rate": 5.8789813996717736e-05, "loss": 1.8672, "step": 233 }, { "epoch": 0.33766233766233766, "grad_norm": 0.40583816170692444, "learning_rate": 5.8467475192451226e-05, "loss": 1.9556, "step": 234 }, { "epoch": 0.33910533910533913, "grad_norm": 0.5055876970291138, "learning_rate": 5.814477366972945e-05, "loss": 1.9021, "step": 235 }, { "epoch": 0.34054834054834054, "grad_norm": 0.43246597051620483, "learning_rate": 5.782172325201155e-05, "loss": 1.6321, "step": 236 }, { "epoch": 0.341991341991342, "grad_norm": 0.44223451614379883, "learning_rate": 5.749833777770225e-05, "loss": 1.7132, "step": 237 }, { "epoch": 0.3434343434343434, "grad_norm": 0.47704342007637024, "learning_rate": 5.717463109955896e-05, "loss": 1.8639, "step": 238 }, { "epoch": 0.3448773448773449, "grad_norm": 0.618209719657898, "learning_rate": 5.685061708409841e-05, "loss": 1.8064, "step": 239 }, { "epoch": 0.3463203463203463, "grad_norm": 0.47115299105644226, "learning_rate": 5.6526309611002594e-05, "loss": 1.8133, "step": 240 }, { "epoch": 0.3477633477633478, "grad_norm": 0.4711276888847351, "learning_rate": 5.6201722572524275e-05, "loss": 1.8172, "step": 241 }, { "epoch": 0.3492063492063492, "grad_norm": 0.5899491310119629, "learning_rate": 5.587686987289189e-05, "loss": 1.9716, "step": 242 }, { "epoch": 0.35064935064935066, "grad_norm": 0.6002910137176514, "learning_rate": 5.5551765427713884e-05, "loss": 1.9386, "step": 243 }, { "epoch": 0.35209235209235207, "grad_norm": 0.6195904612541199, "learning_rate": 5.522642316338268e-05, "loss": 1.9987, "step": 244 }, { "epoch": 0.35353535353535354, "grad_norm": 0.6184617877006531, "learning_rate": 5.490085701647805e-05, "loss": 1.822, "step": 245 }, { "epoch": 0.354978354978355, "grad_norm": 0.5911903977394104, "learning_rate": 5.457508093317013e-05, "loss": 1.8101, "step": 246 }, { "epoch": 0.3564213564213564, "grad_norm": 0.7217023372650146, "learning_rate": 5.4249108868622086e-05, "loss": 1.9276, "step": 247 }, { "epoch": 0.3578643578643579, "grad_norm": 0.6757352352142334, "learning_rate": 5.392295478639225e-05, "loss": 1.8208, "step": 248 }, { "epoch": 0.3593073593073593, "grad_norm": 0.8589720726013184, "learning_rate": 5.359663265783598e-05, "loss": 1.7009, "step": 249 }, { "epoch": 0.36075036075036077, "grad_norm": 1.2887048721313477, "learning_rate": 5.327015646150716e-05, "loss": 1.8405, "step": 250 }, { "epoch": 0.3621933621933622, "grad_norm": 0.20886729657649994, "learning_rate": 5.294354018255945e-05, "loss": 1.9762, "step": 251 }, { "epoch": 0.36363636363636365, "grad_norm": 0.2683543562889099, "learning_rate": 5.26167978121472e-05, "loss": 1.8028, "step": 252 }, { "epoch": 0.36507936507936506, "grad_norm": 0.28197985887527466, "learning_rate": 5.228994334682604e-05, "loss": 1.7985, "step": 253 }, { "epoch": 0.3665223665223665, "grad_norm": 0.3346744179725647, "learning_rate": 5.196299078795344e-05, "loss": 1.9574, "step": 254 }, { "epoch": 0.36796536796536794, "grad_norm": 0.33492788672447205, "learning_rate": 5.1635954141088813e-05, "loss": 1.7003, "step": 255 }, { "epoch": 0.3694083694083694, "grad_norm": 0.41365864872932434, "learning_rate": 5.1308847415393666e-05, "loss": 2.0219, "step": 256 }, { "epoch": 0.3708513708513709, "grad_norm": 0.4230329990386963, "learning_rate": 5.0981684623031415e-05, "loss": 1.8089, "step": 257 }, { "epoch": 0.3722943722943723, "grad_norm": 0.40661102533340454, "learning_rate": 5.0654479778567223e-05, "loss": 1.8797, "step": 258 }, { "epoch": 0.37373737373737376, "grad_norm": 0.3464805781841278, "learning_rate": 5.0327246898367597e-05, "loss": 1.7252, "step": 259 }, { "epoch": 0.37518037518037517, "grad_norm": 0.31361910700798035, "learning_rate": 5e-05, "loss": 2.0597, "step": 260 }, { "epoch": 0.37662337662337664, "grad_norm": 0.43410736322402954, "learning_rate": 4.9672753101632415e-05, "loss": 1.8767, "step": 261 }, { "epoch": 0.37806637806637805, "grad_norm": 0.4131929576396942, "learning_rate": 4.934552022143279e-05, "loss": 1.929, "step": 262 }, { "epoch": 0.3795093795093795, "grad_norm": 0.3646806478500366, "learning_rate": 4.901831537696859e-05, "loss": 2.0679, "step": 263 }, { "epoch": 0.38095238095238093, "grad_norm": 0.2950616478919983, "learning_rate": 4.869115258460635e-05, "loss": 1.9789, "step": 264 }, { "epoch": 0.3823953823953824, "grad_norm": 0.387099027633667, "learning_rate": 4.83640458589112e-05, "loss": 1.9296, "step": 265 }, { "epoch": 0.3838383838383838, "grad_norm": 0.351723849773407, "learning_rate": 4.8037009212046586e-05, "loss": 1.8302, "step": 266 }, { "epoch": 0.3852813852813853, "grad_norm": 0.40654313564300537, "learning_rate": 4.7710056653173976e-05, "loss": 1.8677, "step": 267 }, { "epoch": 0.38672438672438675, "grad_norm": 0.39013293385505676, "learning_rate": 4.738320218785281e-05, "loss": 2.0443, "step": 268 }, { "epoch": 0.38816738816738816, "grad_norm": 0.3763997256755829, "learning_rate": 4.7056459817440544e-05, "loss": 1.9559, "step": 269 }, { "epoch": 0.38961038961038963, "grad_norm": 0.39660537242889404, "learning_rate": 4.6729843538492847e-05, "loss": 1.8978, "step": 270 }, { "epoch": 0.39105339105339104, "grad_norm": 0.40315741300582886, "learning_rate": 4.640336734216403e-05, "loss": 1.8238, "step": 271 }, { "epoch": 0.3924963924963925, "grad_norm": 0.4773310720920563, "learning_rate": 4.607704521360776e-05, "loss": 1.8576, "step": 272 }, { "epoch": 0.3939393939393939, "grad_norm": 0.35388076305389404, "learning_rate": 4.575089113137792e-05, "loss": 1.9199, "step": 273 }, { "epoch": 0.3953823953823954, "grad_norm": 0.38064417243003845, "learning_rate": 4.542491906682989e-05, "loss": 2.0109, "step": 274 }, { "epoch": 0.3968253968253968, "grad_norm": 0.4209693670272827, "learning_rate": 4.509914298352197e-05, "loss": 1.9023, "step": 275 }, { "epoch": 0.39826839826839827, "grad_norm": 0.432314932346344, "learning_rate": 4.477357683661734e-05, "loss": 2.014, "step": 276 }, { "epoch": 0.3997113997113997, "grad_norm": 0.4496338665485382, "learning_rate": 4.444823457228612e-05, "loss": 1.7333, "step": 277 }, { "epoch": 0.40115440115440115, "grad_norm": 0.4074253439903259, "learning_rate": 4.412313012710813e-05, "loss": 1.9451, "step": 278 }, { "epoch": 0.4025974025974026, "grad_norm": 0.43842747807502747, "learning_rate": 4.379827742747575e-05, "loss": 1.7648, "step": 279 }, { "epoch": 0.40404040404040403, "grad_norm": 0.41590142250061035, "learning_rate": 4.347369038899744e-05, "loss": 1.991, "step": 280 }, { "epoch": 0.4054834054834055, "grad_norm": 0.41823965311050415, "learning_rate": 4.3149382915901606e-05, "loss": 1.8753, "step": 281 }, { "epoch": 0.4069264069264069, "grad_norm": 0.4189210534095764, "learning_rate": 4.282536890044104e-05, "loss": 1.9947, "step": 282 }, { "epoch": 0.4083694083694084, "grad_norm": 0.43361666798591614, "learning_rate": 4.250166222229774e-05, "loss": 1.949, "step": 283 }, { "epoch": 0.4098124098124098, "grad_norm": 0.4770269989967346, "learning_rate": 4.2178276747988446e-05, "loss": 1.7797, "step": 284 }, { "epoch": 0.41125541125541126, "grad_norm": 0.44815993309020996, "learning_rate": 4.185522633027057e-05, "loss": 1.6703, "step": 285 }, { "epoch": 0.4126984126984127, "grad_norm": 0.5173628926277161, "learning_rate": 4.153252480754877e-05, "loss": 1.8344, "step": 286 }, { "epoch": 0.41414141414141414, "grad_norm": 0.5571794509887695, "learning_rate": 4.1210186003282275e-05, "loss": 2.0112, "step": 287 }, { "epoch": 0.4155844155844156, "grad_norm": 0.5537445545196533, "learning_rate": 4.088822372539263e-05, "loss": 1.7687, "step": 288 }, { "epoch": 0.417027417027417, "grad_norm": 0.5494181513786316, "learning_rate": 4.0566651765672246e-05, "loss": 1.8711, "step": 289 }, { "epoch": 0.4184704184704185, "grad_norm": 0.49445641040802, "learning_rate": 4.0245483899193595e-05, "loss": 1.759, "step": 290 }, { "epoch": 0.4199134199134199, "grad_norm": 0.569502592086792, "learning_rate": 3.992473388371915e-05, "loss": 1.9987, "step": 291 }, { "epoch": 0.4213564213564214, "grad_norm": 0.6796965599060059, "learning_rate": 3.960441545911204e-05, "loss": 1.871, "step": 292 }, { "epoch": 0.4227994227994228, "grad_norm": 0.5192450284957886, "learning_rate": 3.928454234674747e-05, "loss": 1.5954, "step": 293 }, { "epoch": 0.42424242424242425, "grad_norm": 0.5506890416145325, "learning_rate": 3.896512824892495e-05, "loss": 1.9684, "step": 294 }, { "epoch": 0.42568542568542567, "grad_norm": 0.5987568497657776, "learning_rate": 3.864618684828134e-05, "loss": 2.034, "step": 295 }, { "epoch": 0.42712842712842713, "grad_norm": 0.582402229309082, "learning_rate": 3.832773180720475e-05, "loss": 1.8364, "step": 296 }, { "epoch": 0.42857142857142855, "grad_norm": 0.6683496832847595, "learning_rate": 3.800977676724919e-05, "loss": 1.8847, "step": 297 }, { "epoch": 0.43001443001443, "grad_norm": 0.6951612830162048, "learning_rate": 3.769233534855035e-05, "loss": 1.7696, "step": 298 }, { "epoch": 0.4314574314574315, "grad_norm": 0.9686669707298279, "learning_rate": 3.73754211492421e-05, "loss": 1.6039, "step": 299 }, { "epoch": 0.4329004329004329, "grad_norm": 1.6390401124954224, "learning_rate": 3.705904774487396e-05, "loss": 2.0773, "step": 300 }, { "epoch": 0.4329004329004329, "eval_loss": 1.8472508192062378, "eval_runtime": 156.8612, "eval_samples_per_second": 7.44, "eval_steps_per_second": 1.862, "step": 300 }, { "epoch": 0.43434343434343436, "grad_norm": 0.2445702999830246, "learning_rate": 3.6743228687829595e-05, "loss": 1.9193, "step": 301 }, { "epoch": 0.4357864357864358, "grad_norm": 0.23023849725723267, "learning_rate": 3.642797750674629e-05, "loss": 1.685, "step": 302 }, { "epoch": 0.43722943722943725, "grad_norm": 0.21904835104942322, "learning_rate": 3.6113307705935396e-05, "loss": 1.8372, "step": 303 }, { "epoch": 0.43867243867243866, "grad_norm": 0.2425857037305832, "learning_rate": 3.579923276480387e-05, "loss": 1.7388, "step": 304 }, { "epoch": 0.4401154401154401, "grad_norm": 0.25920021533966064, "learning_rate": 3.5485766137276894e-05, "loss": 1.8383, "step": 305 }, { "epoch": 0.44155844155844154, "grad_norm": 0.2727745771408081, "learning_rate": 3.5172921251221455e-05, "loss": 1.8585, "step": 306 }, { "epoch": 0.443001443001443, "grad_norm": 0.2757931351661682, "learning_rate": 3.486071150787128e-05, "loss": 1.6702, "step": 307 }, { "epoch": 0.4444444444444444, "grad_norm": 0.31464436650276184, "learning_rate": 3.4549150281252636e-05, "loss": 1.9148, "step": 308 }, { "epoch": 0.4458874458874459, "grad_norm": 0.3319891393184662, "learning_rate": 3.423825091761153e-05, "loss": 2.0834, "step": 309 }, { "epoch": 0.44733044733044736, "grad_norm": 0.3357025384902954, "learning_rate": 3.392802673484193e-05, "loss": 1.9062, "step": 310 }, { "epoch": 0.44877344877344877, "grad_norm": 0.30716291069984436, "learning_rate": 3.361849102191533e-05, "loss": 2.0304, "step": 311 }, { "epoch": 0.45021645021645024, "grad_norm": 0.31525135040283203, "learning_rate": 3.330965703831146e-05, "loss": 1.8541, "step": 312 }, { "epoch": 0.45165945165945165, "grad_norm": 0.2891955077648163, "learning_rate": 3.300153801345028e-05, "loss": 1.9576, "step": 313 }, { "epoch": 0.4531024531024531, "grad_norm": 0.3366871774196625, "learning_rate": 3.2694147146125345e-05, "loss": 1.9217, "step": 314 }, { "epoch": 0.45454545454545453, "grad_norm": 0.3520633578300476, "learning_rate": 3.2387497603938326e-05, "loss": 1.9279, "step": 315 }, { "epoch": 0.455988455988456, "grad_norm": 0.4141858220100403, "learning_rate": 3.2081602522734986e-05, "loss": 1.7844, "step": 316 }, { "epoch": 0.4574314574314574, "grad_norm": 0.3475481867790222, "learning_rate": 3.177647500604252e-05, "loss": 1.833, "step": 317 }, { "epoch": 0.4588744588744589, "grad_norm": 0.366415411233902, "learning_rate": 3.147212812450819e-05, "loss": 2.0163, "step": 318 }, { "epoch": 0.4603174603174603, "grad_norm": 0.3866393566131592, "learning_rate": 3.116857491533947e-05, "loss": 1.7986, "step": 319 }, { "epoch": 0.46176046176046176, "grad_norm": 0.3545255661010742, "learning_rate": 3.086582838174551e-05, "loss": 2.0761, "step": 320 }, { "epoch": 0.46320346320346323, "grad_norm": 0.42348945140838623, "learning_rate": 3.056390149238022e-05, "loss": 1.9795, "step": 321 }, { "epoch": 0.46464646464646464, "grad_norm": 0.36564403772354126, "learning_rate": 3.0262807180786647e-05, "loss": 1.8851, "step": 322 }, { "epoch": 0.4660894660894661, "grad_norm": 0.3915037512779236, "learning_rate": 2.996255834484296e-05, "loss": 1.9392, "step": 323 }, { "epoch": 0.4675324675324675, "grad_norm": 0.38861963152885437, "learning_rate": 2.9663167846209998e-05, "loss": 1.9099, "step": 324 }, { "epoch": 0.468975468975469, "grad_norm": 0.38369008898735046, "learning_rate": 2.936464850978027e-05, "loss": 1.7798, "step": 325 }, { "epoch": 0.4704184704184704, "grad_norm": 0.42101967334747314, "learning_rate": 2.9067013123128613e-05, "loss": 1.8606, "step": 326 }, { "epoch": 0.47186147186147187, "grad_norm": 0.41875502467155457, "learning_rate": 2.8770274435964355e-05, "loss": 1.6667, "step": 327 }, { "epoch": 0.4733044733044733, "grad_norm": 0.41931435465812683, "learning_rate": 2.8474445159585235e-05, "loss": 1.8602, "step": 328 }, { "epoch": 0.47474747474747475, "grad_norm": 0.46239984035491943, "learning_rate": 2.8179537966332887e-05, "loss": 1.9873, "step": 329 }, { "epoch": 0.47619047619047616, "grad_norm": 0.4297107756137848, "learning_rate": 2.7885565489049946e-05, "loss": 1.8147, "step": 330 }, { "epoch": 0.47763347763347763, "grad_norm": 0.4730760455131531, "learning_rate": 2.759254032053888e-05, "loss": 2.121, "step": 331 }, { "epoch": 0.4790764790764791, "grad_norm": 0.4633348882198334, "learning_rate": 2.7300475013022663e-05, "loss": 1.862, "step": 332 }, { "epoch": 0.4805194805194805, "grad_norm": 0.451509565114975, "learning_rate": 2.700938207760701e-05, "loss": 1.717, "step": 333 }, { "epoch": 0.481962481962482, "grad_norm": 0.5135688781738281, "learning_rate": 2.671927398374443e-05, "loss": 1.7786, "step": 334 }, { "epoch": 0.4834054834054834, "grad_norm": 0.5174142718315125, "learning_rate": 2.6430163158700115e-05, "loss": 1.8283, "step": 335 }, { "epoch": 0.48484848484848486, "grad_norm": 0.4473469853401184, "learning_rate": 2.6142061987019577e-05, "loss": 1.9845, "step": 336 }, { "epoch": 0.4862914862914863, "grad_norm": 0.4503369629383087, "learning_rate": 2.5854982809998153e-05, "loss": 1.759, "step": 337 }, { "epoch": 0.48773448773448774, "grad_norm": 0.6512835621833801, "learning_rate": 2.556893792515227e-05, "loss": 1.7662, "step": 338 }, { "epoch": 0.48917748917748916, "grad_norm": 0.5281562209129333, "learning_rate": 2.5283939585692783e-05, "loss": 1.791, "step": 339 }, { "epoch": 0.4906204906204906, "grad_norm": 0.6262437701225281, "learning_rate": 2.500000000000001e-05, "loss": 1.7711, "step": 340 }, { "epoch": 0.49206349206349204, "grad_norm": 0.5214657783508301, "learning_rate": 2.471713133110078e-05, "loss": 1.7583, "step": 341 }, { "epoch": 0.4935064935064935, "grad_norm": 0.5274417996406555, "learning_rate": 2.4435345696147403e-05, "loss": 1.8388, "step": 342 }, { "epoch": 0.494949494949495, "grad_norm": 0.5707380175590515, "learning_rate": 2.4154655165898627e-05, "loss": 2.0271, "step": 343 }, { "epoch": 0.4963924963924964, "grad_norm": 0.6870908141136169, "learning_rate": 2.3875071764202563e-05, "loss": 2.0183, "step": 344 }, { "epoch": 0.49783549783549785, "grad_norm": 0.6021436452865601, "learning_rate": 2.3596607467481603e-05, "loss": 1.8119, "step": 345 }, { "epoch": 0.49927849927849927, "grad_norm": 0.7010958790779114, "learning_rate": 2.3319274204219428e-05, "loss": 1.7931, "step": 346 }, { "epoch": 0.5007215007215007, "grad_norm": 0.7699779868125916, "learning_rate": 2.3043083854449988e-05, "loss": 1.6772, "step": 347 }, { "epoch": 0.5021645021645021, "grad_norm": 0.7229447364807129, "learning_rate": 2.2768048249248648e-05, "loss": 1.8659, "step": 348 }, { "epoch": 0.5036075036075036, "grad_norm": 0.8113691210746765, "learning_rate": 2.2494179170225333e-05, "loss": 1.7651, "step": 349 }, { "epoch": 0.5050505050505051, "grad_norm": 1.5266594886779785, "learning_rate": 2.2221488349019903e-05, "loss": 1.9331, "step": 350 }, { "epoch": 0.5064935064935064, "grad_norm": 0.2073773890733719, "learning_rate": 2.194998746679952e-05, "loss": 1.8606, "step": 351 }, { "epoch": 0.5079365079365079, "grad_norm": 0.21894040703773499, "learning_rate": 2.167968815375837e-05, "loss": 1.6546, "step": 352 }, { "epoch": 0.5093795093795094, "grad_norm": 0.24405646324157715, "learning_rate": 2.1410601988619394e-05, "loss": 1.6878, "step": 353 }, { "epoch": 0.5108225108225108, "grad_norm": 0.23255270719528198, "learning_rate": 2.1142740498138324e-05, "loss": 1.7686, "step": 354 }, { "epoch": 0.5122655122655123, "grad_norm": 0.2648371458053589, "learning_rate": 2.08761151566099e-05, "loss": 1.7577, "step": 355 }, { "epoch": 0.5137085137085137, "grad_norm": 0.28337547183036804, "learning_rate": 2.061073738537635e-05, "loss": 1.8133, "step": 356 }, { "epoch": 0.5151515151515151, "grad_norm": 0.29984724521636963, "learning_rate": 2.034661855233815e-05, "loss": 1.6594, "step": 357 }, { "epoch": 0.5165945165945166, "grad_norm": 0.3129202425479889, "learning_rate": 2.008376997146705e-05, "loss": 1.9644, "step": 358 }, { "epoch": 0.5180375180375181, "grad_norm": 0.678475558757782, "learning_rate": 1.982220290232143e-05, "loss": 1.6171, "step": 359 }, { "epoch": 0.5194805194805194, "grad_norm": 0.2939353585243225, "learning_rate": 1.9561928549563968e-05, "loss": 1.9871, "step": 360 }, { "epoch": 0.5209235209235209, "grad_norm": 0.28600218892097473, "learning_rate": 1.9302958062481673e-05, "loss": 1.8726, "step": 361 }, { "epoch": 0.5223665223665224, "grad_norm": 0.30675143003463745, "learning_rate": 1.9045302534508297e-05, "loss": 1.8862, "step": 362 }, { "epoch": 0.5238095238095238, "grad_norm": 0.32877466082572937, "learning_rate": 1.8788973002749112e-05, "loss": 1.7572, "step": 363 }, { "epoch": 0.5252525252525253, "grad_norm": 0.3716675043106079, "learning_rate": 1.8533980447508137e-05, "loss": 1.8445, "step": 364 }, { "epoch": 0.5266955266955267, "grad_norm": 0.3779400885105133, "learning_rate": 1.8280335791817733e-05, "loss": 1.9895, "step": 365 }, { "epoch": 0.5281385281385281, "grad_norm": 0.38026732206344604, "learning_rate": 1.8028049900970767e-05, "loss": 2.0318, "step": 366 }, { "epoch": 0.5295815295815296, "grad_norm": 0.37040016055107117, "learning_rate": 1.777713358205514e-05, "loss": 1.8038, "step": 367 }, { "epoch": 0.5310245310245311, "grad_norm": 0.3922972083091736, "learning_rate": 1.7527597583490822e-05, "loss": 1.7281, "step": 368 }, { "epoch": 0.5324675324675324, "grad_norm": 0.4298830032348633, "learning_rate": 1.7279452594569483e-05, "loss": 1.8546, "step": 369 }, { "epoch": 0.5339105339105339, "grad_norm": 0.39382922649383545, "learning_rate": 1.703270924499656e-05, "loss": 1.9061, "step": 370 }, { "epoch": 0.5353535353535354, "grad_norm": 0.4678158164024353, "learning_rate": 1.678737810443593e-05, "loss": 1.8942, "step": 371 }, { "epoch": 0.5367965367965368, "grad_norm": 0.3930918872356415, "learning_rate": 1.6543469682057106e-05, "loss": 1.845, "step": 372 }, { "epoch": 0.5382395382395382, "grad_norm": 0.3660011291503906, "learning_rate": 1.6300994426085103e-05, "loss": 1.8379, "step": 373 }, { "epoch": 0.5396825396825397, "grad_norm": 0.444845050573349, "learning_rate": 1.605996272335291e-05, "loss": 2.0223, "step": 374 }, { "epoch": 0.5411255411255411, "grad_norm": 0.4140741527080536, "learning_rate": 1.5820384898856434e-05, "loss": 1.8988, "step": 375 }, { "epoch": 0.5425685425685426, "grad_norm": 0.4683399498462677, "learning_rate": 1.5582271215312294e-05, "loss": 1.7753, "step": 376 }, { "epoch": 0.5440115440115441, "grad_norm": 0.39313170313835144, "learning_rate": 1.5345631872718214e-05, "loss": 1.9094, "step": 377 }, { "epoch": 0.5454545454545454, "grad_norm": 0.3654247224330902, "learning_rate": 1.5110477007916001e-05, "loss": 1.7358, "step": 378 }, { "epoch": 0.5468975468975469, "grad_norm": 0.44113779067993164, "learning_rate": 1.4876816694157419e-05, "loss": 1.9154, "step": 379 }, { "epoch": 0.5483405483405484, "grad_norm": 0.39124253392219543, "learning_rate": 1.4644660940672627e-05, "loss": 1.9706, "step": 380 }, { "epoch": 0.5497835497835498, "grad_norm": 0.46143198013305664, "learning_rate": 1.4414019692241437e-05, "loss": 1.809, "step": 381 }, { "epoch": 0.5512265512265512, "grad_norm": 0.4628862142562866, "learning_rate": 1.4184902828767287e-05, "loss": 1.7279, "step": 382 }, { "epoch": 0.5526695526695526, "grad_norm": 0.49773210287094116, "learning_rate": 1.3957320164854059e-05, "loss": 1.8587, "step": 383 }, { "epoch": 0.5541125541125541, "grad_norm": 0.4421599507331848, "learning_rate": 1.373128144938563e-05, "loss": 1.7864, "step": 384 }, { "epoch": 0.5555555555555556, "grad_norm": 0.4303032457828522, "learning_rate": 1.3506796365108232e-05, "loss": 1.8105, "step": 385 }, { "epoch": 0.556998556998557, "grad_norm": 0.4077335000038147, "learning_rate": 1.3283874528215733e-05, "loss": 1.7978, "step": 386 }, { "epoch": 0.5584415584415584, "grad_norm": 0.5202664732933044, "learning_rate": 1.3062525487937699e-05, "loss": 1.6826, "step": 387 }, { "epoch": 0.5598845598845599, "grad_norm": 0.46949782967567444, "learning_rate": 1.2842758726130283e-05, "loss": 1.8609, "step": 388 }, { "epoch": 0.5613275613275613, "grad_norm": 0.6220776438713074, "learning_rate": 1.2624583656870154e-05, "loss": 1.6619, "step": 389 }, { "epoch": 0.5627705627705628, "grad_norm": 0.5599220991134644, "learning_rate": 1.2408009626051137e-05, "loss": 1.8675, "step": 390 }, { "epoch": 0.5642135642135642, "grad_norm": 0.48732367157936096, "learning_rate": 1.2193045910983863e-05, "loss": 1.8726, "step": 391 }, { "epoch": 0.5656565656565656, "grad_norm": 0.5116375088691711, "learning_rate": 1.1979701719998453e-05, "loss": 1.8269, "step": 392 }, { "epoch": 0.5670995670995671, "grad_norm": 0.5755485892295837, "learning_rate": 1.1767986192049984e-05, "loss": 2.0062, "step": 393 }, { "epoch": 0.5685425685425686, "grad_norm": 0.5523481369018555, "learning_rate": 1.1557908396327028e-05, "loss": 1.8802, "step": 394 }, { "epoch": 0.56998556998557, "grad_norm": 0.5976821780204773, "learning_rate": 1.134947733186315e-05, "loss": 1.9027, "step": 395 }, { "epoch": 0.5714285714285714, "grad_norm": 0.672989547252655, "learning_rate": 1.1142701927151456e-05, "loss": 1.6237, "step": 396 }, { "epoch": 0.5728715728715729, "grad_norm": 0.717589259147644, "learning_rate": 1.0937591039762085e-05, "loss": 1.8292, "step": 397 }, { "epoch": 0.5743145743145743, "grad_norm": 0.6616857051849365, "learning_rate": 1.0734153455962765e-05, "loss": 1.6972, "step": 398 }, { "epoch": 0.5757575757575758, "grad_norm": 0.7580000162124634, "learning_rate": 1.0532397890342505e-05, "loss": 1.4864, "step": 399 }, { "epoch": 0.5772005772005772, "grad_norm": 1.0676060914993286, "learning_rate": 1.0332332985438248e-05, "loss": 1.6269, "step": 400 }, { "epoch": 0.5772005772005772, "eval_loss": 1.8380851745605469, "eval_runtime": 156.9174, "eval_samples_per_second": 7.437, "eval_steps_per_second": 1.861, "step": 400 }, { "epoch": 0.5786435786435786, "grad_norm": 0.22474224865436554, "learning_rate": 1.013396731136465e-05, "loss": 1.7378, "step": 401 }, { "epoch": 0.5800865800865801, "grad_norm": 0.20946919918060303, "learning_rate": 9.937309365446973e-06, "loss": 1.8411, "step": 402 }, { "epoch": 0.5815295815295816, "grad_norm": 0.203902006149292, "learning_rate": 9.742367571857091e-06, "loss": 1.6986, "step": 403 }, { "epoch": 0.5829725829725829, "grad_norm": 0.2520330548286438, "learning_rate": 9.549150281252633e-06, "loss": 1.7644, "step": 404 }, { "epoch": 0.5844155844155844, "grad_norm": 0.23222126066684723, "learning_rate": 9.357665770419244e-06, "loss": 1.8559, "step": 405 }, { "epoch": 0.5858585858585859, "grad_norm": 0.25231245160102844, "learning_rate": 9.167922241916055e-06, "loss": 1.9885, "step": 406 }, { "epoch": 0.5873015873015873, "grad_norm": 0.24505528807640076, "learning_rate": 8.97992782372432e-06, "loss": 1.8296, "step": 407 }, { "epoch": 0.5887445887445888, "grad_norm": 0.32213860750198364, "learning_rate": 8.793690568899216e-06, "loss": 1.928, "step": 408 }, { "epoch": 0.5901875901875901, "grad_norm": 0.28556668758392334, "learning_rate": 8.609218455224893e-06, "loss": 1.9806, "step": 409 }, { "epoch": 0.5916305916305916, "grad_norm": 0.28748056292533875, "learning_rate": 8.426519384872733e-06, "loss": 2.0535, "step": 410 }, { "epoch": 0.5930735930735931, "grad_norm": 0.3097786605358124, "learning_rate": 8.245601184062852e-06, "loss": 1.8546, "step": 411 }, { "epoch": 0.5945165945165946, "grad_norm": 0.34151002764701843, "learning_rate": 8.066471602728803e-06, "loss": 2.0569, "step": 412 }, { "epoch": 0.5959595959595959, "grad_norm": 0.31291815638542175, "learning_rate": 7.889138314185678e-06, "loss": 1.7969, "step": 413 }, { "epoch": 0.5974025974025974, "grad_norm": 0.3612555265426636, "learning_rate": 7.71360891480134e-06, "loss": 1.7247, "step": 414 }, { "epoch": 0.5988455988455988, "grad_norm": 0.3746615946292877, "learning_rate": 7.539890923671062e-06, "loss": 1.9217, "step": 415 }, { "epoch": 0.6002886002886003, "grad_norm": 0.4344548285007477, "learning_rate": 7.367991782295391e-06, "loss": 2.1042, "step": 416 }, { "epoch": 0.6017316017316018, "grad_norm": 0.31650206446647644, "learning_rate": 7.197918854261432e-06, "loss": 1.814, "step": 417 }, { "epoch": 0.6031746031746031, "grad_norm": 0.37703970074653625, "learning_rate": 7.029679424927365e-06, "loss": 1.9636, "step": 418 }, { "epoch": 0.6046176046176046, "grad_norm": 0.37509685754776, "learning_rate": 6.863280701110408e-06, "loss": 1.7446, "step": 419 }, { "epoch": 0.6060606060606061, "grad_norm": 0.33785712718963623, "learning_rate": 6.698729810778065e-06, "loss": 1.8171, "step": 420 }, { "epoch": 0.6075036075036075, "grad_norm": 0.36930790543556213, "learning_rate": 6.536033802742813e-06, "loss": 1.9212, "step": 421 }, { "epoch": 0.6089466089466089, "grad_norm": 0.41896510124206543, "learning_rate": 6.375199646360142e-06, "loss": 1.9301, "step": 422 }, { "epoch": 0.6103896103896104, "grad_norm": 0.3452630639076233, "learning_rate": 6.216234231230012e-06, "loss": 2.0033, "step": 423 }, { "epoch": 0.6118326118326118, "grad_norm": 0.4596862196922302, "learning_rate": 6.059144366901736e-06, "loss": 1.9092, "step": 424 }, { "epoch": 0.6132756132756133, "grad_norm": 0.42444470524787903, "learning_rate": 5.903936782582253e-06, "loss": 1.9446, "step": 425 }, { "epoch": 0.6147186147186147, "grad_norm": 0.45510318875312805, "learning_rate": 5.750618126847912e-06, "loss": 1.9616, "step": 426 }, { "epoch": 0.6161616161616161, "grad_norm": 0.4246458113193512, "learning_rate": 5.599194967359639e-06, "loss": 2.0316, "step": 427 }, { "epoch": 0.6176046176046176, "grad_norm": 0.43000173568725586, "learning_rate": 5.449673790581611e-06, "loss": 1.8851, "step": 428 }, { "epoch": 0.6190476190476191, "grad_norm": 0.4248652160167694, "learning_rate": 5.302061001503394e-06, "loss": 2.0064, "step": 429 }, { "epoch": 0.6204906204906205, "grad_norm": 0.4813087284564972, "learning_rate": 5.156362923365588e-06, "loss": 1.9639, "step": 430 }, { "epoch": 0.6219336219336219, "grad_norm": 0.42517709732055664, "learning_rate": 5.012585797388936e-06, "loss": 1.9537, "step": 431 }, { "epoch": 0.6233766233766234, "grad_norm": 0.413674920797348, "learning_rate": 4.87073578250698e-06, "loss": 1.911, "step": 432 }, { "epoch": 0.6248196248196248, "grad_norm": 0.44890064001083374, "learning_rate": 4.730818955102234e-06, "loss": 1.8757, "step": 433 }, { "epoch": 0.6262626262626263, "grad_norm": 0.5599820613861084, "learning_rate": 4.592841308745932e-06, "loss": 1.7325, "step": 434 }, { "epoch": 0.6277056277056277, "grad_norm": 0.504962682723999, "learning_rate": 4.456808753941205e-06, "loss": 1.9556, "step": 435 }, { "epoch": 0.6291486291486291, "grad_norm": 0.4306437373161316, "learning_rate": 4.322727117869951e-06, "loss": 1.8258, "step": 436 }, { "epoch": 0.6305916305916306, "grad_norm": 0.481152206659317, "learning_rate": 4.190602144143207e-06, "loss": 1.9023, "step": 437 }, { "epoch": 0.6320346320346321, "grad_norm": 0.43929627537727356, "learning_rate": 4.06043949255509e-06, "loss": 1.6293, "step": 438 }, { "epoch": 0.6334776334776335, "grad_norm": 0.5508288145065308, "learning_rate": 3.932244738840379e-06, "loss": 1.8766, "step": 439 }, { "epoch": 0.6349206349206349, "grad_norm": 0.503813624382019, "learning_rate": 3.8060233744356633e-06, "loss": 1.7276, "step": 440 }, { "epoch": 0.6363636363636364, "grad_norm": 0.5829843282699585, "learning_rate": 3.681780806244095e-06, "loss": 1.7256, "step": 441 }, { "epoch": 0.6378066378066378, "grad_norm": 0.5285375714302063, "learning_rate": 3.5595223564037884e-06, "loss": 1.7845, "step": 442 }, { "epoch": 0.6392496392496393, "grad_norm": 0.5721433758735657, "learning_rate": 3.4392532620598216e-06, "loss": 1.6747, "step": 443 }, { "epoch": 0.6406926406926406, "grad_norm": 0.5801364183425903, "learning_rate": 3.3209786751399187e-06, "loss": 1.9831, "step": 444 }, { "epoch": 0.6421356421356421, "grad_norm": 0.6420133113861084, "learning_rate": 3.2047036621337236e-06, "loss": 1.842, "step": 445 }, { "epoch": 0.6435786435786436, "grad_norm": 0.657759428024292, "learning_rate": 3.0904332038757977e-06, "loss": 1.732, "step": 446 }, { "epoch": 0.645021645021645, "grad_norm": 0.6824421882629395, "learning_rate": 2.978172195332263e-06, "loss": 1.5208, "step": 447 }, { "epoch": 0.6464646464646465, "grad_norm": 0.7443853616714478, "learning_rate": 2.8679254453910785e-06, "loss": 1.7159, "step": 448 }, { "epoch": 0.6479076479076479, "grad_norm": 0.8423395156860352, "learning_rate": 2.759697676656098e-06, "loss": 1.6525, "step": 449 }, { "epoch": 0.6493506493506493, "grad_norm": 1.326777696609497, "learning_rate": 2.653493525244721e-06, "loss": 1.8648, "step": 450 }, { "epoch": 0.6507936507936508, "grad_norm": 0.20779266953468323, "learning_rate": 2.549317540589308e-06, "loss": 1.6705, "step": 451 }, { "epoch": 0.6522366522366523, "grad_norm": 0.21033185720443726, "learning_rate": 2.4471741852423237e-06, "loss": 1.7687, "step": 452 }, { "epoch": 0.6536796536796536, "grad_norm": 0.2421332150697708, "learning_rate": 2.3470678346851518e-06, "loss": 1.689, "step": 453 }, { "epoch": 0.6551226551226551, "grad_norm": 0.26601821184158325, "learning_rate": 2.2490027771406687e-06, "loss": 1.8311, "step": 454 }, { "epoch": 0.6565656565656566, "grad_norm": 0.2268337607383728, "learning_rate": 2.152983213389559e-06, "loss": 1.8122, "step": 455 }, { "epoch": 0.658008658008658, "grad_norm": 0.27295178174972534, "learning_rate": 2.0590132565903476e-06, "loss": 1.8871, "step": 456 }, { "epoch": 0.6594516594516594, "grad_norm": 0.25046512484550476, "learning_rate": 1.9670969321032407e-06, "loss": 1.9112, "step": 457 }, { "epoch": 0.6608946608946609, "grad_norm": 0.2787816822528839, "learning_rate": 1.8772381773176417e-06, "loss": 1.8829, "step": 458 }, { "epoch": 0.6623376623376623, "grad_norm": 0.28614646196365356, "learning_rate": 1.7894408414835362e-06, "loss": 1.9613, "step": 459 }, { "epoch": 0.6637806637806638, "grad_norm": 0.29104024171829224, "learning_rate": 1.70370868554659e-06, "loss": 1.883, "step": 460 }, { "epoch": 0.6652236652236653, "grad_norm": 0.3024556040763855, "learning_rate": 1.620045381987012e-06, "loss": 1.7852, "step": 461 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3180261552333832, "learning_rate": 1.5384545146622852e-06, "loss": 1.8743, "step": 462 }, { "epoch": 0.6681096681096681, "grad_norm": 0.34793442487716675, "learning_rate": 1.4589395786535953e-06, "loss": 1.7692, "step": 463 }, { "epoch": 0.6695526695526696, "grad_norm": 0.3252454698085785, "learning_rate": 1.3815039801161721e-06, "loss": 1.9941, "step": 464 }, { "epoch": 0.670995670995671, "grad_norm": 0.38873234391212463, "learning_rate": 1.3061510361333185e-06, "loss": 1.8367, "step": 465 }, { "epoch": 0.6724386724386724, "grad_norm": 0.3405764102935791, "learning_rate": 1.232883974574367e-06, "loss": 1.9568, "step": 466 }, { "epoch": 0.6738816738816739, "grad_norm": 0.4072246253490448, "learning_rate": 1.1617059339563807e-06, "loss": 1.9118, "step": 467 }, { "epoch": 0.6753246753246753, "grad_norm": 0.38311657309532166, "learning_rate": 1.0926199633097157e-06, "loss": 1.9212, "step": 468 }, { "epoch": 0.6767676767676768, "grad_norm": 0.372346431016922, "learning_rate": 1.0256290220474307e-06, "loss": 1.8625, "step": 469 }, { "epoch": 0.6782106782106783, "grad_norm": 0.32480254769325256, "learning_rate": 9.607359798384785e-07, "loss": 2.0927, "step": 470 }, { "epoch": 0.6796536796536796, "grad_norm": 0.349486380815506, "learning_rate": 8.979436164848088e-07, "loss": 1.8977, "step": 471 }, { "epoch": 0.6810966810966811, "grad_norm": 0.5002724528312683, "learning_rate": 8.372546218022747e-07, "loss": 1.7969, "step": 472 }, { "epoch": 0.6825396825396826, "grad_norm": 0.43671897053718567, "learning_rate": 7.786715955054203e-07, "loss": 2.0032, "step": 473 }, { "epoch": 0.683982683982684, "grad_norm": 0.43542805314064026, "learning_rate": 7.221970470961125e-07, "loss": 1.9024, "step": 474 }, { "epoch": 0.6854256854256854, "grad_norm": 0.4874102771282196, "learning_rate": 6.678333957560512e-07, "loss": 1.6566, "step": 475 }, { "epoch": 0.6868686868686869, "grad_norm": 0.4170205891132355, "learning_rate": 6.15582970243117e-07, "loss": 1.8643, "step": 476 }, { "epoch": 0.6883116883116883, "grad_norm": 0.4337291717529297, "learning_rate": 5.654480087916303e-07, "loss": 2.0587, "step": 477 }, { "epoch": 0.6897546897546898, "grad_norm": 0.4315203130245209, "learning_rate": 5.174306590164879e-07, "loss": 2.0489, "step": 478 }, { "epoch": 0.6911976911976911, "grad_norm": 0.43969807028770447, "learning_rate": 4.715329778211375e-07, "loss": 1.9063, "step": 479 }, { "epoch": 0.6926406926406926, "grad_norm": 0.39184656739234924, "learning_rate": 4.277569313094809e-07, "loss": 1.7943, "step": 480 }, { "epoch": 0.6940836940836941, "grad_norm": 0.43797311186790466, "learning_rate": 3.8610439470164737e-07, "loss": 1.9579, "step": 481 }, { "epoch": 0.6955266955266955, "grad_norm": 0.45759716629981995, "learning_rate": 3.465771522536854e-07, "loss": 1.7868, "step": 482 }, { "epoch": 0.696969696969697, "grad_norm": 0.530158281326294, "learning_rate": 3.09176897181096e-07, "loss": 1.9113, "step": 483 }, { "epoch": 0.6984126984126984, "grad_norm": 0.4709303081035614, "learning_rate": 2.7390523158633554e-07, "loss": 1.9702, "step": 484 }, { "epoch": 0.6998556998556998, "grad_norm": 0.5338622331619263, "learning_rate": 2.407636663901591e-07, "loss": 1.8965, "step": 485 }, { "epoch": 0.7012987012987013, "grad_norm": 0.48375779390335083, "learning_rate": 2.0975362126691712e-07, "loss": 1.6047, "step": 486 }, { "epoch": 0.7027417027417028, "grad_norm": 0.4987216293811798, "learning_rate": 1.8087642458373134e-07, "loss": 1.8085, "step": 487 }, { "epoch": 0.7041847041847041, "grad_norm": 0.5824238657951355, "learning_rate": 1.5413331334360182e-07, "loss": 1.6619, "step": 488 }, { "epoch": 0.7056277056277056, "grad_norm": 0.5118747353553772, "learning_rate": 1.2952543313240472e-07, "loss": 1.7263, "step": 489 }, { "epoch": 0.7070707070707071, "grad_norm": 0.6156334280967712, "learning_rate": 1.0705383806982606e-07, "loss": 1.8636, "step": 490 }, { "epoch": 0.7085137085137085, "grad_norm": 0.587309718132019, "learning_rate": 8.671949076420882e-08, "loss": 1.6529, "step": 491 }, { "epoch": 0.70995670995671, "grad_norm": 0.5975311398506165, "learning_rate": 6.852326227130834e-08, "loss": 1.875, "step": 492 }, { "epoch": 0.7113997113997114, "grad_norm": 0.5370944142341614, "learning_rate": 5.246593205699424e-08, "loss": 1.8962, "step": 493 }, { "epoch": 0.7128427128427128, "grad_norm": 0.6013877391815186, "learning_rate": 3.8548187963854956e-08, "loss": 1.9036, "step": 494 }, { "epoch": 0.7142857142857143, "grad_norm": 0.6089028120040894, "learning_rate": 2.6770626181715773e-08, "loss": 1.7561, "step": 495 }, { "epoch": 0.7157287157287158, "grad_norm": 0.6195142865180969, "learning_rate": 1.7133751222137007e-08, "loss": 1.8843, "step": 496 }, { "epoch": 0.7171717171717171, "grad_norm": 0.7459947466850281, "learning_rate": 9.637975896759077e-09, "loss": 1.5594, "step": 497 }, { "epoch": 0.7186147186147186, "grad_norm": 0.8303995132446289, "learning_rate": 4.2836212996499865e-09, "loss": 1.6858, "step": 498 }, { "epoch": 0.7200577200577201, "grad_norm": 0.9504781365394592, "learning_rate": 1.0709167935385455e-09, "loss": 1.4528, "step": 499 }, { "epoch": 0.7215007215007215, "grad_norm": 1.5718448162078857, "learning_rate": 0.0, "loss": 1.6021, "step": 500 }, { "epoch": 0.7215007215007215, "eval_loss": 1.8335492610931396, "eval_runtime": 156.8048, "eval_samples_per_second": 7.442, "eval_steps_per_second": 1.862, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4073840298371318e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }