{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 3.155007898233803, "learning_rate": 8.974358974358974e-08, "loss": 0.7002, "step": 1 }, { "epoch": 0.01, "grad_norm": 3.1501311823834692, "learning_rate": 1.7948717948717948e-07, "loss": 0.5469, "step": 2 }, { "epoch": 0.02, "grad_norm": 3.2709527976504122, "learning_rate": 2.692307692307692e-07, "loss": 0.6252, "step": 3 }, { "epoch": 0.03, "grad_norm": 3.1490588816296174, "learning_rate": 3.5897435897435896e-07, "loss": 0.6215, "step": 4 }, { "epoch": 0.03, "grad_norm": 2.9970198743522634, "learning_rate": 4.4871794871794865e-07, "loss": 0.5639, "step": 5 }, { "epoch": 0.04, "grad_norm": 3.3295427761499448, "learning_rate": 5.384615384615384e-07, "loss": 0.6563, "step": 6 }, { "epoch": 0.04, "grad_norm": 3.048225120103655, "learning_rate": 6.282051282051282e-07, "loss": 0.5892, "step": 7 }, { "epoch": 0.05, "grad_norm": 2.8352144056173314, "learning_rate": 7.179487179487179e-07, "loss": 0.6691, "step": 8 }, { "epoch": 0.06, "grad_norm": 3.037755430732799, "learning_rate": 8.076923076923077e-07, "loss": 0.6171, "step": 9 }, { "epoch": 0.06, "grad_norm": 2.726311486575841, "learning_rate": 8.974358974358973e-07, "loss": 0.5419, "step": 10 }, { "epoch": 0.07, "grad_norm": 2.8423737121143846, "learning_rate": 9.871794871794872e-07, "loss": 0.5734, "step": 11 }, { "epoch": 0.08, "grad_norm": 2.8530931377361877, "learning_rate": 1.0769230769230769e-06, "loss": 0.6041, "step": 12 }, { "epoch": 0.08, "grad_norm": 2.549681259857384, "learning_rate": 1.1666666666666666e-06, "loss": 0.5621, "step": 13 }, { "epoch": 0.09, "grad_norm": 2.5648145355299543, "learning_rate": 1.2564102564102565e-06, "loss": 0.6351, "step": 14 }, { "epoch": 0.1, "grad_norm": 2.6187703396840027, "learning_rate": 1.3461538461538462e-06, "loss": 0.5119, "step": 15 }, { "epoch": 0.1, "grad_norm": 2.4643502254723932, "learning_rate": 1.4358974358974359e-06, "loss": 0.6282, "step": 16 }, { "epoch": 0.11, "grad_norm": 2.0662967591437957, "learning_rate": 1.5256410256410255e-06, "loss": 0.5957, "step": 17 }, { "epoch": 0.12, "grad_norm": 2.1946771330544164, "learning_rate": 1.6153846153846154e-06, "loss": 0.6085, "step": 18 }, { "epoch": 0.12, "grad_norm": 2.0721275066847005, "learning_rate": 1.7051282051282051e-06, "loss": 0.5127, "step": 19 }, { "epoch": 0.13, "grad_norm": 2.088536700329697, "learning_rate": 1.7948717948717946e-06, "loss": 0.512, "step": 20 }, { "epoch": 0.13, "grad_norm": 2.0169376562070176, "learning_rate": 1.8846153846153845e-06, "loss": 0.5363, "step": 21 }, { "epoch": 0.14, "grad_norm": 1.9535445729471628, "learning_rate": 1.9743589743589744e-06, "loss": 0.5398, "step": 22 }, { "epoch": 0.15, "grad_norm": 1.8891775230026828, "learning_rate": 2.064102564102564e-06, "loss": 0.4532, "step": 23 }, { "epoch": 0.15, "grad_norm": 2.033499357097621, "learning_rate": 2.1538461538461538e-06, "loss": 0.5398, "step": 24 }, { "epoch": 0.16, "grad_norm": 2.0971684944020184, "learning_rate": 2.243589743589744e-06, "loss": 0.5513, "step": 25 }, { "epoch": 0.17, "grad_norm": 1.9400424507855618, "learning_rate": 2.333333333333333e-06, "loss": 0.4096, "step": 26 }, { "epoch": 0.17, "grad_norm": 1.7906588387942997, "learning_rate": 2.423076923076923e-06, "loss": 0.5276, "step": 27 }, { "epoch": 0.18, "grad_norm": 2.0798185942573713, "learning_rate": 2.512820512820513e-06, "loss": 0.5219, "step": 28 }, { "epoch": 0.19, "grad_norm": 1.8753577770843581, "learning_rate": 2.6025641025641026e-06, "loss": 0.5799, "step": 29 }, { "epoch": 0.19, "grad_norm": 1.9186158642556645, "learning_rate": 2.6923076923076923e-06, "loss": 0.5151, "step": 30 }, { "epoch": 0.2, "grad_norm": 1.855939003192239, "learning_rate": 2.782051282051282e-06, "loss": 0.496, "step": 31 }, { "epoch": 0.21, "grad_norm": 1.8486864087415182, "learning_rate": 2.8717948717948717e-06, "loss": 0.5204, "step": 32 }, { "epoch": 0.21, "grad_norm": 1.8021726042394228, "learning_rate": 2.9615384615384614e-06, "loss": 0.5043, "step": 33 }, { "epoch": 0.22, "grad_norm": 2.0357378109965776, "learning_rate": 3.051282051282051e-06, "loss": 0.5788, "step": 34 }, { "epoch": 0.22, "grad_norm": 1.7843582385646046, "learning_rate": 3.141025641025641e-06, "loss": 0.4452, "step": 35 }, { "epoch": 0.23, "grad_norm": 2.0000957562889896, "learning_rate": 3.230769230769231e-06, "loss": 0.5486, "step": 36 }, { "epoch": 0.24, "grad_norm": 1.861686036469186, "learning_rate": 3.32051282051282e-06, "loss": 0.468, "step": 37 }, { "epoch": 0.24, "grad_norm": 1.9887255288842596, "learning_rate": 3.4102564102564103e-06, "loss": 0.4729, "step": 38 }, { "epoch": 0.25, "grad_norm": 2.0718650709658224, "learning_rate": 3.5e-06, "loss": 0.5457, "step": 39 }, { "epoch": 0.26, "grad_norm": 1.9777425001308604, "learning_rate": 3.5897435897435892e-06, "loss": 0.5257, "step": 40 }, { "epoch": 0.26, "grad_norm": 1.6671876468169715, "learning_rate": 3.6794871794871797e-06, "loss": 0.4937, "step": 41 }, { "epoch": 0.27, "grad_norm": 1.7049607096852184, "learning_rate": 3.769230769230769e-06, "loss": 0.4769, "step": 42 }, { "epoch": 0.28, "grad_norm": 1.7345139622673087, "learning_rate": 3.858974358974359e-06, "loss": 0.3981, "step": 43 }, { "epoch": 0.28, "grad_norm": 1.9180834947990224, "learning_rate": 3.948717948717949e-06, "loss": 0.536, "step": 44 }, { "epoch": 0.29, "grad_norm": 1.8152016108847437, "learning_rate": 4.038461538461538e-06, "loss": 0.4885, "step": 45 }, { "epoch": 0.29, "grad_norm": 1.9589341729000858, "learning_rate": 4.128205128205128e-06, "loss": 0.5547, "step": 46 }, { "epoch": 0.3, "grad_norm": 1.6766248131889971, "learning_rate": 4.217948717948718e-06, "loss": 0.4473, "step": 47 }, { "epoch": 0.31, "grad_norm": 1.7792287546251313, "learning_rate": 4.3076923076923076e-06, "loss": 0.5166, "step": 48 }, { "epoch": 0.31, "grad_norm": 1.6833930033458306, "learning_rate": 4.397435897435897e-06, "loss": 0.5004, "step": 49 }, { "epoch": 0.32, "grad_norm": 1.766876180946489, "learning_rate": 4.487179487179488e-06, "loss": 0.5508, "step": 50 }, { "epoch": 0.33, "grad_norm": 1.5947570972439933, "learning_rate": 4.576923076923077e-06, "loss": 0.3858, "step": 51 }, { "epoch": 0.33, "grad_norm": 1.7707975081953566, "learning_rate": 4.666666666666666e-06, "loss": 0.4734, "step": 52 }, { "epoch": 0.34, "grad_norm": 1.496511191973904, "learning_rate": 4.756410256410257e-06, "loss": 0.4058, "step": 53 }, { "epoch": 0.35, "grad_norm": 1.5824823317523948, "learning_rate": 4.846153846153846e-06, "loss": 0.4332, "step": 54 }, { "epoch": 0.35, "grad_norm": 1.7090461388411984, "learning_rate": 4.935897435897436e-06, "loss": 0.4714, "step": 55 }, { "epoch": 0.36, "grad_norm": 1.7426447758933667, "learning_rate": 5.025641025641026e-06, "loss": 0.4239, "step": 56 }, { "epoch": 0.37, "grad_norm": 1.6842573390959508, "learning_rate": 5.115384615384615e-06, "loss": 0.4872, "step": 57 }, { "epoch": 0.37, "grad_norm": 1.5768381008703216, "learning_rate": 5.205128205128205e-06, "loss": 0.4409, "step": 58 }, { "epoch": 0.38, "grad_norm": 1.9230070457298252, "learning_rate": 5.294871794871795e-06, "loss": 0.404, "step": 59 }, { "epoch": 0.38, "grad_norm": 1.676341438975508, "learning_rate": 5.384615384615385e-06, "loss": 0.5008, "step": 60 }, { "epoch": 0.39, "grad_norm": 1.6303835534795739, "learning_rate": 5.474358974358974e-06, "loss": 0.4683, "step": 61 }, { "epoch": 0.4, "grad_norm": 1.9373304143465806, "learning_rate": 5.564102564102564e-06, "loss": 0.5097, "step": 62 }, { "epoch": 0.4, "grad_norm": 1.8345122891614065, "learning_rate": 5.653846153846154e-06, "loss": 0.5298, "step": 63 }, { "epoch": 0.41, "grad_norm": 1.7382245856352698, "learning_rate": 5.743589743589743e-06, "loss": 0.6012, "step": 64 }, { "epoch": 0.42, "grad_norm": 1.6453526661220774, "learning_rate": 5.833333333333333e-06, "loss": 0.4989, "step": 65 }, { "epoch": 0.42, "grad_norm": 2.3484183912221845, "learning_rate": 5.923076923076923e-06, "loss": 0.4826, "step": 66 }, { "epoch": 0.43, "grad_norm": 1.8908943045409214, "learning_rate": 6.0128205128205125e-06, "loss": 0.4744, "step": 67 }, { "epoch": 0.44, "grad_norm": 1.8316643602939897, "learning_rate": 6.102564102564102e-06, "loss": 0.446, "step": 68 }, { "epoch": 0.44, "grad_norm": 1.8795168400920537, "learning_rate": 6.192307692307692e-06, "loss": 0.5211, "step": 69 }, { "epoch": 0.45, "grad_norm": 1.7156763474826677, "learning_rate": 6.282051282051282e-06, "loss": 0.4573, "step": 70 }, { "epoch": 0.46, "grad_norm": 1.697549263992999, "learning_rate": 6.371794871794871e-06, "loss": 0.5127, "step": 71 }, { "epoch": 0.46, "grad_norm": 1.8628486662084527, "learning_rate": 6.461538461538462e-06, "loss": 0.368, "step": 72 }, { "epoch": 0.47, "grad_norm": 1.7585623399087367, "learning_rate": 6.5512820512820515e-06, "loss": 0.4697, "step": 73 }, { "epoch": 0.47, "grad_norm": 1.6350247762941712, "learning_rate": 6.64102564102564e-06, "loss": 0.4568, "step": 74 }, { "epoch": 0.48, "grad_norm": 1.625191280670622, "learning_rate": 6.730769230769231e-06, "loss": 0.4232, "step": 75 }, { "epoch": 0.49, "grad_norm": 1.8473726342325616, "learning_rate": 6.8205128205128205e-06, "loss": 0.5019, "step": 76 }, { "epoch": 0.49, "grad_norm": 1.743248260743822, "learning_rate": 6.91025641025641e-06, "loss": 0.475, "step": 77 }, { "epoch": 0.5, "grad_norm": 1.62305010114068, "learning_rate": 7e-06, "loss": 0.4467, "step": 78 }, { "epoch": 0.51, "grad_norm": 1.7466573021923082, "learning_rate": 6.9999649520318915e-06, "loss": 0.4301, "step": 79 }, { "epoch": 0.51, "grad_norm": 1.6130437414343075, "learning_rate": 6.999859808829483e-06, "loss": 0.4234, "step": 80 }, { "epoch": 0.52, "grad_norm": 1.714296938271923, "learning_rate": 6.999684572498523e-06, "loss": 0.4479, "step": 81 }, { "epoch": 0.53, "grad_norm": 1.6478015443180727, "learning_rate": 6.999439246548541e-06, "loss": 0.4092, "step": 82 }, { "epoch": 0.53, "grad_norm": 1.772703416513747, "learning_rate": 6.999123835892781e-06, "loss": 0.504, "step": 83 }, { "epoch": 0.54, "grad_norm": 1.8216260385526901, "learning_rate": 6.998738346848099e-06, "loss": 0.5069, "step": 84 }, { "epoch": 0.54, "grad_norm": 1.6744716503404546, "learning_rate": 6.998282787134845e-06, "loss": 0.4726, "step": 85 }, { "epoch": 0.55, "grad_norm": 1.772184185014806, "learning_rate": 6.997757165876698e-06, "loss": 0.5105, "step": 86 }, { "epoch": 0.56, "grad_norm": 1.7107134524844578, "learning_rate": 6.9971614936004935e-06, "loss": 0.4361, "step": 87 }, { "epoch": 0.56, "grad_norm": 1.7373273309454584, "learning_rate": 6.996495782236003e-06, "loss": 0.4909, "step": 88 }, { "epoch": 0.57, "grad_norm": 1.73191097304193, "learning_rate": 6.9957600451157e-06, "loss": 0.456, "step": 89 }, { "epoch": 0.58, "grad_norm": 1.543591250610673, "learning_rate": 6.9949542969744955e-06, "loss": 0.4159, "step": 90 }, { "epoch": 0.58, "grad_norm": 1.9119020098570365, "learning_rate": 6.9940785539494385e-06, "loss": 0.5059, "step": 91 }, { "epoch": 0.59, "grad_norm": 1.7231017193785318, "learning_rate": 6.9931328335793926e-06, "loss": 0.4661, "step": 92 }, { "epoch": 0.6, "grad_norm": 1.6314842551489181, "learning_rate": 6.992117154804688e-06, "loss": 0.4521, "step": 93 }, { "epoch": 0.6, "grad_norm": 1.7528843946113917, "learning_rate": 6.991031537966741e-06, "loss": 0.4864, "step": 94 }, { "epoch": 0.61, "grad_norm": 1.773871809828818, "learning_rate": 6.989876004807644e-06, "loss": 0.5162, "step": 95 }, { "epoch": 0.62, "grad_norm": 1.5399343944436803, "learning_rate": 6.9886505784697354e-06, "loss": 0.4338, "step": 96 }, { "epoch": 0.62, "grad_norm": 1.7514912857415266, "learning_rate": 6.98735528349513e-06, "loss": 0.4747, "step": 97 }, { "epoch": 0.63, "grad_norm": 1.555552457796697, "learning_rate": 6.985990145825233e-06, "loss": 0.4169, "step": 98 }, { "epoch": 0.63, "grad_norm": 1.8134922361539518, "learning_rate": 6.984555192800216e-06, "loss": 0.5132, "step": 99 }, { "epoch": 0.64, "grad_norm": 1.897574482765648, "learning_rate": 6.983050453158471e-06, "loss": 0.4651, "step": 100 }, { "epoch": 0.65, "grad_norm": 1.7158733456321913, "learning_rate": 6.981475957036039e-06, "loss": 0.4445, "step": 101 }, { "epoch": 0.65, "grad_norm": 1.4473946002542672, "learning_rate": 6.979831735965997e-06, "loss": 0.4479, "step": 102 }, { "epoch": 0.66, "grad_norm": 1.7162901512740472, "learning_rate": 6.9781178228778385e-06, "loss": 0.4596, "step": 103 }, { "epoch": 0.67, "grad_norm": 1.6337084396881052, "learning_rate": 6.9763342520968e-06, "loss": 0.3578, "step": 104 }, { "epoch": 0.67, "grad_norm": 1.6651285180833972, "learning_rate": 6.974481059343188e-06, "loss": 0.3986, "step": 105 }, { "epoch": 0.68, "grad_norm": 1.5096225766725835, "learning_rate": 6.972558281731655e-06, "loss": 0.384, "step": 106 }, { "epoch": 0.69, "grad_norm": 1.511217953681732, "learning_rate": 6.970565957770456e-06, "loss": 0.4286, "step": 107 }, { "epoch": 0.69, "grad_norm": 1.7549162394009117, "learning_rate": 6.96850412736068e-06, "loss": 0.4961, "step": 108 }, { "epoch": 0.7, "grad_norm": 1.6794558501418015, "learning_rate": 6.9663728317954505e-06, "loss": 0.488, "step": 109 }, { "epoch": 0.71, "grad_norm": 1.8013458849210429, "learning_rate": 6.9641721137591e-06, "loss": 0.5055, "step": 110 }, { "epoch": 0.71, "grad_norm": 1.5223342051762156, "learning_rate": 6.961902017326311e-06, "loss": 0.4384, "step": 111 }, { "epoch": 0.72, "grad_norm": 1.75400017947144, "learning_rate": 6.959562587961235e-06, "loss": 0.4275, "step": 112 }, { "epoch": 0.72, "grad_norm": 1.4432884930631757, "learning_rate": 6.9571538725165855e-06, "loss": 0.3806, "step": 113 }, { "epoch": 0.73, "grad_norm": 1.3728512834355604, "learning_rate": 6.9546759192326944e-06, "loss": 0.3347, "step": 114 }, { "epoch": 0.74, "grad_norm": 1.7014083625164993, "learning_rate": 6.95212877773655e-06, "loss": 0.4893, "step": 115 }, { "epoch": 0.74, "grad_norm": 1.7435972771230723, "learning_rate": 6.949512499040799e-06, "loss": 0.4518, "step": 116 }, { "epoch": 0.75, "grad_norm": 1.6443871246884252, "learning_rate": 6.946827135542729e-06, "loss": 0.4675, "step": 117 }, { "epoch": 0.76, "grad_norm": 1.4381128278032245, "learning_rate": 6.944072741023215e-06, "loss": 0.3713, "step": 118 }, { "epoch": 0.76, "grad_norm": 1.6569315631240311, "learning_rate": 6.941249370645649e-06, "loss": 0.4382, "step": 119 }, { "epoch": 0.77, "grad_norm": 1.6251022822204857, "learning_rate": 6.938357080954826e-06, "loss": 0.4447, "step": 120 }, { "epoch": 0.78, "grad_norm": 1.7598566755719935, "learning_rate": 6.935395929875821e-06, "loss": 0.4624, "step": 121 }, { "epoch": 0.78, "grad_norm": 1.5423964387806426, "learning_rate": 6.93236597671282e-06, "loss": 0.3977, "step": 122 }, { "epoch": 0.79, "grad_norm": 1.7628631989109886, "learning_rate": 6.929267282147936e-06, "loss": 0.446, "step": 123 }, { "epoch": 0.79, "grad_norm": 1.6468615154504636, "learning_rate": 6.9260999082400014e-06, "loss": 0.474, "step": 124 }, { "epoch": 0.8, "grad_norm": 1.69755130421299, "learning_rate": 6.922863918423311e-06, "loss": 0.4332, "step": 125 }, { "epoch": 0.81, "grad_norm": 1.8083022682191605, "learning_rate": 6.91955937750636e-06, "loss": 0.5805, "step": 126 }, { "epoch": 0.81, "grad_norm": 1.6518789318820335, "learning_rate": 6.916186351670546e-06, "loss": 0.4257, "step": 127 }, { "epoch": 0.82, "grad_norm": 1.488823939533215, "learning_rate": 6.912744908468841e-06, "loss": 0.3751, "step": 128 }, { "epoch": 0.83, "grad_norm": 1.7757966013804762, "learning_rate": 6.909235116824441e-06, "loss": 0.469, "step": 129 }, { "epoch": 0.83, "grad_norm": 1.592505278493427, "learning_rate": 6.905657047029383e-06, "loss": 0.4143, "step": 130 }, { "epoch": 0.84, "grad_norm": 1.7305116612692355, "learning_rate": 6.90201077074314e-06, "loss": 0.4315, "step": 131 }, { "epoch": 0.85, "grad_norm": 1.7311239078049492, "learning_rate": 6.898296360991182e-06, "loss": 0.4603, "step": 132 }, { "epoch": 0.85, "grad_norm": 1.7269770749278417, "learning_rate": 6.894513892163519e-06, "loss": 0.4569, "step": 133 }, { "epoch": 0.86, "grad_norm": 1.7096122263349194, "learning_rate": 6.890663440013204e-06, "loss": 0.4859, "step": 134 }, { "epoch": 0.87, "grad_norm": 1.8247290588131861, "learning_rate": 6.886745081654823e-06, "loss": 0.4798, "step": 135 }, { "epoch": 0.87, "grad_norm": 1.6368752515489966, "learning_rate": 6.882758895562948e-06, "loss": 0.4624, "step": 136 }, { "epoch": 0.88, "grad_norm": 1.7503694358624682, "learning_rate": 6.8787049615705635e-06, "loss": 0.4587, "step": 137 }, { "epoch": 0.88, "grad_norm": 1.6383457284082776, "learning_rate": 6.8745833608674685e-06, "loss": 0.4008, "step": 138 }, { "epoch": 0.89, "grad_norm": 1.5622812578004324, "learning_rate": 6.870394175998651e-06, "loss": 0.4348, "step": 139 }, { "epoch": 0.9, "grad_norm": 1.5762025328026612, "learning_rate": 6.866137490862636e-06, "loss": 0.4095, "step": 140 }, { "epoch": 0.9, "grad_norm": 1.673415899230875, "learning_rate": 6.861813390709803e-06, "loss": 0.4659, "step": 141 }, { "epoch": 0.91, "grad_norm": 1.5962555180641838, "learning_rate": 6.857421962140681e-06, "loss": 0.4213, "step": 142 }, { "epoch": 0.92, "grad_norm": 1.7562554854422003, "learning_rate": 6.852963293104211e-06, "loss": 0.5064, "step": 143 }, { "epoch": 0.92, "grad_norm": 1.6633585065572871, "learning_rate": 6.848437472895989e-06, "loss": 0.4975, "step": 144 }, { "epoch": 0.93, "grad_norm": 1.7373610441369685, "learning_rate": 6.84384459215647e-06, "loss": 0.4097, "step": 145 }, { "epoch": 0.94, "grad_norm": 1.8113935248295507, "learning_rate": 6.839184742869166e-06, "loss": 0.539, "step": 146 }, { "epoch": 0.94, "grad_norm": 1.6602786354588224, "learning_rate": 6.8344580183587866e-06, "loss": 0.4478, "step": 147 }, { "epoch": 0.95, "grad_norm": 1.738271233998496, "learning_rate": 6.829664513289387e-06, "loss": 0.424, "step": 148 }, { "epoch": 0.96, "grad_norm": 1.814406857491063, "learning_rate": 6.824804323662456e-06, "loss": 0.5051, "step": 149 }, { "epoch": 0.96, "grad_norm": 1.678131337111829, "learning_rate": 6.8198775468150085e-06, "loss": 0.5028, "step": 150 }, { "epoch": 0.97, "grad_norm": 1.6059802835602406, "learning_rate": 6.814884281417627e-06, "loss": 0.4861, "step": 151 }, { "epoch": 0.97, "grad_norm": 1.5551551029291752, "learning_rate": 6.8098246274724835e-06, "loss": 0.4236, "step": 152 }, { "epoch": 0.98, "grad_norm": 1.6508065774889, "learning_rate": 6.8046986863113455e-06, "loss": 0.5092, "step": 153 }, { "epoch": 0.99, "grad_norm": 1.6796345029479292, "learning_rate": 6.7995065605935405e-06, "loss": 0.4368, "step": 154 }, { "epoch": 0.99, "grad_norm": 1.6592107174838626, "learning_rate": 6.7942483543039e-06, "loss": 0.3755, "step": 155 }, { "epoch": 1.0, "grad_norm": 1.4850945503983306, "learning_rate": 6.788924172750679e-06, "loss": 0.419, "step": 156 }, { "epoch": 1.01, "grad_norm": 1.636480713605989, "learning_rate": 6.783534122563447e-06, "loss": 0.4985, "step": 157 }, { "epoch": 1.01, "grad_norm": 1.4905731963500435, "learning_rate": 6.7780783116909495e-06, "loss": 0.3661, "step": 158 }, { "epoch": 1.02, "grad_norm": 1.656763138900859, "learning_rate": 6.772556849398952e-06, "loss": 0.4249, "step": 159 }, { "epoch": 1.03, "grad_norm": 1.6619926692208726, "learning_rate": 6.7669698462680434e-06, "loss": 0.4119, "step": 160 }, { "epoch": 1.03, "grad_norm": 1.5244839973693285, "learning_rate": 6.761317414191428e-06, "loss": 0.3802, "step": 161 }, { "epoch": 1.04, "grad_norm": 1.705620627532888, "learning_rate": 6.755599666372685e-06, "loss": 0.439, "step": 162 }, { "epoch": 1.04, "grad_norm": 1.6267790742105397, "learning_rate": 6.749816717323493e-06, "loss": 0.3791, "step": 163 }, { "epoch": 1.05, "grad_norm": 1.6069385218303462, "learning_rate": 6.743968682861346e-06, "loss": 0.4668, "step": 164 }, { "epoch": 1.06, "grad_norm": 1.656574398737437, "learning_rate": 6.738055680107233e-06, "loss": 0.404, "step": 165 }, { "epoch": 1.06, "grad_norm": 1.4902964557198772, "learning_rate": 6.7320778274832836e-06, "loss": 0.3513, "step": 166 }, { "epoch": 1.07, "grad_norm": 1.5935267332780712, "learning_rate": 6.726035244710406e-06, "loss": 0.3731, "step": 167 }, { "epoch": 1.08, "grad_norm": 1.6490590043388837, "learning_rate": 6.7199280528058844e-06, "loss": 0.3951, "step": 168 }, { "epoch": 1.08, "grad_norm": 1.5982484017746137, "learning_rate": 6.713756374080959e-06, "loss": 0.3872, "step": 169 }, { "epoch": 1.09, "grad_norm": 1.7217766197864206, "learning_rate": 6.70752033213837e-06, "loss": 0.423, "step": 170 }, { "epoch": 1.1, "grad_norm": 1.616116261962514, "learning_rate": 6.7012200518698904e-06, "loss": 0.3299, "step": 171 }, { "epoch": 1.1, "grad_norm": 1.7860181512527011, "learning_rate": 6.6948556594538185e-06, "loss": 0.4138, "step": 172 }, { "epoch": 1.11, "grad_norm": 1.5952278874239347, "learning_rate": 6.688427282352449e-06, "loss": 0.4096, "step": 173 }, { "epoch": 1.12, "grad_norm": 1.7136967077306529, "learning_rate": 6.681935049309533e-06, "loss": 0.4069, "step": 174 }, { "epoch": 1.12, "grad_norm": 1.5749205024444042, "learning_rate": 6.6753790903476814e-06, "loss": 0.3344, "step": 175 }, { "epoch": 1.13, "grad_norm": 1.5363523923720421, "learning_rate": 6.668759536765778e-06, "loss": 0.3373, "step": 176 }, { "epoch": 1.13, "grad_norm": 1.6413945984220908, "learning_rate": 6.6620765211363376e-06, "loss": 0.3469, "step": 177 }, { "epoch": 1.14, "grad_norm": 1.6144791165228076, "learning_rate": 6.655330177302857e-06, "loss": 0.352, "step": 178 }, { "epoch": 1.15, "grad_norm": 1.6043475836227805, "learning_rate": 6.64852064037713e-06, "loss": 0.2867, "step": 179 }, { "epoch": 1.15, "grad_norm": 1.6935976486539148, "learning_rate": 6.6416480467365494e-06, "loss": 0.3408, "step": 180 }, { "epoch": 1.16, "grad_norm": 1.7124502452903085, "learning_rate": 6.634712534021367e-06, "loss": 0.353, "step": 181 }, { "epoch": 1.17, "grad_norm": 1.4992513277039652, "learning_rate": 6.627714241131943e-06, "loss": 0.2486, "step": 182 }, { "epoch": 1.17, "grad_norm": 1.593568781257124, "learning_rate": 6.62065330822596e-06, "loss": 0.3488, "step": 183 }, { "epoch": 1.18, "grad_norm": 1.6009882930842492, "learning_rate": 6.613529876715619e-06, "loss": 0.3361, "step": 184 }, { "epoch": 1.19, "grad_norm": 1.6418303297336283, "learning_rate": 6.606344089264805e-06, "loss": 0.3674, "step": 185 }, { "epoch": 1.19, "grad_norm": 1.6506486655738626, "learning_rate": 6.599096089786234e-06, "loss": 0.3231, "step": 186 }, { "epoch": 1.2, "grad_norm": 1.576597053257656, "learning_rate": 6.591786023438565e-06, "loss": 0.3197, "step": 187 }, { "epoch": 1.21, "grad_norm": 1.5882663115394593, "learning_rate": 6.5844140366234956e-06, "loss": 0.3217, "step": 188 }, { "epoch": 1.21, "grad_norm": 1.5005363443170452, "learning_rate": 6.576980276982832e-06, "loss": 0.3101, "step": 189 }, { "epoch": 1.22, "grad_norm": 1.7065502782313382, "learning_rate": 6.569484893395527e-06, "loss": 0.3607, "step": 190 }, { "epoch": 1.22, "grad_norm": 1.4576343806529883, "learning_rate": 6.5619280359747045e-06, "loss": 0.273, "step": 191 }, { "epoch": 1.23, "grad_norm": 1.6846657499731517, "learning_rate": 6.55430985606465e-06, "loss": 0.3221, "step": 192 }, { "epoch": 1.24, "grad_norm": 1.5588135974495478, "learning_rate": 6.546630506237778e-06, "loss": 0.2656, "step": 193 }, { "epoch": 1.24, "grad_norm": 1.669978644846293, "learning_rate": 6.538890140291578e-06, "loss": 0.2425, "step": 194 }, { "epoch": 1.25, "grad_norm": 1.788539476603885, "learning_rate": 6.531088913245536e-06, "loss": 0.3166, "step": 195 }, { "epoch": 1.26, "grad_norm": 1.7210858323037903, "learning_rate": 6.5232269813380254e-06, "loss": 0.3006, "step": 196 }, { "epoch": 1.26, "grad_norm": 1.702169038700813, "learning_rate": 6.5153045020231855e-06, "loss": 0.2967, "step": 197 }, { "epoch": 1.27, "grad_norm": 1.6563664353691507, "learning_rate": 6.507321633967758e-06, "loss": 0.2713, "step": 198 }, { "epoch": 1.28, "grad_norm": 1.5309368809239376, "learning_rate": 6.499278537047919e-06, "loss": 0.2147, "step": 199 }, { "epoch": 1.28, "grad_norm": 1.7209942943977308, "learning_rate": 6.49117537234607e-06, "loss": 0.3018, "step": 200 }, { "epoch": 1.29, "grad_norm": 1.7129877502808601, "learning_rate": 6.483012302147617e-06, "loss": 0.2729, "step": 201 }, { "epoch": 1.29, "grad_norm": 1.7496718789927703, "learning_rate": 6.474789489937715e-06, "loss": 0.2876, "step": 202 }, { "epoch": 1.3, "grad_norm": 1.5933095070593675, "learning_rate": 6.4665071003979985e-06, "loss": 0.2438, "step": 203 }, { "epoch": 1.31, "grad_norm": 1.7227892810373833, "learning_rate": 6.4581652994032816e-06, "loss": 0.2842, "step": 204 }, { "epoch": 1.31, "grad_norm": 1.6267649305204939, "learning_rate": 6.449764254018236e-06, "loss": 0.2789, "step": 205 }, { "epoch": 1.32, "grad_norm": 1.5850041512660988, "learning_rate": 6.441304132494045e-06, "loss": 0.297, "step": 206 }, { "epoch": 1.33, "grad_norm": 1.4696703594193863, "learning_rate": 6.432785104265034e-06, "loss": 0.2101, "step": 207 }, { "epoch": 1.33, "grad_norm": 1.656066428185347, "learning_rate": 6.424207339945278e-06, "loss": 0.2409, "step": 208 }, { "epoch": 1.34, "grad_norm": 1.4167653617356255, "learning_rate": 6.415571011325181e-06, "loss": 0.2209, "step": 209 }, { "epoch": 1.35, "grad_norm": 1.4983529166587497, "learning_rate": 6.406876291368041e-06, "loss": 0.2368, "step": 210 }, { "epoch": 1.35, "grad_norm": 1.5865913377222434, "learning_rate": 6.3981233542065824e-06, "loss": 0.2506, "step": 211 }, { "epoch": 1.36, "grad_norm": 1.5796504194067498, "learning_rate": 6.3893123751394695e-06, "loss": 0.2091, "step": 212 }, { "epoch": 1.37, "grad_norm": 1.5981542013013652, "learning_rate": 6.380443530627797e-06, "loss": 0.2462, "step": 213 }, { "epoch": 1.37, "grad_norm": 1.490322079361194, "learning_rate": 6.371516998291552e-06, "loss": 0.2259, "step": 214 }, { "epoch": 1.38, "grad_norm": 1.596961848168018, "learning_rate": 6.3625329569060595e-06, "loss": 0.1738, "step": 215 }, { "epoch": 1.38, "grad_norm": 1.5745062034098327, "learning_rate": 6.3534915863984045e-06, "loss": 0.2493, "step": 216 }, { "epoch": 1.39, "grad_norm": 1.5450698814984771, "learning_rate": 6.344393067843825e-06, "loss": 0.2377, "step": 217 }, { "epoch": 1.4, "grad_norm": 1.7029756458678342, "learning_rate": 6.335237583462083e-06, "loss": 0.2416, "step": 218 }, { "epoch": 1.4, "grad_norm": 1.7441512485411754, "learning_rate": 6.326025316613824e-06, "loss": 0.254, "step": 219 }, { "epoch": 1.41, "grad_norm": 1.6605837222679327, "learning_rate": 6.3167564517968944e-06, "loss": 0.314, "step": 220 }, { "epoch": 1.42, "grad_norm": 1.565133849108226, "learning_rate": 6.307431174642653e-06, "loss": 0.244, "step": 221 }, { "epoch": 1.42, "grad_norm": 1.5836143873013986, "learning_rate": 6.2980496719122544e-06, "loss": 0.2213, "step": 222 }, { "epoch": 1.43, "grad_norm": 1.528965218826486, "learning_rate": 6.288612131492901e-06, "loss": 0.2033, "step": 223 }, { "epoch": 1.44, "grad_norm": 1.5075906007645412, "learning_rate": 6.279118742394089e-06, "loss": 0.1995, "step": 224 }, { "epoch": 1.44, "grad_norm": 1.6753818522307469, "learning_rate": 6.2695696947438165e-06, "loss": 0.2259, "step": 225 }, { "epoch": 1.45, "grad_norm": 1.5963342713697721, "learning_rate": 6.25996517978478e-06, "loss": 0.2074, "step": 226 }, { "epoch": 1.46, "grad_norm": 1.619015544516661, "learning_rate": 6.2503053898705416e-06, "loss": 0.2407, "step": 227 }, { "epoch": 1.46, "grad_norm": 1.5457927237812485, "learning_rate": 6.2405905184616776e-06, "loss": 0.1524, "step": 228 }, { "epoch": 1.47, "grad_norm": 1.5882620229865172, "learning_rate": 6.230820760121904e-06, "loss": 0.1884, "step": 229 }, { "epoch": 1.47, "grad_norm": 1.561396094087553, "learning_rate": 6.220996310514181e-06, "loss": 0.1982, "step": 230 }, { "epoch": 1.48, "grad_norm": 1.5817943374237846, "learning_rate": 6.21111736639679e-06, "loss": 0.1935, "step": 231 }, { "epoch": 1.49, "grad_norm": 1.6950787867836972, "learning_rate": 6.201184125619403e-06, "loss": 0.2077, "step": 232 }, { "epoch": 1.49, "grad_norm": 1.6754831923835654, "learning_rate": 6.191196787119104e-06, "loss": 0.213, "step": 233 }, { "epoch": 1.5, "grad_norm": 1.6029146218294437, "learning_rate": 6.181155550916423e-06, "loss": 0.2059, "step": 234 }, { "epoch": 1.51, "grad_norm": 1.5323755489476472, "learning_rate": 6.171060618111317e-06, "loss": 0.174, "step": 235 }, { "epoch": 1.51, "grad_norm": 1.557257858564408, "learning_rate": 6.160912190879146e-06, "loss": 0.1968, "step": 236 }, { "epoch": 1.52, "grad_norm": 1.6171718420036003, "learning_rate": 6.15071047246663e-06, "loss": 0.18, "step": 237 }, { "epoch": 1.53, "grad_norm": 1.480227657890647, "learning_rate": 6.140455667187765e-06, "loss": 0.1717, "step": 238 }, { "epoch": 1.53, "grad_norm": 1.551408341338327, "learning_rate": 6.13014798041975e-06, "loss": 0.2161, "step": 239 }, { "epoch": 1.54, "grad_norm": 1.6207949878686716, "learning_rate": 6.119787618598854e-06, "loss": 0.2148, "step": 240 }, { "epoch": 1.54, "grad_norm": 1.5643103300094428, "learning_rate": 6.109374789216296e-06, "loss": 0.2138, "step": 241 }, { "epoch": 1.55, "grad_norm": 1.6967073475220011, "learning_rate": 6.098909700814082e-06, "loss": 0.2429, "step": 242 }, { "epoch": 1.56, "grad_norm": 1.5728184775316987, "learning_rate": 6.08839256298083e-06, "loss": 0.1942, "step": 243 }, { "epoch": 1.56, "grad_norm": 1.5637927560520009, "learning_rate": 6.077823586347579e-06, "loss": 0.2019, "step": 244 }, { "epoch": 1.57, "grad_norm": 1.568288688763992, "learning_rate": 6.06720298258356e-06, "loss": 0.1926, "step": 245 }, { "epoch": 1.58, "grad_norm": 1.4419153948220185, "learning_rate": 6.056530964391961e-06, "loss": 0.1975, "step": 246 }, { "epoch": 1.58, "grad_norm": 1.686492281338865, "learning_rate": 6.0458077455056704e-06, "loss": 0.219, "step": 247 }, { "epoch": 1.59, "grad_norm": 1.5332314891866488, "learning_rate": 6.035033540682993e-06, "loss": 0.2005, "step": 248 }, { "epoch": 1.6, "grad_norm": 1.5577465097613148, "learning_rate": 6.024208565703351e-06, "loss": 0.2131, "step": 249 }, { "epoch": 1.6, "grad_norm": 1.5794506409096423, "learning_rate": 6.013333037362959e-06, "loss": 0.2096, "step": 250 }, { "epoch": 1.61, "grad_norm": 1.6458078130518516, "learning_rate": 6.002407173470486e-06, "loss": 0.2421, "step": 251 }, { "epoch": 1.62, "grad_norm": 1.5012439233130601, "learning_rate": 5.991431192842692e-06, "loss": 0.2083, "step": 252 }, { "epoch": 1.62, "grad_norm": 1.5760275134953128, "learning_rate": 5.980405315300045e-06, "loss": 0.2047, "step": 253 }, { "epoch": 1.63, "grad_norm": 1.4555813197229432, "learning_rate": 5.969329761662319e-06, "loss": 0.1822, "step": 254 }, { "epoch": 1.63, "grad_norm": 1.5857225468346239, "learning_rate": 5.9582047537441716e-06, "loss": 0.219, "step": 255 }, { "epoch": 1.64, "grad_norm": 1.745530982159658, "learning_rate": 5.9470305143507e-06, "loss": 0.1774, "step": 256 }, { "epoch": 1.65, "grad_norm": 1.6320256493079581, "learning_rate": 5.9358072672729845e-06, "loss": 0.1942, "step": 257 }, { "epoch": 1.65, "grad_norm": 1.6361095890331265, "learning_rate": 5.924535237283598e-06, "loss": 0.2505, "step": 258 }, { "epoch": 1.66, "grad_norm": 1.5993814196455234, "learning_rate": 5.913214650132112e-06, "loss": 0.1935, "step": 259 }, { "epoch": 1.67, "grad_norm": 1.3895790850912417, "learning_rate": 5.901845732540568e-06, "loss": 0.1309, "step": 260 }, { "epoch": 1.67, "grad_norm": 1.4752869859546751, "learning_rate": 5.8904287121989455e-06, "loss": 0.1537, "step": 261 }, { "epoch": 1.68, "grad_norm": 1.5193526100418906, "learning_rate": 5.878963817760597e-06, "loss": 0.1722, "step": 262 }, { "epoch": 1.69, "grad_norm": 1.4815632509360366, "learning_rate": 5.867451278837666e-06, "loss": 0.1962, "step": 263 }, { "epoch": 1.69, "grad_norm": 1.6845916508822236, "learning_rate": 5.855891325996495e-06, "loss": 0.2095, "step": 264 }, { "epoch": 1.7, "grad_norm": 1.8207264207099068, "learning_rate": 5.8442841907530035e-06, "loss": 0.2485, "step": 265 }, { "epoch": 1.71, "grad_norm": 1.5523354061842987, "learning_rate": 5.83263010556805e-06, "loss": 0.2203, "step": 266 }, { "epoch": 1.71, "grad_norm": 1.488592846305629, "learning_rate": 5.820929303842783e-06, "loss": 0.2163, "step": 267 }, { "epoch": 1.72, "grad_norm": 1.5016962959606086, "learning_rate": 5.809182019913959e-06, "loss": 0.1769, "step": 268 }, { "epoch": 1.72, "grad_norm": 1.4132734874193642, "learning_rate": 5.797388489049253e-06, "loss": 0.1891, "step": 269 }, { "epoch": 1.73, "grad_norm": 1.3907078826471153, "learning_rate": 5.785548947442547e-06, "loss": 0.1639, "step": 270 }, { "epoch": 1.74, "grad_norm": 1.5952473846369721, "learning_rate": 5.7736636322092016e-06, "loss": 0.2225, "step": 271 }, { "epoch": 1.74, "grad_norm": 1.5983856197422255, "learning_rate": 5.7617327813813e-06, "loss": 0.1967, "step": 272 }, { "epoch": 1.75, "grad_norm": 1.6251445895164713, "learning_rate": 5.749756633902887e-06, "loss": 0.2141, "step": 273 }, { "epoch": 1.76, "grad_norm": 1.5239667480122763, "learning_rate": 5.7377354296251855e-06, "loss": 0.1789, "step": 274 }, { "epoch": 1.76, "grad_norm": 1.5733312077254948, "learning_rate": 5.725669409301782e-06, "loss": 0.1891, "step": 275 }, { "epoch": 1.77, "grad_norm": 1.579084689666494, "learning_rate": 5.71355881458382e-06, "loss": 0.2104, "step": 276 }, { "epoch": 1.78, "grad_norm": 1.6098678412924634, "learning_rate": 5.701403888015149e-06, "loss": 0.1883, "step": 277 }, { "epoch": 1.78, "grad_norm": 1.5520664946596863, "learning_rate": 5.689204873027471e-06, "loss": 0.1873, "step": 278 }, { "epoch": 1.79, "grad_norm": 1.643706186330864, "learning_rate": 5.676962013935464e-06, "loss": 0.1833, "step": 279 }, { "epoch": 1.79, "grad_norm": 1.574632884356156, "learning_rate": 5.664675555931892e-06, "loss": 0.2086, "step": 280 }, { "epoch": 1.8, "grad_norm": 1.7040654897451826, "learning_rate": 5.652345745082691e-06, "loss": 0.1947, "step": 281 }, { "epoch": 1.81, "grad_norm": 1.7990396739414352, "learning_rate": 5.639972828322043e-06, "loss": 0.2834, "step": 282 }, { "epoch": 1.81, "grad_norm": 1.545632160225994, "learning_rate": 5.627557053447427e-06, "loss": 0.1823, "step": 283 }, { "epoch": 1.82, "grad_norm": 1.4802183704681011, "learning_rate": 5.615098669114664e-06, "loss": 0.1591, "step": 284 }, { "epoch": 1.83, "grad_norm": 1.6097297868594553, "learning_rate": 5.6025979248329265e-06, "loss": 0.1851, "step": 285 }, { "epoch": 1.83, "grad_norm": 1.5317448245457972, "learning_rate": 5.590055070959752e-06, "loss": 0.1933, "step": 286 }, { "epoch": 1.84, "grad_norm": 1.6207216281170533, "learning_rate": 5.577470358696021e-06, "loss": 0.187, "step": 287 }, { "epoch": 1.85, "grad_norm": 1.6156124581387827, "learning_rate": 5.564844040080931e-06, "loss": 0.2033, "step": 288 }, { "epoch": 1.85, "grad_norm": 1.6248007053651101, "learning_rate": 5.5521763679869445e-06, "loss": 0.2027, "step": 289 }, { "epoch": 1.86, "grad_norm": 1.703201832873631, "learning_rate": 5.53946759611473e-06, "loss": 0.2355, "step": 290 }, { "epoch": 1.87, "grad_norm": 1.6347182792762331, "learning_rate": 5.526717978988076e-06, "loss": 0.196, "step": 291 }, { "epoch": 1.87, "grad_norm": 1.6347383876881978, "learning_rate": 5.513927771948798e-06, "loss": 0.2136, "step": 292 }, { "epoch": 1.88, "grad_norm": 1.6210512023086405, "learning_rate": 5.5010972311516184e-06, "loss": 0.2023, "step": 293 }, { "epoch": 1.88, "grad_norm": 1.5825507774419676, "learning_rate": 5.488226613559045e-06, "loss": 0.174, "step": 294 }, { "epoch": 1.89, "grad_norm": 1.6023419187659833, "learning_rate": 5.475316176936217e-06, "loss": 0.1988, "step": 295 }, { "epoch": 1.9, "grad_norm": 1.5025326070404128, "learning_rate": 5.462366179845746e-06, "loss": 0.182, "step": 296 }, { "epoch": 1.9, "grad_norm": 1.599375014112513, "learning_rate": 5.449376881642537e-06, "loss": 0.2045, "step": 297 }, { "epoch": 1.91, "grad_norm": 1.5465339822704995, "learning_rate": 5.436348542468598e-06, "loss": 0.2029, "step": 298 }, { "epoch": 1.92, "grad_norm": 1.6720010038710715, "learning_rate": 5.423281423247821e-06, "loss": 0.2109, "step": 299 }, { "epoch": 1.92, "grad_norm": 1.6400451846120492, "learning_rate": 5.4101757856807655e-06, "loss": 0.2411, "step": 300 }, { "epoch": 1.93, "grad_norm": 1.6036906269837266, "learning_rate": 5.397031892239414e-06, "loss": 0.1787, "step": 301 }, { "epoch": 1.94, "grad_norm": 1.726874638894533, "learning_rate": 5.383850006161913e-06, "loss": 0.2479, "step": 302 }, { "epoch": 1.94, "grad_norm": 1.5387638769935164, "learning_rate": 5.370630391447303e-06, "loss": 0.1976, "step": 303 }, { "epoch": 1.95, "grad_norm": 1.6078608967191599, "learning_rate": 5.357373312850236e-06, "loss": 0.1819, "step": 304 }, { "epoch": 1.96, "grad_norm": 1.6795308057760014, "learning_rate": 5.3440790358756615e-06, "loss": 0.2084, "step": 305 }, { "epoch": 1.96, "grad_norm": 1.6968139505178514, "learning_rate": 5.330747826773522e-06, "loss": 0.2359, "step": 306 }, { "epoch": 1.97, "grad_norm": 1.6327022776957227, "learning_rate": 5.317379952533411e-06, "loss": 0.2256, "step": 307 }, { "epoch": 1.97, "grad_norm": 1.5535212129206661, "learning_rate": 5.303975680879232e-06, "loss": 0.204, "step": 308 }, { "epoch": 1.98, "grad_norm": 1.6859303635271699, "learning_rate": 5.290535280263835e-06, "loss": 0.2601, "step": 309 }, { "epoch": 1.99, "grad_norm": 1.6081244411677793, "learning_rate": 5.277059019863637e-06, "loss": 0.1961, "step": 310 }, { "epoch": 1.99, "grad_norm": 1.4376728553681852, "learning_rate": 5.263547169573235e-06, "loss": 0.1625, "step": 311 }, { "epoch": 2.0, "grad_norm": 1.4754384583406648, "learning_rate": 5.25e-06, "loss": 0.2084, "step": 312 }, { "epoch": 2.01, "grad_norm": 1.629012127398148, "learning_rate": 5.236417782458656e-06, "loss": 0.2223, "step": 313 }, { "epoch": 2.01, "grad_norm": 1.4053184464424964, "learning_rate": 5.222800788965847e-06, "loss": 0.1541, "step": 314 }, { "epoch": 2.02, "grad_norm": 1.5168677999761422, "learning_rate": 5.2091492922346894e-06, "loss": 0.1835, "step": 315 }, { "epoch": 2.03, "grad_norm": 1.5300778099879906, "learning_rate": 5.195463565669309e-06, "loss": 0.1801, "step": 316 }, { "epoch": 2.03, "grad_norm": 1.5081736458359112, "learning_rate": 5.18174388335937e-06, "loss": 0.1612, "step": 317 }, { "epoch": 2.04, "grad_norm": 1.5873664179989553, "learning_rate": 5.167990520074577e-06, "loss": 0.1686, "step": 318 }, { "epoch": 2.04, "grad_norm": 1.586158856785377, "learning_rate": 5.154203751259183e-06, "loss": 0.1511, "step": 319 }, { "epoch": 2.05, "grad_norm": 1.6403439753706173, "learning_rate": 5.140383853026463e-06, "loss": 0.2097, "step": 320 }, { "epoch": 2.06, "grad_norm": 1.5021116951267204, "learning_rate": 5.12653110215319e-06, "loss": 0.1494, "step": 321 }, { "epoch": 2.06, "grad_norm": 1.5031304131151833, "learning_rate": 5.11264577607409e-06, "loss": 0.155, "step": 322 }, { "epoch": 2.07, "grad_norm": 1.4979853778197805, "learning_rate": 5.098728152876287e-06, "loss": 0.1474, "step": 323 }, { "epoch": 2.08, "grad_norm": 1.583104755788855, "learning_rate": 5.084778511293731e-06, "loss": 0.1551, "step": 324 }, { "epoch": 2.08, "grad_norm": 1.5052221933069465, "learning_rate": 5.070797130701618e-06, "loss": 0.1641, "step": 325 }, { "epoch": 2.09, "grad_norm": 1.5841092318010859, "learning_rate": 5.056784291110794e-06, "loss": 0.1523, "step": 326 }, { "epoch": 2.1, "grad_norm": 1.56655325522275, "learning_rate": 5.04274027316215e-06, "loss": 0.107, "step": 327 }, { "epoch": 2.1, "grad_norm": 1.546032887920805, "learning_rate": 5.028665358120995e-06, "loss": 0.1437, "step": 328 }, { "epoch": 2.11, "grad_norm": 1.6885562930270839, "learning_rate": 5.014559827871426e-06, "loss": 0.1569, "step": 329 }, { "epoch": 2.12, "grad_norm": 1.4483394611955436, "learning_rate": 5.00042396491069e-06, "loss": 0.137, "step": 330 }, { "epoch": 2.12, "grad_norm": 1.4448536208556515, "learning_rate": 4.9862580523435116e-06, "loss": 0.1276, "step": 331 }, { "epoch": 2.13, "grad_norm": 1.4797511760049469, "learning_rate": 4.972062373876435e-06, "loss": 0.1249, "step": 332 }, { "epoch": 2.13, "grad_norm": 1.5847877833278934, "learning_rate": 4.95783721381214e-06, "loss": 0.124, "step": 333 }, { "epoch": 2.14, "grad_norm": 1.5568154599963886, "learning_rate": 4.943582857043742e-06, "loss": 0.1214, "step": 334 }, { "epoch": 2.15, "grad_norm": 1.4041826443755792, "learning_rate": 4.9292995890490945e-06, "loss": 0.0838, "step": 335 }, { "epoch": 2.15, "grad_norm": 1.6919217726047346, "learning_rate": 4.914987695885067e-06, "loss": 0.1051, "step": 336 }, { "epoch": 2.16, "grad_norm": 1.7270792065913758, "learning_rate": 4.900647464181817e-06, "loss": 0.1149, "step": 337 }, { "epoch": 2.17, "grad_norm": 1.4384806218796988, "learning_rate": 4.886279181137049e-06, "loss": 0.0733, "step": 338 }, { "epoch": 2.17, "grad_norm": 1.5148604456334718, "learning_rate": 4.871883134510263e-06, "loss": 0.1204, "step": 339 }, { "epoch": 2.18, "grad_norm": 1.5573623515183024, "learning_rate": 4.8574596126169925e-06, "loss": 0.1158, "step": 340 }, { "epoch": 2.19, "grad_norm": 1.667599893884288, "learning_rate": 4.843008904323029e-06, "loss": 0.1218, "step": 341 }, { "epoch": 2.19, "grad_norm": 1.6336415171179288, "learning_rate": 4.828531299038638e-06, "loss": 0.1064, "step": 342 }, { "epoch": 2.2, "grad_norm": 1.7857223150803634, "learning_rate": 4.81402708671276e-06, "loss": 0.1101, "step": 343 }, { "epoch": 2.21, "grad_norm": 1.690625690003038, "learning_rate": 4.799496557827208e-06, "loss": 0.1141, "step": 344 }, { "epoch": 2.21, "grad_norm": 1.4918432091961047, "learning_rate": 4.7849400033908465e-06, "loss": 0.113, "step": 345 }, { "epoch": 2.22, "grad_norm": 1.616344221375453, "learning_rate": 4.770357714933765e-06, "loss": 0.1263, "step": 346 }, { "epoch": 2.22, "grad_norm": 1.493204431026351, "learning_rate": 4.755749984501437e-06, "loss": 0.0964, "step": 347 }, { "epoch": 2.23, "grad_norm": 1.5095159402266893, "learning_rate": 4.741117104648874e-06, "loss": 0.1008, "step": 348 }, { "epoch": 2.24, "grad_norm": 1.2650625741158785, "learning_rate": 4.726459368434768e-06, "loss": 0.0764, "step": 349 }, { "epoch": 2.24, "grad_norm": 1.4886734247960312, "learning_rate": 4.711777069415615e-06, "loss": 0.0717, "step": 350 }, { "epoch": 2.25, "grad_norm": 1.5028429790381546, "learning_rate": 4.697070501639841e-06, "loss": 0.0978, "step": 351 }, { "epoch": 2.26, "grad_norm": 1.5105459796954572, "learning_rate": 4.682339959641915e-06, "loss": 0.0997, "step": 352 }, { "epoch": 2.26, "grad_norm": 1.4581717109467751, "learning_rate": 4.667585738436448e-06, "loss": 0.0935, "step": 353 }, { "epoch": 2.27, "grad_norm": 1.3990942636732424, "learning_rate": 4.652808133512279e-06, "loss": 0.0848, "step": 354 }, { "epoch": 2.28, "grad_norm": 1.3125915080099024, "learning_rate": 4.638007440826568e-06, "loss": 0.06, "step": 355 }, { "epoch": 2.28, "grad_norm": 1.5922559613481908, "learning_rate": 4.62318395679886e-06, "loss": 0.09, "step": 356 }, { "epoch": 2.29, "grad_norm": 1.6473855344758028, "learning_rate": 4.6083379783051545e-06, "loss": 0.0883, "step": 357 }, { "epoch": 2.29, "grad_norm": 1.4863020248858034, "learning_rate": 4.593469802671951e-06, "loss": 0.0729, "step": 358 }, { "epoch": 2.3, "grad_norm": 1.559787426657181, "learning_rate": 4.5785797276703075e-06, "loss": 0.0678, "step": 359 }, { "epoch": 2.31, "grad_norm": 1.7039467911066226, "learning_rate": 4.563668051509864e-06, "loss": 0.0994, "step": 360 }, { "epoch": 2.31, "grad_norm": 1.7731905609375875, "learning_rate": 4.548735072832879e-06, "loss": 0.1051, "step": 361 }, { "epoch": 2.32, "grad_norm": 1.5882807257594604, "learning_rate": 4.533781090708244e-06, "loss": 0.0919, "step": 362 }, { "epoch": 2.33, "grad_norm": 1.4184027450810133, "learning_rate": 4.518806404625495e-06, "loss": 0.0648, "step": 363 }, { "epoch": 2.33, "grad_norm": 1.5279149470972586, "learning_rate": 4.503811314488816e-06, "loss": 0.0683, "step": 364 }, { "epoch": 2.34, "grad_norm": 1.4225043393982648, "learning_rate": 4.48879612061103e-06, "loss": 0.0767, "step": 365 }, { "epoch": 2.35, "grad_norm": 1.459950562847426, "learning_rate": 4.473761123707584e-06, "loss": 0.0894, "step": 366 }, { "epoch": 2.35, "grad_norm": 1.4577868793787234, "learning_rate": 4.458706624890534e-06, "loss": 0.0863, "step": 367 }, { "epoch": 2.36, "grad_norm": 1.4796537233042946, "learning_rate": 4.443632925662504e-06, "loss": 0.0741, "step": 368 }, { "epoch": 2.37, "grad_norm": 1.3954928355766592, "learning_rate": 4.428540327910652e-06, "loss": 0.0744, "step": 369 }, { "epoch": 2.37, "grad_norm": 1.3599680592870689, "learning_rate": 4.41342913390063e-06, "loss": 0.0769, "step": 370 }, { "epoch": 2.38, "grad_norm": 1.3432745418057788, "learning_rate": 4.398299646270518e-06, "loss": 0.0469, "step": 371 }, { "epoch": 2.38, "grad_norm": 1.3618470064275703, "learning_rate": 4.3831521680247765e-06, "loss": 0.0801, "step": 372 }, { "epoch": 2.39, "grad_norm": 1.335796702130415, "learning_rate": 4.3679870025281644e-06, "loss": 0.0733, "step": 373 }, { "epoch": 2.4, "grad_norm": 1.4972832230856088, "learning_rate": 4.352804453499677e-06, "loss": 0.0701, "step": 374 }, { "epoch": 2.4, "grad_norm": 1.566065032089918, "learning_rate": 4.3376048250064525e-06, "loss": 0.0878, "step": 375 }, { "epoch": 2.41, "grad_norm": 1.9440213727928068, "learning_rate": 4.322388421457687e-06, "loss": 0.1117, "step": 376 }, { "epoch": 2.42, "grad_norm": 1.6075238415353954, "learning_rate": 4.30715554759854e-06, "loss": 0.0834, "step": 377 }, { "epoch": 2.42, "grad_norm": 1.4916088511503431, "learning_rate": 4.2919065085040285e-06, "loss": 0.0674, "step": 378 }, { "epoch": 2.43, "grad_norm": 1.5864994661720633, "learning_rate": 4.276641609572911e-06, "loss": 0.0748, "step": 379 }, { "epoch": 2.44, "grad_norm": 1.5877220777630603, "learning_rate": 4.261361156521586e-06, "loss": 0.0673, "step": 380 }, { "epoch": 2.44, "grad_norm": 1.5952419379371832, "learning_rate": 4.246065455377956e-06, "loss": 0.0723, "step": 381 }, { "epoch": 2.45, "grad_norm": 1.3659196106216351, "learning_rate": 4.230754812475306e-06, "loss": 0.0678, "step": 382 }, { "epoch": 2.46, "grad_norm": 1.5150129082800552, "learning_rate": 4.215429534446161e-06, "loss": 0.0873, "step": 383 }, { "epoch": 2.46, "grad_norm": 1.328231293830379, "learning_rate": 4.200089928216156e-06, "loss": 0.0469, "step": 384 }, { "epoch": 2.47, "grad_norm": 1.3644208951729628, "learning_rate": 4.1847363009978776e-06, "loss": 0.0558, "step": 385 }, { "epoch": 2.47, "grad_norm": 1.3631190033041523, "learning_rate": 4.169368960284718e-06, "loss": 0.0642, "step": 386 }, { "epoch": 2.48, "grad_norm": 1.3380963083993747, "learning_rate": 4.153988213844717e-06, "loss": 0.0651, "step": 387 }, { "epoch": 2.49, "grad_norm": 1.6232655969183094, "learning_rate": 4.138594369714394e-06, "loss": 0.0768, "step": 388 }, { "epoch": 2.49, "grad_norm": 1.738615411907169, "learning_rate": 4.123187736192583e-06, "loss": 0.0631, "step": 389 }, { "epoch": 2.5, "grad_norm": 1.3541880062004612, "learning_rate": 4.107768621834257e-06, "loss": 0.0646, "step": 390 }, { "epoch": 2.51, "grad_norm": 1.5937219608859803, "learning_rate": 4.092337335444343e-06, "loss": 0.0543, "step": 391 }, { "epoch": 2.51, "grad_norm": 1.2652905019689844, "learning_rate": 4.076894186071548e-06, "loss": 0.0624, "step": 392 }, { "epoch": 2.52, "grad_norm": 1.585692487433423, "learning_rate": 4.061439483002161e-06, "loss": 0.056, "step": 393 }, { "epoch": 2.53, "grad_norm": 1.4234670685396151, "learning_rate": 4.045973535753863e-06, "loss": 0.0563, "step": 394 }, { "epoch": 2.53, "grad_norm": 1.3660930904759059, "learning_rate": 4.030496654069524e-06, "loss": 0.0683, "step": 395 }, { "epoch": 2.54, "grad_norm": 1.5737973741759328, "learning_rate": 4.015009147911007e-06, "loss": 0.072, "step": 396 }, { "epoch": 2.54, "grad_norm": 1.4929980457042287, "learning_rate": 3.9995113274529506e-06, "loss": 0.0735, "step": 397 }, { "epoch": 2.55, "grad_norm": 1.4565626149680446, "learning_rate": 3.984003503076566e-06, "loss": 0.0871, "step": 398 }, { "epoch": 2.56, "grad_norm": 1.3794177772470047, "learning_rate": 3.968485985363416e-06, "loss": 0.0614, "step": 399 }, { "epoch": 2.56, "grad_norm": 1.44393677049457, "learning_rate": 3.952959085089193e-06, "loss": 0.0632, "step": 400 }, { "epoch": 2.57, "grad_norm": 1.3495277752899084, "learning_rate": 3.937423113217505e-06, "loss": 0.0619, "step": 401 }, { "epoch": 2.58, "grad_norm": 1.2902982047483085, "learning_rate": 3.92187838089363e-06, "loss": 0.0624, "step": 402 }, { "epoch": 2.58, "grad_norm": 1.4252355923208517, "learning_rate": 3.9063251994383055e-06, "loss": 0.0619, "step": 403 }, { "epoch": 2.59, "grad_norm": 1.3702111372981403, "learning_rate": 3.8907638803414774e-06, "loss": 0.0659, "step": 404 }, { "epoch": 2.6, "grad_norm": 1.358396927631374, "learning_rate": 3.875194735256067e-06, "loss": 0.0713, "step": 405 }, { "epoch": 2.6, "grad_norm": 1.4249331438647441, "learning_rate": 3.859618075991735e-06, "loss": 0.0658, "step": 406 }, { "epoch": 2.61, "grad_norm": 1.3059396981665108, "learning_rate": 3.844034214508625e-06, "loss": 0.0751, "step": 407 }, { "epoch": 2.62, "grad_norm": 1.3155398633895121, "learning_rate": 3.828443462911128e-06, "loss": 0.0679, "step": 408 }, { "epoch": 2.62, "grad_norm": 1.4663166371568865, "learning_rate": 3.8128461334416223e-06, "loss": 0.0669, "step": 409 }, { "epoch": 2.63, "grad_norm": 1.4387435039719887, "learning_rate": 3.7972425384742264e-06, "loss": 0.0578, "step": 410 }, { "epoch": 2.63, "grad_norm": 1.5441681248495072, "learning_rate": 3.781632990508541e-06, "loss": 0.0668, "step": 411 }, { "epoch": 2.64, "grad_norm": 1.509803326861479, "learning_rate": 3.766017802163386e-06, "loss": 0.0509, "step": 412 }, { "epoch": 2.65, "grad_norm": 1.4135151374037613, "learning_rate": 3.7503972861705478e-06, "loss": 0.0574, "step": 413 }, { "epoch": 2.65, "grad_norm": 1.5520725445811439, "learning_rate": 3.7347717553685084e-06, "loss": 0.0861, "step": 414 }, { "epoch": 2.66, "grad_norm": 1.7837110913227487, "learning_rate": 3.7191415226961867e-06, "loss": 0.0617, "step": 415 }, { "epoch": 2.67, "grad_norm": 1.384690904784385, "learning_rate": 3.703506901186665e-06, "loss": 0.0421, "step": 416 }, { "epoch": 2.67, "grad_norm": 1.58160512232892, "learning_rate": 3.6878682039609253e-06, "loss": 0.0465, "step": 417 }, { "epoch": 2.68, "grad_norm": 1.4583873146929784, "learning_rate": 3.6722257442215736e-06, "loss": 0.054, "step": 418 }, { "epoch": 2.69, "grad_norm": 1.392182390968472, "learning_rate": 3.6565798352465697e-06, "loss": 0.0648, "step": 419 }, { "epoch": 2.69, "grad_norm": 1.410291768179273, "learning_rate": 3.640930790382953e-06, "loss": 0.0539, "step": 420 }, { "epoch": 2.7, "grad_norm": 2.2129885462771672, "learning_rate": 3.625278923040567e-06, "loss": 0.119, "step": 421 }, { "epoch": 2.71, "grad_norm": 1.3548595450124818, "learning_rate": 3.6096245466857808e-06, "loss": 0.0741, "step": 422 }, { "epoch": 2.71, "grad_norm": 1.3440278963530723, "learning_rate": 3.5939679748352146e-06, "loss": 0.0773, "step": 423 }, { "epoch": 2.72, "grad_norm": 1.306530229127965, "learning_rate": 3.578309521049456e-06, "loss": 0.0562, "step": 424 }, { "epoch": 2.72, "grad_norm": 1.4782640258474615, "learning_rate": 3.562649498926785e-06, "loss": 0.0696, "step": 425 }, { "epoch": 2.73, "grad_norm": 1.480966118357646, "learning_rate": 3.546988222096891e-06, "loss": 0.0611, "step": 426 }, { "epoch": 2.74, "grad_norm": 1.465534512953903, "learning_rate": 3.531326004214592e-06, "loss": 0.07, "step": 427 }, { "epoch": 2.74, "grad_norm": 1.3384172211856384, "learning_rate": 3.515663158953552e-06, "loss": 0.0623, "step": 428 }, { "epoch": 2.75, "grad_norm": 1.325181333356099, "learning_rate": 3.5e-06, "loss": 0.0679, "step": 429 }, { "epoch": 2.76, "grad_norm": 1.1268047140927235, "learning_rate": 3.484336841046448e-06, "loss": 0.0549, "step": 430 }, { "epoch": 2.76, "grad_norm": 1.287545657132799, "learning_rate": 3.468673995785409e-06, "loss": 0.0554, "step": 431 }, { "epoch": 2.77, "grad_norm": 1.480635441568218, "learning_rate": 3.4530117779031096e-06, "loss": 0.0721, "step": 432 }, { "epoch": 2.78, "grad_norm": 1.251760291533078, "learning_rate": 3.4373505010732152e-06, "loss": 0.0488, "step": 433 }, { "epoch": 2.78, "grad_norm": 1.3619293632526979, "learning_rate": 3.4216904789505444e-06, "loss": 0.0567, "step": 434 }, { "epoch": 2.79, "grad_norm": 1.3634889684832445, "learning_rate": 3.4060320251647866e-06, "loss": 0.0505, "step": 435 }, { "epoch": 2.79, "grad_norm": 1.43374246584491, "learning_rate": 3.3903754533142195e-06, "loss": 0.0658, "step": 436 }, { "epoch": 2.8, "grad_norm": 1.553124280424844, "learning_rate": 3.374721076959433e-06, "loss": 0.0609, "step": 437 }, { "epoch": 2.81, "grad_norm": 1.7225796812921945, "learning_rate": 3.359069209617048e-06, "loss": 0.1126, "step": 438 }, { "epoch": 2.81, "grad_norm": 1.374779091521454, "learning_rate": 3.3434201647534306e-06, "loss": 0.0529, "step": 439 }, { "epoch": 2.82, "grad_norm": 1.4196983369781266, "learning_rate": 3.3277742557784263e-06, "loss": 0.0468, "step": 440 }, { "epoch": 2.83, "grad_norm": 1.475726843734466, "learning_rate": 3.312131796039074e-06, "loss": 0.0546, "step": 441 }, { "epoch": 2.83, "grad_norm": 1.303979627867413, "learning_rate": 3.296493098813335e-06, "loss": 0.0628, "step": 442 }, { "epoch": 2.84, "grad_norm": 1.3984315031120913, "learning_rate": 3.280858477303813e-06, "loss": 0.0573, "step": 443 }, { "epoch": 2.85, "grad_norm": 1.5264883558014777, "learning_rate": 3.265228244631491e-06, "loss": 0.0669, "step": 444 }, { "epoch": 2.85, "grad_norm": 1.3900299809827257, "learning_rate": 3.2496027138294534e-06, "loss": 0.06, "step": 445 }, { "epoch": 2.86, "grad_norm": 1.621781645789055, "learning_rate": 3.2339821978366144e-06, "loss": 0.087, "step": 446 }, { "epoch": 2.87, "grad_norm": 1.406176966599867, "learning_rate": 3.2183670094914596e-06, "loss": 0.0549, "step": 447 }, { "epoch": 2.87, "grad_norm": 1.4185651707707967, "learning_rate": 3.2027574615257726e-06, "loss": 0.0672, "step": 448 }, { "epoch": 2.88, "grad_norm": 1.4517014206041816, "learning_rate": 3.1871538665583784e-06, "loss": 0.0693, "step": 449 }, { "epoch": 2.88, "grad_norm": 1.3197298201511651, "learning_rate": 3.171556537088873e-06, "loss": 0.051, "step": 450 }, { "epoch": 2.89, "grad_norm": 1.4027977888678176, "learning_rate": 3.155965785491375e-06, "loss": 0.0628, "step": 451 }, { "epoch": 2.9, "grad_norm": 1.3210542029533012, "learning_rate": 3.140381924008266e-06, "loss": 0.0537, "step": 452 }, { "epoch": 2.9, "grad_norm": 1.3983814988636492, "learning_rate": 3.1248052647439327e-06, "loss": 0.0607, "step": 453 }, { "epoch": 2.91, "grad_norm": 1.3381469983224843, "learning_rate": 3.109236119658523e-06, "loss": 0.0629, "step": 454 }, { "epoch": 2.92, "grad_norm": 1.363641990737864, "learning_rate": 3.0936748005616936e-06, "loss": 0.0602, "step": 455 }, { "epoch": 2.92, "grad_norm": 1.337994198492209, "learning_rate": 3.0781216191063695e-06, "loss": 0.0799, "step": 456 }, { "epoch": 2.93, "grad_norm": 1.3335030606735412, "learning_rate": 3.0625768867824957e-06, "loss": 0.0545, "step": 457 }, { "epoch": 2.94, "grad_norm": 1.4769773993333548, "learning_rate": 3.047040914910806e-06, "loss": 0.0783, "step": 458 }, { "epoch": 2.94, "grad_norm": 1.3197947799572127, "learning_rate": 3.0315140146365854e-06, "loss": 0.0584, "step": 459 }, { "epoch": 2.95, "grad_norm": 1.4192189944677764, "learning_rate": 3.015996496923435e-06, "loss": 0.0531, "step": 460 }, { "epoch": 2.96, "grad_norm": 1.419406225194028, "learning_rate": 3.00048867254705e-06, "loss": 0.0583, "step": 461 }, { "epoch": 2.96, "grad_norm": 1.4550693486915325, "learning_rate": 2.9849908520889936e-06, "loss": 0.0714, "step": 462 }, { "epoch": 2.97, "grad_norm": 1.34618411271052, "learning_rate": 2.9695033459304766e-06, "loss": 0.0743, "step": 463 }, { "epoch": 2.97, "grad_norm": 1.4148143017886536, "learning_rate": 2.954026464246138e-06, "loss": 0.0687, "step": 464 }, { "epoch": 2.98, "grad_norm": 1.494976547180411, "learning_rate": 2.9385605169978387e-06, "loss": 0.0905, "step": 465 }, { "epoch": 2.99, "grad_norm": 1.4492598962322225, "learning_rate": 2.923105813928453e-06, "loss": 0.0686, "step": 466 }, { "epoch": 2.99, "grad_norm": 1.254765150326393, "learning_rate": 2.907662664555658e-06, "loss": 0.0492, "step": 467 }, { "epoch": 3.0, "grad_norm": 1.3978603716574316, "learning_rate": 2.8922313781657437e-06, "loss": 0.0733, "step": 468 }, { "epoch": 3.01, "grad_norm": 1.3568449348317895, "learning_rate": 2.876812263807417e-06, "loss": 0.0692, "step": 469 }, { "epoch": 3.01, "grad_norm": 1.5106491687723738, "learning_rate": 2.861405630285606e-06, "loss": 0.0533, "step": 470 }, { "epoch": 3.02, "grad_norm": 1.5179556314161937, "learning_rate": 2.8460117861552833e-06, "loss": 0.055, "step": 471 }, { "epoch": 3.03, "grad_norm": 1.5694567578058851, "learning_rate": 2.8306310397152817e-06, "loss": 0.0598, "step": 472 }, { "epoch": 3.03, "grad_norm": 1.2954069120683234, "learning_rate": 2.815263699002124e-06, "loss": 0.0538, "step": 473 }, { "epoch": 3.04, "grad_norm": 1.3105848212645954, "learning_rate": 2.799910071783845e-06, "loss": 0.0462, "step": 474 }, { "epoch": 3.04, "grad_norm": 1.3098901122342839, "learning_rate": 2.7845704655538383e-06, "loss": 0.0471, "step": 475 }, { "epoch": 3.05, "grad_norm": 1.3134707970328, "learning_rate": 2.7692451875246956e-06, "loss": 0.0683, "step": 476 }, { "epoch": 3.06, "grad_norm": 1.1406228014597528, "learning_rate": 2.7539345446220444e-06, "loss": 0.0477, "step": 477 }, { "epoch": 3.06, "grad_norm": 1.0924596106691973, "learning_rate": 2.7386388434784143e-06, "loss": 0.049, "step": 478 }, { "epoch": 3.07, "grad_norm": 1.257749327487133, "learning_rate": 2.723358390427089e-06, "loss": 0.0481, "step": 479 }, { "epoch": 3.08, "grad_norm": 1.2601405115815025, "learning_rate": 2.708093491495973e-06, "loss": 0.0464, "step": 480 }, { "epoch": 3.08, "grad_norm": 1.2396907483275874, "learning_rate": 2.6928444524014595e-06, "loss": 0.0526, "step": 481 }, { "epoch": 3.09, "grad_norm": 1.220905178010921, "learning_rate": 2.6776115785423123e-06, "loss": 0.0469, "step": 482 }, { "epoch": 3.1, "grad_norm": 1.134133024109752, "learning_rate": 2.6623951749935487e-06, "loss": 0.0334, "step": 483 }, { "epoch": 3.1, "grad_norm": 1.2225938422846696, "learning_rate": 2.6471955465003237e-06, "loss": 0.0405, "step": 484 }, { "epoch": 3.11, "grad_norm": 1.192978092896825, "learning_rate": 2.6320129974718355e-06, "loss": 0.0511, "step": 485 }, { "epoch": 3.12, "grad_norm": 1.2144105464154307, "learning_rate": 2.616847831975224e-06, "loss": 0.043, "step": 486 }, { "epoch": 3.12, "grad_norm": 1.3270694940879673, "learning_rate": 2.601700353729481e-06, "loss": 0.044, "step": 487 }, { "epoch": 3.13, "grad_norm": 1.2909647509362592, "learning_rate": 2.58657086609937e-06, "loss": 0.0382, "step": 488 }, { "epoch": 3.13, "grad_norm": 1.557891789297615, "learning_rate": 2.5714596720893473e-06, "loss": 0.0484, "step": 489 }, { "epoch": 3.14, "grad_norm": 1.3632837500193322, "learning_rate": 2.5563670743374973e-06, "loss": 0.0406, "step": 490 }, { "epoch": 3.15, "grad_norm": 1.128374695442261, "learning_rate": 2.5412933751094662e-06, "loss": 0.0273, "step": 491 }, { "epoch": 3.15, "grad_norm": 1.2324598795816455, "learning_rate": 2.5262388762924157e-06, "loss": 0.0341, "step": 492 }, { "epoch": 3.16, "grad_norm": 1.343419423571675, "learning_rate": 2.5112038793889706e-06, "loss": 0.0373, "step": 493 }, { "epoch": 3.17, "grad_norm": 1.1526235554463253, "learning_rate": 2.496188685511185e-06, "loss": 0.0276, "step": 494 }, { "epoch": 3.17, "grad_norm": 1.201453054797899, "learning_rate": 2.481193595374505e-06, "loss": 0.0439, "step": 495 }, { "epoch": 3.18, "grad_norm": 1.4351161008323083, "learning_rate": 2.4662189092917563e-06, "loss": 0.0429, "step": 496 }, { "epoch": 3.19, "grad_norm": 1.5663274417487951, "learning_rate": 2.4512649271671214e-06, "loss": 0.0554, "step": 497 }, { "epoch": 3.19, "grad_norm": 1.3595537176740793, "learning_rate": 2.436331948490136e-06, "loss": 0.0393, "step": 498 }, { "epoch": 3.2, "grad_norm": 1.4896573920852347, "learning_rate": 2.4214202723296924e-06, "loss": 0.0405, "step": 499 }, { "epoch": 3.21, "grad_norm": 1.2767253714566218, "learning_rate": 2.4065301973280486e-06, "loss": 0.0399, "step": 500 }, { "epoch": 3.21, "grad_norm": 1.2378296334262433, "learning_rate": 2.391662021694847e-06, "loss": 0.0405, "step": 501 }, { "epoch": 3.22, "grad_norm": 1.3468312150337725, "learning_rate": 2.3768160432011395e-06, "loss": 0.0404, "step": 502 }, { "epoch": 3.22, "grad_norm": 1.256283638660665, "learning_rate": 2.3619925591734323e-06, "loss": 0.0333, "step": 503 }, { "epoch": 3.23, "grad_norm": 1.1908716392621157, "learning_rate": 2.3471918664877217e-06, "loss": 0.0338, "step": 504 }, { "epoch": 3.24, "grad_norm": 1.0561449925755435, "learning_rate": 2.332414261563553e-06, "loss": 0.0252, "step": 505 }, { "epoch": 3.24, "grad_norm": 1.0161119154501226, "learning_rate": 2.317660040358085e-06, "loss": 0.0213, "step": 506 }, { "epoch": 3.25, "grad_norm": 1.3396465580980972, "learning_rate": 2.3029294983601598e-06, "loss": 0.033, "step": 507 }, { "epoch": 3.26, "grad_norm": 1.0469883632933366, "learning_rate": 2.2882229305843866e-06, "loss": 0.0275, "step": 508 }, { "epoch": 3.26, "grad_norm": 1.1713965135820492, "learning_rate": 2.2735406315652323e-06, "loss": 0.0334, "step": 509 }, { "epoch": 3.27, "grad_norm": 1.0156278207651381, "learning_rate": 2.258882895351125e-06, "loss": 0.0316, "step": 510 }, { "epoch": 3.28, "grad_norm": 0.9597778807188877, "learning_rate": 2.2442500154985643e-06, "loss": 0.0211, "step": 511 }, { "epoch": 3.28, "grad_norm": 1.056684870602395, "learning_rate": 2.229642285066236e-06, "loss": 0.031, "step": 512 }, { "epoch": 3.29, "grad_norm": 1.2111529373318868, "learning_rate": 2.215059996609154e-06, "loss": 0.034, "step": 513 }, { "epoch": 3.29, "grad_norm": 1.0403499427426597, "learning_rate": 2.200503442172792e-06, "loss": 0.0266, "step": 514 }, { "epoch": 3.3, "grad_norm": 1.023896201002211, "learning_rate": 2.185972913287241e-06, "loss": 0.0245, "step": 515 }, { "epoch": 3.31, "grad_norm": 1.09541799539132, "learning_rate": 2.1714687009613628e-06, "loss": 0.0325, "step": 516 }, { "epoch": 3.31, "grad_norm": 1.2592978524239982, "learning_rate": 2.156991095676971e-06, "loss": 0.038, "step": 517 }, { "epoch": 3.32, "grad_norm": 1.0543296286017256, "learning_rate": 2.1425403873830083e-06, "loss": 0.0346, "step": 518 }, { "epoch": 3.33, "grad_norm": 1.0001186604444399, "learning_rate": 2.1281168654897376e-06, "loss": 0.0208, "step": 519 }, { "epoch": 3.33, "grad_norm": 1.10411761212027, "learning_rate": 2.113720818862951e-06, "loss": 0.026, "step": 520 }, { "epoch": 3.34, "grad_norm": 1.076138981995205, "learning_rate": 2.099352535818182e-06, "loss": 0.0244, "step": 521 }, { "epoch": 3.35, "grad_norm": 1.0196811903638796, "learning_rate": 2.085012304114933e-06, "loss": 0.0296, "step": 522 }, { "epoch": 3.35, "grad_norm": 1.0314845239229538, "learning_rate": 2.070700410950906e-06, "loss": 0.0324, "step": 523 }, { "epoch": 3.36, "grad_norm": 1.5222487359020664, "learning_rate": 2.0564171429562587e-06, "loss": 0.029, "step": 524 }, { "epoch": 3.37, "grad_norm": 1.1000193388139592, "learning_rate": 2.042162786187862e-06, "loss": 0.0252, "step": 525 }, { "epoch": 3.37, "grad_norm": 1.0947526780154235, "learning_rate": 2.027937626123565e-06, "loss": 0.027, "step": 526 }, { "epoch": 3.38, "grad_norm": 1.1771718630654349, "learning_rate": 2.0137419476564896e-06, "loss": 0.0157, "step": 527 }, { "epoch": 3.38, "grad_norm": 0.9908516635250878, "learning_rate": 1.9995760350893098e-06, "loss": 0.0286, "step": 528 }, { "epoch": 3.39, "grad_norm": 0.9791274664173555, "learning_rate": 1.985440172128573e-06, "loss": 0.0243, "step": 529 }, { "epoch": 3.4, "grad_norm": 1.3203297232694446, "learning_rate": 1.9713346418790058e-06, "loss": 0.0261, "step": 530 }, { "epoch": 3.4, "grad_norm": 1.13911901666537, "learning_rate": 1.957259726837849e-06, "loss": 0.029, "step": 531 }, { "epoch": 3.41, "grad_norm": 1.048570931624032, "learning_rate": 1.9432157088892064e-06, "loss": 0.0374, "step": 532 }, { "epoch": 3.42, "grad_norm": 1.2103418865964828, "learning_rate": 1.9292028692983824e-06, "loss": 0.0306, "step": 533 }, { "epoch": 3.42, "grad_norm": 1.029334123069811, "learning_rate": 1.91522148870627e-06, "loss": 0.0217, "step": 534 }, { "epoch": 3.43, "grad_norm": 1.0198377079922467, "learning_rate": 1.9012718471237144e-06, "loss": 0.0251, "step": 535 }, { "epoch": 3.44, "grad_norm": 1.166818734002997, "learning_rate": 1.887354223925911e-06, "loss": 0.0215, "step": 536 }, { "epoch": 3.44, "grad_norm": 1.085856728273484, "learning_rate": 1.87346889784681e-06, "loss": 0.0241, "step": 537 }, { "epoch": 3.45, "grad_norm": 0.9340299089299968, "learning_rate": 1.8596161469735374e-06, "loss": 0.0222, "step": 538 }, { "epoch": 3.46, "grad_norm": 0.9582885459137026, "learning_rate": 1.8457962487408175e-06, "loss": 0.0306, "step": 539 }, { "epoch": 3.46, "grad_norm": 0.7818765763548071, "learning_rate": 1.8320094799254222e-06, "loss": 0.0108, "step": 540 }, { "epoch": 3.47, "grad_norm": 0.8674372954189106, "learning_rate": 1.8182561166406308e-06, "loss": 0.0173, "step": 541 }, { "epoch": 3.47, "grad_norm": 0.9225859827659534, "learning_rate": 1.8045364343306915e-06, "loss": 0.0189, "step": 542 }, { "epoch": 3.48, "grad_norm": 0.9050743041899307, "learning_rate": 1.7908507077653124e-06, "loss": 0.0185, "step": 543 }, { "epoch": 3.49, "grad_norm": 0.9749205541180587, "learning_rate": 1.7771992110341533e-06, "loss": 0.0219, "step": 544 }, { "epoch": 3.49, "grad_norm": 1.0069281642083787, "learning_rate": 1.7635822175413446e-06, "loss": 0.0231, "step": 545 }, { "epoch": 3.5, "grad_norm": 0.8029510509892319, "learning_rate": 1.7500000000000008e-06, "loss": 0.02, "step": 546 }, { "epoch": 3.51, "grad_norm": 1.0472669216241044, "learning_rate": 1.7364528304267646e-06, "loss": 0.0169, "step": 547 }, { "epoch": 3.51, "grad_norm": 0.9119333217497559, "learning_rate": 1.7229409801363635e-06, "loss": 0.0194, "step": 548 }, { "epoch": 3.52, "grad_norm": 1.0558846501029286, "learning_rate": 1.7094647197361656e-06, "loss": 0.018, "step": 549 }, { "epoch": 3.53, "grad_norm": 1.0384510779811111, "learning_rate": 1.6960243191207686e-06, "loss": 0.0182, "step": 550 }, { "epoch": 3.53, "grad_norm": 0.8341867375991961, "learning_rate": 1.6826200474665891e-06, "loss": 0.0204, "step": 551 }, { "epoch": 3.54, "grad_norm": 1.1531429797253752, "learning_rate": 1.669252173226479e-06, "loss": 0.0221, "step": 552 }, { "epoch": 3.54, "grad_norm": 1.046052155783273, "learning_rate": 1.6559209641243388e-06, "loss": 0.0251, "step": 553 }, { "epoch": 3.55, "grad_norm": 1.0056558783597644, "learning_rate": 1.642626687149765e-06, "loss": 0.0297, "step": 554 }, { "epoch": 3.56, "grad_norm": 1.0388548400576412, "learning_rate": 1.629369608552696e-06, "loss": 0.0207, "step": 555 }, { "epoch": 3.56, "grad_norm": 0.9676580813043666, "learning_rate": 1.6161499938380873e-06, "loss": 0.0207, "step": 556 }, { "epoch": 3.57, "grad_norm": 0.910615049954989, "learning_rate": 1.6029681077605864e-06, "loss": 0.0194, "step": 557 }, { "epoch": 3.58, "grad_norm": 0.8154656860131219, "learning_rate": 1.5898242143192336e-06, "loss": 0.0179, "step": 558 }, { "epoch": 3.58, "grad_norm": 0.8639595706943999, "learning_rate": 1.576718576752179e-06, "loss": 0.0156, "step": 559 }, { "epoch": 3.59, "grad_norm": 0.9189700127422249, "learning_rate": 1.5636514575314024e-06, "loss": 0.0192, "step": 560 }, { "epoch": 3.6, "grad_norm": 0.9330229070746087, "learning_rate": 1.550623118357463e-06, "loss": 0.0224, "step": 561 }, { "epoch": 3.6, "grad_norm": 0.9035600157428585, "learning_rate": 1.5376338201542538e-06, "loss": 0.0198, "step": 562 }, { "epoch": 3.61, "grad_norm": 0.8321348110175191, "learning_rate": 1.5246838230637831e-06, "loss": 0.0222, "step": 563 }, { "epoch": 3.62, "grad_norm": 0.8596805606431125, "learning_rate": 1.511773386440955e-06, "loss": 0.0195, "step": 564 }, { "epoch": 3.62, "grad_norm": 0.9537922409494741, "learning_rate": 1.4989027688483808e-06, "loss": 0.0205, "step": 565 }, { "epoch": 3.63, "grad_norm": 0.7806033695969874, "learning_rate": 1.4860722280512022e-06, "loss": 0.0168, "step": 566 }, { "epoch": 3.63, "grad_norm": 0.8391056339361249, "learning_rate": 1.473282021011924e-06, "loss": 0.0192, "step": 567 }, { "epoch": 3.64, "grad_norm": 0.9361363433592843, "learning_rate": 1.4605324038852707e-06, "loss": 0.0137, "step": 568 }, { "epoch": 3.65, "grad_norm": 0.9165200492322241, "learning_rate": 1.4478236320130554e-06, "loss": 0.0183, "step": 569 }, { "epoch": 3.65, "grad_norm": 1.0311350391531426, "learning_rate": 1.4351559599190708e-06, "loss": 0.0278, "step": 570 }, { "epoch": 3.66, "grad_norm": 1.002796327201144, "learning_rate": 1.4225296413039794e-06, "loss": 0.0207, "step": 571 }, { "epoch": 3.67, "grad_norm": 0.8663434564935976, "learning_rate": 1.4099449290402492e-06, "loss": 0.0145, "step": 572 }, { "epoch": 3.67, "grad_norm": 0.7829556851775933, "learning_rate": 1.3974020751670734e-06, "loss": 0.0119, "step": 573 }, { "epoch": 3.68, "grad_norm": 0.7838935562572859, "learning_rate": 1.3849013308853369e-06, "loss": 0.014, "step": 574 }, { "epoch": 3.69, "grad_norm": 0.8089531524154776, "learning_rate": 1.3724429465525733e-06, "loss": 0.0172, "step": 575 }, { "epoch": 3.69, "grad_norm": 1.0476096426575607, "learning_rate": 1.360027171677957e-06, "loss": 0.0159, "step": 576 }, { "epoch": 3.7, "grad_norm": 1.1492757887504752, "learning_rate": 1.3476542549173097e-06, "loss": 0.0509, "step": 577 }, { "epoch": 3.71, "grad_norm": 0.9034375118433278, "learning_rate": 1.335324444068108e-06, "loss": 0.0221, "step": 578 }, { "epoch": 3.71, "grad_norm": 0.8650183804828405, "learning_rate": 1.3230379860645363e-06, "loss": 0.0213, "step": 579 }, { "epoch": 3.72, "grad_norm": 1.0015608184240266, "learning_rate": 1.3107951269725286e-06, "loss": 0.018, "step": 580 }, { "epoch": 3.72, "grad_norm": 0.9368052136067349, "learning_rate": 1.2985961119848508e-06, "loss": 0.0197, "step": 581 }, { "epoch": 3.73, "grad_norm": 0.959349340544164, "learning_rate": 1.28644118541618e-06, "loss": 0.0201, "step": 582 }, { "epoch": 3.74, "grad_norm": 0.8723031906642617, "learning_rate": 1.2743305906982184e-06, "loss": 0.0186, "step": 583 }, { "epoch": 3.74, "grad_norm": 0.9478403521550213, "learning_rate": 1.2622645703748163e-06, "loss": 0.0194, "step": 584 }, { "epoch": 3.75, "grad_norm": 0.8494970290194015, "learning_rate": 1.2502433660971122e-06, "loss": 0.0186, "step": 585 }, { "epoch": 3.76, "grad_norm": 0.8551225612629588, "learning_rate": 1.2382672186187003e-06, "loss": 0.017, "step": 586 }, { "epoch": 3.76, "grad_norm": 0.8488361613538855, "learning_rate": 1.2263363677907975e-06, "loss": 0.0169, "step": 587 }, { "epoch": 3.77, "grad_norm": 0.9808510003867794, "learning_rate": 1.214451052557453e-06, "loss": 0.0221, "step": 588 }, { "epoch": 3.78, "grad_norm": 0.8108841674151576, "learning_rate": 1.202611510950747e-06, "loss": 0.0151, "step": 589 }, { "epoch": 3.78, "grad_norm": 0.8886038022214681, "learning_rate": 1.1908179800860415e-06, "loss": 0.0182, "step": 590 }, { "epoch": 3.79, "grad_norm": 0.9119958116007242, "learning_rate": 1.1790706961572176e-06, "loss": 0.0138, "step": 591 }, { "epoch": 3.79, "grad_norm": 0.7574159445793109, "learning_rate": 1.167369894431949e-06, "loss": 0.0162, "step": 592 }, { "epoch": 3.8, "grad_norm": 0.8830721642283024, "learning_rate": 1.1557158092469968e-06, "loss": 0.017, "step": 593 }, { "epoch": 3.81, "grad_norm": 0.8752065538129817, "learning_rate": 1.1441086740035036e-06, "loss": 0.0312, "step": 594 }, { "epoch": 3.81, "grad_norm": 0.7802871014880685, "learning_rate": 1.1325487211623343e-06, "loss": 0.014, "step": 595 }, { "epoch": 3.82, "grad_norm": 0.822063110348498, "learning_rate": 1.121036182239403e-06, "loss": 0.0109, "step": 596 }, { "epoch": 3.83, "grad_norm": 0.7316479690003538, "learning_rate": 1.1095712878010542e-06, "loss": 0.0104, "step": 597 }, { "epoch": 3.83, "grad_norm": 0.7093603836144392, "learning_rate": 1.0981542674594327e-06, "loss": 0.0184, "step": 598 }, { "epoch": 3.84, "grad_norm": 0.7794338741880872, "learning_rate": 1.08678534986789e-06, "loss": 0.0168, "step": 599 }, { "epoch": 3.85, "grad_norm": 1.0811588463022737, "learning_rate": 1.0754647627164022e-06, "loss": 0.0191, "step": 600 }, { "epoch": 3.85, "grad_norm": 0.9111404871044384, "learning_rate": 1.064192732727016e-06, "loss": 0.0189, "step": 601 }, { "epoch": 3.86, "grad_norm": 0.8869358514707676, "learning_rate": 1.0529694856493002e-06, "loss": 0.0265, "step": 602 }, { "epoch": 3.87, "grad_norm": 1.0300105674738138, "learning_rate": 1.0417952462558286e-06, "loss": 0.0174, "step": 603 }, { "epoch": 3.87, "grad_norm": 0.9154400665993732, "learning_rate": 1.0306702383376813e-06, "loss": 0.0216, "step": 604 }, { "epoch": 3.88, "grad_norm": 0.7790200245067616, "learning_rate": 1.0195946846999551e-06, "loss": 0.0207, "step": 605 }, { "epoch": 3.88, "grad_norm": 0.9245289976289252, "learning_rate": 1.0085688071573086e-06, "loss": 0.0157, "step": 606 }, { "epoch": 3.89, "grad_norm": 0.845440385987704, "learning_rate": 9.97592826529514e-07, "loss": 0.0168, "step": 607 }, { "epoch": 3.9, "grad_norm": 0.6828942627029853, "learning_rate": 9.866669626370412e-07, "loss": 0.0136, "step": 608 }, { "epoch": 3.9, "grad_norm": 0.7507599698976973, "learning_rate": 9.757914342966495e-07, "loss": 0.0159, "step": 609 }, { "epoch": 3.91, "grad_norm": 0.7608568860765801, "learning_rate": 9.649664593170062e-07, "loss": 0.0182, "step": 610 }, { "epoch": 3.92, "grad_norm": 0.6964025046289981, "learning_rate": 9.541922544943295e-07, "loss": 0.0143, "step": 611 }, { "epoch": 3.92, "grad_norm": 0.8040858287775848, "learning_rate": 9.434690356080394e-07, "loss": 0.0226, "step": 612 }, { "epoch": 3.93, "grad_norm": 0.8367198110469669, "learning_rate": 9.327970174164409e-07, "loss": 0.0146, "step": 613 }, { "epoch": 3.94, "grad_norm": 0.8193340863552715, "learning_rate": 9.221764136524202e-07, "loss": 0.0192, "step": 614 }, { "epoch": 3.94, "grad_norm": 0.7265415845223846, "learning_rate": 9.116074370191705e-07, "loss": 0.0142, "step": 615 }, { "epoch": 3.95, "grad_norm": 0.69231773481907, "learning_rate": 9.010902991859196e-07, "loss": 0.0127, "step": 616 }, { "epoch": 3.96, "grad_norm": 0.7678911416679844, "learning_rate": 8.906252107837054e-07, "loss": 0.0135, "step": 617 }, { "epoch": 3.96, "grad_norm": 0.9669858607123597, "learning_rate": 8.802123814011458e-07, "loss": 0.0203, "step": 618 }, { "epoch": 3.97, "grad_norm": 0.7067832443423266, "learning_rate": 8.698520195802499e-07, "loss": 0.0216, "step": 619 }, { "epoch": 3.97, "grad_norm": 0.9954743141268118, "learning_rate": 8.595443328122345e-07, "loss": 0.0185, "step": 620 }, { "epoch": 3.98, "grad_norm": 0.9698572023393062, "learning_rate": 8.492895275333705e-07, "loss": 0.0258, "step": 621 }, { "epoch": 3.99, "grad_norm": 0.8472663094138484, "learning_rate": 8.390878091208544e-07, "loss": 0.0175, "step": 622 }, { "epoch": 3.99, "grad_norm": 0.6974557493903148, "learning_rate": 8.289393818886837e-07, "loss": 0.0148, "step": 623 }, { "epoch": 4.0, "grad_norm": 0.8038533284087307, "learning_rate": 8.188444490835774e-07, "loss": 0.0214, "step": 624 }, { "epoch": 4.01, "grad_norm": 0.7210389335963133, "learning_rate": 8.088032128808952e-07, "loss": 0.0196, "step": 625 }, { "epoch": 4.01, "grad_norm": 0.7781631119204625, "learning_rate": 7.988158743805973e-07, "loss": 0.0127, "step": 626 }, { "epoch": 4.02, "grad_norm": 0.8340070618057464, "learning_rate": 7.888826336032093e-07, "loss": 0.0146, "step": 627 }, { "epoch": 4.03, "grad_norm": 0.8166490795788324, "learning_rate": 7.790036894858198e-07, "loss": 0.0145, "step": 628 }, { "epoch": 4.03, "grad_norm": 0.7195731647426649, "learning_rate": 7.691792398780962e-07, "loss": 0.0119, "step": 629 }, { "epoch": 4.04, "grad_norm": 0.7441257497162197, "learning_rate": 7.594094815383223e-07, "loss": 0.0125, "step": 630 }, { "epoch": 4.04, "grad_norm": 0.8162804472924826, "learning_rate": 7.496946101294585e-07, "loss": 0.0096, "step": 631 }, { "epoch": 4.05, "grad_norm": 0.7311628316123926, "learning_rate": 7.400348202152192e-07, "loss": 0.0191, "step": 632 }, { "epoch": 4.06, "grad_norm": 0.6708461799554153, "learning_rate": 7.304303052561841e-07, "loss": 0.0128, "step": 633 }, { "epoch": 4.06, "grad_norm": 0.6419666578612593, "learning_rate": 7.208812576059113e-07, "loss": 0.014, "step": 634 }, { "epoch": 4.07, "grad_norm": 0.571786193933189, "learning_rate": 7.113878685070994e-07, "loss": 0.0126, "step": 635 }, { "epoch": 4.08, "grad_norm": 0.47401578964273894, "learning_rate": 7.019503280877466e-07, "loss": 0.0089, "step": 636 }, { "epoch": 4.08, "grad_norm": 0.6164893786390164, "learning_rate": 6.925688253573465e-07, "loss": 0.0145, "step": 637 }, { "epoch": 4.09, "grad_norm": 0.5908416108258375, "learning_rate": 6.832435482031064e-07, "loss": 0.0102, "step": 638 }, { "epoch": 4.1, "grad_norm": 0.8989690237248916, "learning_rate": 6.73974683386176e-07, "loss": 0.0121, "step": 639 }, { "epoch": 4.1, "grad_norm": 0.5982510810482049, "learning_rate": 6.647624165379173e-07, "loss": 0.0103, "step": 640 }, { "epoch": 4.11, "grad_norm": 0.5954747425687978, "learning_rate": 6.55606932156175e-07, "loss": 0.0147, "step": 641 }, { "epoch": 4.12, "grad_norm": 0.5239184374248897, "learning_rate": 6.465084136015951e-07, "loss": 0.0119, "step": 642 }, { "epoch": 4.12, "grad_norm": 0.7241845644184443, "learning_rate": 6.374670430939404e-07, "loss": 0.0134, "step": 643 }, { "epoch": 4.13, "grad_norm": 0.6317510180810488, "learning_rate": 6.284830017084488e-07, "loss": 0.0103, "step": 644 }, { "epoch": 4.13, "grad_norm": 0.6913499172359965, "learning_rate": 6.195564693722027e-07, "loss": 0.0118, "step": 645 }, { "epoch": 4.14, "grad_norm": 0.5052384619401886, "learning_rate": 6.106876248605299e-07, "loss": 0.0113, "step": 646 }, { "epoch": 4.15, "grad_norm": 0.5731989344051904, "learning_rate": 6.018766457934177e-07, "loss": 0.0074, "step": 647 }, { "epoch": 4.15, "grad_norm": 0.6142999396736151, "learning_rate": 5.931237086319592e-07, "loss": 0.0084, "step": 648 }, { "epoch": 4.16, "grad_norm": 0.823064223506891, "learning_rate": 5.844289886748196e-07, "loss": 0.0105, "step": 649 }, { "epoch": 4.17, "grad_norm": 0.4979417009348811, "learning_rate": 5.757926600547231e-07, "loss": 0.0084, "step": 650 }, { "epoch": 4.17, "grad_norm": 0.5818651933396465, "learning_rate": 5.672148957349661e-07, "loss": 0.0122, "step": 651 }, { "epoch": 4.18, "grad_norm": 0.6507789337199134, "learning_rate": 5.586958675059548e-07, "loss": 0.0122, "step": 652 }, { "epoch": 4.19, "grad_norm": 0.6049124608829605, "learning_rate": 5.502357459817639e-07, "loss": 0.013, "step": 653 }, { "epoch": 4.19, "grad_norm": 0.7753742285832561, "learning_rate": 5.418347005967189e-07, "loss": 0.0091, "step": 654 }, { "epoch": 4.2, "grad_norm": 0.7274448172473815, "learning_rate": 5.334928996020013e-07, "loss": 0.0087, "step": 655 }, { "epoch": 4.21, "grad_norm": 0.6502094548231069, "learning_rate": 5.252105100622848e-07, "loss": 0.0106, "step": 656 }, { "epoch": 4.21, "grad_norm": 0.40178486721608847, "learning_rate": 5.169876978523828e-07, "loss": 0.007, "step": 657 }, { "epoch": 4.22, "grad_norm": 0.659442151178035, "learning_rate": 5.088246276539292e-07, "loss": 0.0113, "step": 658 }, { "epoch": 4.22, "grad_norm": 0.46880646234385925, "learning_rate": 5.0072146295208e-07, "loss": 0.0073, "step": 659 }, { "epoch": 4.23, "grad_norm": 0.7953540459085934, "learning_rate": 4.926783660322411e-07, "loss": 0.0085, "step": 660 }, { "epoch": 4.24, "grad_norm": 0.48150007551722795, "learning_rate": 4.846954979768149e-07, "loss": 0.0057, "step": 661 }, { "epoch": 4.24, "grad_norm": 0.6684968423071089, "learning_rate": 4.7677301866197455e-07, "loss": 0.0072, "step": 662 }, { "epoch": 4.25, "grad_norm": 0.6745759023213915, "learning_rate": 4.6891108675446453e-07, "loss": 0.0076, "step": 663 }, { "epoch": 4.26, "grad_norm": 0.43582709520930096, "learning_rate": 4.611098597084226e-07, "loss": 0.0074, "step": 664 }, { "epoch": 4.26, "grad_norm": 0.5144595833892877, "learning_rate": 4.533694937622227e-07, "loss": 0.0088, "step": 665 }, { "epoch": 4.27, "grad_norm": 0.46455722697398655, "learning_rate": 4.456901439353499e-07, "loss": 0.0089, "step": 666 }, { "epoch": 4.28, "grad_norm": 0.38039382254115717, "learning_rate": 4.3807196402529535e-07, "loss": 0.0059, "step": 667 }, { "epoch": 4.28, "grad_norm": 0.5159499420296817, "learning_rate": 4.3051510660447336e-07, "loss": 0.0087, "step": 668 }, { "epoch": 4.29, "grad_norm": 0.7569018705460623, "learning_rate": 4.2301972301716934e-07, "loss": 0.0108, "step": 669 }, { "epoch": 4.29, "grad_norm": 0.3133032924877106, "learning_rate": 4.155859633765044e-07, "loss": 0.0067, "step": 670 }, { "epoch": 4.3, "grad_norm": 0.44083154398560964, "learning_rate": 4.0821397656143503e-07, "loss": 0.0054, "step": 671 }, { "epoch": 4.31, "grad_norm": 0.4568048829370673, "learning_rate": 4.009039102137657e-07, "loss": 0.0077, "step": 672 }, { "epoch": 4.31, "grad_norm": 0.3862359262132665, "learning_rate": 3.9365591073519387e-07, "loss": 0.0082, "step": 673 }, { "epoch": 4.32, "grad_norm": 0.41096373612079085, "learning_rate": 3.8647012328438085e-07, "loss": 0.0103, "step": 674 }, { "epoch": 4.33, "grad_norm": 0.4363393359078843, "learning_rate": 3.793466917740402e-07, "loss": 0.0049, "step": 675 }, { "epoch": 4.33, "grad_norm": 0.5352286390397114, "learning_rate": 3.7228575886805744e-07, "loss": 0.007, "step": 676 }, { "epoch": 4.34, "grad_norm": 0.4879159236521443, "learning_rate": 3.6528746597863283e-07, "loss": 0.0061, "step": 677 }, { "epoch": 4.35, "grad_norm": 0.4921806354878511, "learning_rate": 3.583519532634516e-07, "loss": 0.0085, "step": 678 }, { "epoch": 4.35, "grad_norm": 0.45260045929318704, "learning_rate": 3.514793596228702e-07, "loss": 0.0096, "step": 679 }, { "epoch": 4.36, "grad_norm": 0.5256395174007804, "learning_rate": 3.44669822697144e-07, "loss": 0.0078, "step": 680 }, { "epoch": 4.37, "grad_norm": 0.47644883575785074, "learning_rate": 3.3792347886366265e-07, "loss": 0.0057, "step": 681 }, { "epoch": 4.37, "grad_norm": 0.48359867196539846, "learning_rate": 3.31240463234221e-07, "loss": 0.0077, "step": 682 }, { "epoch": 4.38, "grad_norm": 0.4273121166226652, "learning_rate": 3.2462090965231767e-07, "loss": 0.0025, "step": 683 }, { "epoch": 4.38, "grad_norm": 0.37391829672343596, "learning_rate": 3.180649506904667e-07, "loss": 0.0089, "step": 684 }, { "epoch": 4.39, "grad_norm": 0.29650839546439783, "learning_rate": 3.1157271764755085e-07, "loss": 0.006, "step": 685 }, { "epoch": 4.4, "grad_norm": 0.4198572707034165, "learning_rate": 3.0514434054618216e-07, "loss": 0.0055, "step": 686 }, { "epoch": 4.4, "grad_norm": 0.6603558563893405, "learning_rate": 2.987799481301091e-07, "loss": 0.0088, "step": 687 }, { "epoch": 4.41, "grad_norm": 0.4060296688138882, "learning_rate": 2.924796678616297e-07, "loss": 0.0113, "step": 688 }, { "epoch": 4.42, "grad_norm": 0.39194103781295037, "learning_rate": 2.862436259190414e-07, "loss": 0.0073, "step": 689 }, { "epoch": 4.42, "grad_norm": 0.4346352843060614, "learning_rate": 2.800719471941152e-07, "loss": 0.0059, "step": 690 }, { "epoch": 4.43, "grad_norm": 0.3183987621587136, "learning_rate": 2.739647552895949e-07, "loss": 0.0069, "step": 691 }, { "epoch": 4.44, "grad_norm": 0.4423423752806832, "learning_rate": 2.6792217251671744e-07, "loss": 0.0059, "step": 692 }, { "epoch": 4.44, "grad_norm": 0.3864494243096827, "learning_rate": 2.619443198927677e-07, "loss": 0.006, "step": 693 }, { "epoch": 4.45, "grad_norm": 0.322457016979029, "learning_rate": 2.5603131713865374e-07, "loss": 0.0069, "step": 694 }, { "epoch": 4.46, "grad_norm": 0.4345066616936449, "learning_rate": 2.50183282676508e-07, "loss": 0.0091, "step": 695 }, { "epoch": 4.46, "grad_norm": 0.34388210592322604, "learning_rate": 2.444003336273163e-07, "loss": 0.0032, "step": 696 }, { "epoch": 4.47, "grad_norm": 0.2991066001834922, "learning_rate": 2.3868258580857164e-07, "loss": 0.0046, "step": 697 }, { "epoch": 4.47, "grad_norm": 0.2816700072013969, "learning_rate": 2.3303015373195713e-07, "loss": 0.0054, "step": 698 }, { "epoch": 4.48, "grad_norm": 0.2947482752142615, "learning_rate": 2.2744315060104846e-07, "loss": 0.0047, "step": 699 }, { "epoch": 4.49, "grad_norm": 0.3450546396283738, "learning_rate": 2.2192168830904963e-07, "loss": 0.006, "step": 700 }, { "epoch": 4.49, "grad_norm": 0.3822670216774389, "learning_rate": 2.1646587743655287e-07, "loss": 0.0062, "step": 701 }, { "epoch": 4.5, "grad_norm": 0.36389056812509235, "learning_rate": 2.1107582724932088e-07, "loss": 0.0074, "step": 702 }, { "epoch": 4.51, "grad_norm": 0.2241853060874336, "learning_rate": 2.0575164569610016e-07, "loss": 0.0031, "step": 703 }, { "epoch": 4.51, "grad_norm": 0.3284239718422547, "learning_rate": 2.0049343940645935e-07, "loss": 0.006, "step": 704 }, { "epoch": 4.52, "grad_norm": 0.27416640208522525, "learning_rate": 1.953013136886541e-07, "loss": 0.0043, "step": 705 }, { "epoch": 4.53, "grad_norm": 0.48892924275252236, "learning_rate": 1.901753725275166e-07, "loss": 0.0054, "step": 706 }, { "epoch": 4.53, "grad_norm": 0.3072478855542186, "learning_rate": 1.8511571858237357e-07, "loss": 0.0073, "step": 707 }, { "epoch": 4.54, "grad_norm": 0.2971096027391294, "learning_rate": 1.801224531849908e-07, "loss": 0.0045, "step": 708 }, { "epoch": 4.54, "grad_norm": 0.41409498453561594, "learning_rate": 1.7519567633754352e-07, "loss": 0.0072, "step": 709 }, { "epoch": 4.55, "grad_norm": 0.4406136254485443, "learning_rate": 1.70335486710614e-07, "loss": 0.0103, "step": 710 }, { "epoch": 4.56, "grad_norm": 0.42158029678243275, "learning_rate": 1.6554198164121265e-07, "loss": 0.0052, "step": 711 }, { "epoch": 4.56, "grad_norm": 0.4431835034360088, "learning_rate": 1.6081525713083428e-07, "loss": 0.0065, "step": 712 }, { "epoch": 4.57, "grad_norm": 0.3242131925310783, "learning_rate": 1.561554078435296e-07, "loss": 0.0049, "step": 713 }, { "epoch": 4.58, "grad_norm": 0.30171217982310045, "learning_rate": 1.5156252710401207e-07, "loss": 0.0055, "step": 714 }, { "epoch": 4.58, "grad_norm": 0.310486222626081, "learning_rate": 1.4703670689578884e-07, "loss": 0.0058, "step": 715 }, { "epoch": 4.59, "grad_norm": 0.3339175632499386, "learning_rate": 1.4257803785931926e-07, "loss": 0.0068, "step": 716 }, { "epoch": 4.6, "grad_norm": 0.3042650666913759, "learning_rate": 1.3818660929019717e-07, "loss": 0.0071, "step": 717 }, { "epoch": 4.6, "grad_norm": 0.41181026400823106, "learning_rate": 1.3386250913736408e-07, "loss": 0.0075, "step": 718 }, { "epoch": 4.61, "grad_norm": 0.33075765366170357, "learning_rate": 1.296058240013491e-07, "loss": 0.0075, "step": 719 }, { "epoch": 4.62, "grad_norm": 0.313752325847203, "learning_rate": 1.2541663913253191e-07, "loss": 0.0059, "step": 720 }, { "epoch": 4.62, "grad_norm": 0.3825929288696057, "learning_rate": 1.2129503842943645e-07, "loss": 0.0067, "step": 721 }, { "epoch": 4.63, "grad_norm": 0.2587277820133266, "learning_rate": 1.1724110443705115e-07, "loss": 0.0047, "step": 722 }, { "epoch": 4.63, "grad_norm": 0.2844582633390916, "learning_rate": 1.1325491834517676e-07, "loss": 0.0059, "step": 723 }, { "epoch": 4.64, "grad_norm": 0.3590012407056503, "learning_rate": 1.0933655998679653e-07, "loss": 0.0035, "step": 724 }, { "epoch": 4.65, "grad_norm": 0.29637783616327534, "learning_rate": 1.0548610783648199e-07, "loss": 0.0052, "step": 725 }, { "epoch": 4.65, "grad_norm": 0.468740471256091, "learning_rate": 1.0170363900881795e-07, "loss": 0.0089, "step": 726 }, { "epoch": 4.66, "grad_norm": 0.3818989755993541, "learning_rate": 9.798922925685994e-08, "loss": 0.0066, "step": 727 }, { "epoch": 4.67, "grad_norm": 0.19662730812801094, "learning_rate": 9.434295297061668e-08, "loss": 0.0029, "step": 728 }, { "epoch": 4.67, "grad_norm": 0.4179664781848645, "learning_rate": 9.076488317555886e-08, "loss": 0.0041, "step": 729 }, { "epoch": 4.68, "grad_norm": 0.3006062367530363, "learning_rate": 8.725509153115918e-08, "loss": 0.0053, "step": 730 }, { "epoch": 4.69, "grad_norm": 0.3304263245245509, "learning_rate": 8.38136483294546e-08, "loss": 0.0063, "step": 731 }, { "epoch": 4.69, "grad_norm": 0.33457641464311794, "learning_rate": 8.044062249364048e-08, "loss": 0.0042, "step": 732 }, { "epoch": 4.7, "grad_norm": 0.5879062825513777, "learning_rate": 7.713608157668921e-08, "loss": 0.0215, "step": 733 }, { "epoch": 4.71, "grad_norm": 0.3577419876216768, "learning_rate": 7.390009175999835e-08, "loss": 0.0082, "step": 734 }, { "epoch": 4.71, "grad_norm": 0.3770406600422441, "learning_rate": 7.073271785206314e-08, "loss": 0.0079, "step": 735 }, { "epoch": 4.72, "grad_norm": 0.33565852275052394, "learning_rate": 6.763402328718116e-08, "loss": 0.0062, "step": 736 }, { "epoch": 4.72, "grad_norm": 0.3033892045381189, "learning_rate": 6.460407012417918e-08, "loss": 0.0053, "step": 737 }, { "epoch": 4.73, "grad_norm": 0.30091494834673244, "learning_rate": 6.164291904517333e-08, "loss": 0.0054, "step": 738 }, { "epoch": 4.74, "grad_norm": 0.3765855319872929, "learning_rate": 5.875062935435121e-08, "loss": 0.0054, "step": 739 }, { "epoch": 4.74, "grad_norm": 0.32640485923404255, "learning_rate": 5.592725897678446e-08, "loss": 0.0078, "step": 740 }, { "epoch": 4.75, "grad_norm": 0.43041557447303824, "learning_rate": 5.3172864457271926e-08, "loss": 0.0073, "step": 741 }, { "epoch": 4.76, "grad_norm": 0.3145742514405517, "learning_rate": 5.048750095920151e-08, "loss": 0.0063, "step": 742 }, { "epoch": 4.76, "grad_norm": 0.298005394632711, "learning_rate": 4.787122226345014e-08, "loss": 0.0065, "step": 743 }, { "epoch": 4.77, "grad_norm": 0.3474563462151658, "learning_rate": 4.532408076730504e-08, "loss": 0.008, "step": 744 }, { "epoch": 4.78, "grad_norm": 0.29691793848547526, "learning_rate": 4.2846127483414206e-08, "loss": 0.005, "step": 745 }, { "epoch": 4.78, "grad_norm": 0.47996246825010264, "learning_rate": 4.043741203876483e-08, "loss": 0.0065, "step": 746 }, { "epoch": 4.79, "grad_norm": 0.2723339869975387, "learning_rate": 3.80979826736893e-08, "loss": 0.0038, "step": 747 }, { "epoch": 4.79, "grad_norm": 0.3193070254481293, "learning_rate": 3.58278862409e-08, "loss": 0.0064, "step": 748 }, { "epoch": 4.8, "grad_norm": 0.3658645249659323, "learning_rate": 3.3627168204549306e-08, "loss": 0.0063, "step": 749 }, { "epoch": 4.81, "grad_norm": 0.4539998378845374, "learning_rate": 3.1495872639320357e-08, "loss": 0.0146, "step": 750 }, { "epoch": 4.81, "grad_norm": 0.33061360188059097, "learning_rate": 2.9434042229544543e-08, "loss": 0.0056, "step": 751 }, { "epoch": 4.82, "grad_norm": 0.37571330796736685, "learning_rate": 2.7441718268344737e-08, "loss": 0.0042, "step": 752 }, { "epoch": 4.83, "grad_norm": 0.24563965048779188, "learning_rate": 2.5518940656811095e-08, "loss": 0.0035, "step": 753 }, { "epoch": 4.83, "grad_norm": 0.39737175448371265, "learning_rate": 2.3665747903199418e-08, "loss": 0.008, "step": 754 }, { "epoch": 4.84, "grad_norm": 0.36816172196985625, "learning_rate": 2.1882177122162173e-08, "loss": 0.0069, "step": 755 }, { "epoch": 4.85, "grad_norm": 0.32357455960281883, "learning_rate": 2.0168264034002404e-08, "loss": 0.0072, "step": 756 }, { "epoch": 4.85, "grad_norm": 0.327715642412912, "learning_rate": 1.8524042963961095e-08, "loss": 0.0068, "step": 757 }, { "epoch": 4.86, "grad_norm": 0.6097749311012022, "learning_rate": 1.6949546841528607e-08, "loss": 0.013, "step": 758 }, { "epoch": 4.87, "grad_norm": 0.4321595864495073, "learning_rate": 1.544480719978447e-08, "loss": 0.0091, "step": 759 }, { "epoch": 4.87, "grad_norm": 0.522892152816329, "learning_rate": 1.4009854174767521e-08, "loss": 0.0095, "step": 760 }, { "epoch": 4.88, "grad_norm": 0.43910025769757355, "learning_rate": 1.2644716504870091e-08, "loss": 0.0092, "step": 761 }, { "epoch": 4.88, "grad_norm": 0.3819688600545869, "learning_rate": 1.1349421530265246e-08, "loss": 0.0057, "step": 762 }, { "epoch": 4.89, "grad_norm": 0.34216774949023093, "learning_rate": 1.0123995192356183e-08, "loss": 0.0061, "step": 763 }, { "epoch": 4.9, "grad_norm": 0.32918386903466523, "learning_rate": 8.968462033259405e-09, "loss": 0.0056, "step": 764 }, { "epoch": 4.9, "grad_norm": 0.32782865250712573, "learning_rate": 7.882845195312016e-09, "loss": 0.0069, "step": 765 }, { "epoch": 4.91, "grad_norm": 0.32386570553933425, "learning_rate": 6.8671664206073625e-09, "loss": 0.0085, "step": 766 }, { "epoch": 4.92, "grad_norm": 0.29040869309413037, "learning_rate": 5.921446050561386e-09, "loss": 0.0065, "step": 767 }, { "epoch": 4.92, "grad_norm": 0.39088721124754344, "learning_rate": 5.0457030255038334e-09, "loss": 0.0104, "step": 768 }, { "epoch": 4.93, "grad_norm": 0.35735404978476537, "learning_rate": 4.239954884299401e-09, "loss": 0.0053, "step": 769 }, { "epoch": 4.94, "grad_norm": 0.3435391858174354, "learning_rate": 3.5042177639972304e-09, "loss": 0.0091, "step": 770 }, { "epoch": 4.94, "grad_norm": 0.30805175946860186, "learning_rate": 2.838506399506446e-09, "loss": 0.006, "step": 771 }, { "epoch": 4.95, "grad_norm": 0.31450891017452326, "learning_rate": 2.2428341233012294e-09, "loss": 0.0052, "step": 772 }, { "epoch": 4.96, "grad_norm": 0.29550957672641986, "learning_rate": 1.7172128651554152e-09, "loss": 0.0053, "step": 773 }, { "epoch": 4.96, "grad_norm": 0.38035918617041065, "learning_rate": 1.2616531519011874e-09, "loss": 0.0082, "step": 774 }, { "epoch": 4.97, "grad_norm": 0.3753382938936097, "learning_rate": 8.761641072196346e-10, "loss": 0.0117, "step": 775 }, { "epoch": 4.97, "grad_norm": 0.34899017006560656, "learning_rate": 5.607534514585066e-10, "loss": 0.009, "step": 776 }, { "epoch": 4.98, "grad_norm": 0.4164441780723029, "learning_rate": 3.1542750147639517e-10, "loss": 0.0117, "step": 777 }, { "epoch": 4.99, "grad_norm": 0.34196331948745795, "learning_rate": 1.401911705168346e-10, "loss": 0.0072, "step": 778 }, { "epoch": 4.99, "grad_norm": 0.30827744639368837, "learning_rate": 3.5047968109214176e-11, "loss": 0.0079, "step": 779 }, { "epoch": 5.0, "grad_norm": 0.3980736767828406, "learning_rate": 0.0, "loss": 0.0105, "step": 780 }, { "epoch": 5.0, "step": 780, "total_flos": 0.0, "train_loss": 0.17049677305622027, "train_runtime": 2176.8836, "train_samples_per_second": 11.484, "train_steps_per_second": 0.358 } ], "logging_steps": 1.0, "max_steps": 780, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }