{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998741980123286, "eval_steps": 100, "global_step": 3974, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.3828125, "learning_rate": 1.2562814070351758e-09, "logits/chosen": 0.2628047466278076, "logits/rejected": 0.7914568185806274, "logps/chosen": -183.46725463867188, "logps/rejected": -164.62379455566406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.341796875, "learning_rate": 1.2562814070351759e-08, "logits/chosen": 0.2203846573829651, "logits/rejected": 0.38432157039642334, "logps/chosen": -209.1444549560547, "logps/rejected": -223.59898376464844, "loss": 0.6932, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.0006632560398429632, "rewards/margins": -0.0002014244528254494, "rewards/margins_max": 0.001730198273435235, "rewards/margins_min": -0.0021330472081899643, "rewards/margins_std": 0.002731727436184883, "rewards/rejected": -0.0004618316306732595, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.384765625, "learning_rate": 2.5125628140703518e-08, "logits/chosen": 0.10513466596603394, "logits/rejected": 0.49032872915267944, "logps/chosen": -212.03952026367188, "logps/rejected": -206.0270538330078, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0004767618083860725, "rewards/margins": 0.0005016979994252324, "rewards/margins_max": 0.002849506214261055, "rewards/margins_min": -0.0018461104482412338, "rewards/margins_std": 0.0033203023485839367, "rewards/rejected": -0.0009784598369151354, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.4609375, "learning_rate": 3.7688442211055274e-08, "logits/chosen": 0.18849435448646545, "logits/rejected": 0.577508270740509, "logps/chosen": -234.439453125, "logps/rejected": -218.8050994873047, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008481988916173577, "rewards/margins": 0.00045646479702554643, "rewards/margins_max": 0.0036784582771360874, "rewards/margins_min": -0.00276552839204669, "rewards/margins_std": 0.004556586500257254, "rewards/rejected": -0.001304663484916091, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.40234375, "learning_rate": 5.0251256281407036e-08, "logits/chosen": 0.06399735063314438, "logits/rejected": 0.3126963973045349, "logps/chosen": -229.8418731689453, "logps/rejected": -212.98974609375, "loss": 0.693, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0007826805231161416, "rewards/margins": 5.8324822020949796e-05, "rewards/margins_max": 0.003414541482925415, "rewards/margins_min": -0.0032978919334709644, "rewards/margins_std": 0.004746406339108944, "rewards/rejected": -0.0008410053560510278, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.38671875, "learning_rate": 6.281407035175879e-08, "logits/chosen": 0.24704048037528992, "logits/rejected": 0.32996249198913574, "logps/chosen": -208.44418334960938, "logps/rejected": -244.45327758789062, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.00014318754256237298, "rewards/margins": 0.00012233357119839638, "rewards/margins_max": 0.002655323129147291, "rewards/margins_min": -0.002410655841231346, "rewards/margins_std": 0.003582187695428729, "rewards/rejected": -0.00026552105555310845, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.43359375, "learning_rate": 7.537688442211055e-08, "logits/chosen": 0.1713772714138031, "logits/rejected": 0.5069630146026611, "logps/chosen": -227.8589324951172, "logps/rejected": -224.32861328125, "loss": 0.6926, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 3.094528801739216e-05, "rewards/margins": 0.0013918608892709017, "rewards/margins_max": 0.0035881102085113525, "rewards/margins_min": -0.0008043880807235837, "rewards/margins_std": 0.0031059649772942066, "rewards/rejected": -0.0013609156012535095, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.423828125, "learning_rate": 8.793969849246232e-08, "logits/chosen": 0.16970382630825043, "logits/rejected": 0.3980051279067993, "logps/chosen": -211.1843719482422, "logps/rejected": -210.8779754638672, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00044937609345652163, "rewards/margins": 0.00044458196498453617, "rewards/margins_max": 0.002646425273269415, "rewards/margins_min": -0.001757261110469699, "rewards/margins_std": 0.003113876562565565, "rewards/rejected": -0.0008939580875448883, "step": 70 }, { "epoch": 0.02, "grad_norm": 0.427734375, "learning_rate": 1.0050251256281407e-07, "logits/chosen": 0.11841484159231186, "logits/rejected": 0.36562803387641907, "logps/chosen": -212.44210815429688, "logps/rejected": -220.71353149414062, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0003548894892446697, "rewards/margins": 0.0006629957933910191, "rewards/margins_max": 0.003219243139028549, "rewards/margins_min": -0.0018932514358311892, "rewards/margins_std": 0.0036150794476270676, "rewards/rejected": -0.0010178850498050451, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.361328125, "learning_rate": 1.1306532663316583e-07, "logits/chosen": 0.06404288858175278, "logits/rejected": 0.43069329857826233, "logps/chosen": -222.18508911132812, "logps/rejected": -201.24208068847656, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00048076437087729573, "rewards/margins": 0.0008583302842453122, "rewards/margins_max": 0.0031086846720427275, "rewards/margins_min": -0.001392024103552103, "rewards/margins_std": 0.0031824815087020397, "rewards/rejected": -0.0013390944804996252, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.384765625, "learning_rate": 1.2562814070351758e-07, "logits/chosen": 0.13037420809268951, "logits/rejected": 0.397468239068985, "logps/chosen": -195.5172882080078, "logps/rejected": -211.7163543701172, "loss": 0.6926, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0003117876185569912, "rewards/margins": 0.00045311544090509415, "rewards/margins_max": 0.0027989645022898912, "rewards/margins_min": -0.001892733620479703, "rewards/margins_std": 0.0033175316639244556, "rewards/rejected": -0.0007649030303582549, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.40234375, "learning_rate": 1.3819095477386933e-07, "logits/chosen": 0.10546330362558365, "logits/rejected": 0.3221648633480072, "logps/chosen": -205.634033203125, "logps/rejected": -220.6438751220703, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00017706633661873639, "rewards/margins": 0.0008145031752064824, "rewards/margins_max": 0.0027033700607717037, "rewards/margins_min": -0.001074363710358739, "rewards/margins_std": 0.0026712610851973295, "rewards/rejected": -0.0009915695991367102, "step": 110 }, { "epoch": 0.03, "grad_norm": 0.3984375, "learning_rate": 1.507537688442211e-07, "logits/chosen": 0.15270324051380157, "logits/rejected": 0.6443465948104858, "logps/chosen": -218.09036254882812, "logps/rejected": -197.12924194335938, "loss": 0.6926, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.948692988662515e-05, "rewards/margins": 0.0013199648819863796, "rewards/margins_max": 0.0033932540100067854, "rewards/margins_min": -0.0007533244788646698, "rewards/margins_std": 0.0029320737812668085, "rewards/rejected": -0.0013004777720198035, "step": 120 }, { "epoch": 0.03, "grad_norm": 0.4140625, "learning_rate": 1.6331658291457286e-07, "logits/chosen": 0.05510733649134636, "logits/rejected": 0.503358006477356, "logps/chosen": -231.0702667236328, "logps/rejected": -220.4000701904297, "loss": 0.692, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.000123643854749389, "rewards/margins": 0.0013444090727716684, "rewards/margins_max": 0.00397081533446908, "rewards/margins_min": -0.0012819975381717086, "rewards/margins_std": 0.0037142992950975895, "rewards/rejected": -0.0014680528547614813, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.328125, "learning_rate": 1.7587939698492463e-07, "logits/chosen": 0.12965205311775208, "logits/rejected": 0.4733223021030426, "logps/chosen": -220.08065795898438, "logps/rejected": -227.9157257080078, "loss": 0.6918, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0006399345584213734, "rewards/margins": 0.002478922950103879, "rewards/margins_max": 0.004973983857780695, "rewards/margins_min": -1.6138097635121085e-05, "rewards/margins_std": 0.0035285488702356815, "rewards/rejected": -0.0018389882752671838, "step": 140 }, { "epoch": 0.04, "grad_norm": 0.361328125, "learning_rate": 1.8844221105527638e-07, "logits/chosen": 0.21623842418193817, "logits/rejected": 0.5844063758850098, "logps/chosen": -225.3024139404297, "logps/rejected": -217.4437713623047, "loss": 0.6916, "rewards/accuracies": 0.8125, "rewards/chosen": 0.00041880743810907006, "rewards/margins": 0.002954686526209116, "rewards/margins_max": 0.005462953355163336, "rewards/margins_min": 0.00044641969725489616, "rewards/margins_std": 0.0035472246818244457, "rewards/rejected": -0.002535879146307707, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.37890625, "learning_rate": 2.0100502512562815e-07, "logits/chosen": -0.0007796346908435225, "logits/rejected": 0.3263722062110901, "logps/chosen": -224.91897583007812, "logps/rejected": -222.8339385986328, "loss": 0.6913, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0014154270756989717, "rewards/margins": 0.0035228331107646227, "rewards/margins_max": 0.0065108961425721645, "rewards/margins_min": 0.0005347708938643336, "rewards/margins_std": 0.004225758835673332, "rewards/rejected": -0.002107406035065651, "step": 160 }, { "epoch": 0.04, "grad_norm": 0.4375, "learning_rate": 2.135678391959799e-07, "logits/chosen": 0.17786478996276855, "logits/rejected": 0.4296552538871765, "logps/chosen": -211.2811737060547, "logps/rejected": -215.7751007080078, "loss": 0.691, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.0017340866615995765, "rewards/margins": 0.004576126113533974, "rewards/margins_max": 0.007594869937747717, "rewards/margins_min": 0.0015573825221508741, "rewards/margins_std": 0.004269148223102093, "rewards/rejected": -0.0028420393355190754, "step": 170 }, { "epoch": 0.05, "grad_norm": 0.349609375, "learning_rate": 2.2613065326633166e-07, "logits/chosen": 0.1166311651468277, "logits/rejected": 0.3715800344944, "logps/chosen": -196.72335815429688, "logps/rejected": -200.13356018066406, "loss": 0.6907, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.002559956628829241, "rewards/margins": 0.005222607403993607, "rewards/margins_max": 0.008429496549069881, "rewards/margins_min": 0.0020157184917479753, "rewards/margins_std": 0.00453522615134716, "rewards/rejected": -0.0026626510079950094, "step": 180 }, { "epoch": 0.05, "grad_norm": 0.34765625, "learning_rate": 2.386934673366834e-07, "logits/chosen": 0.06572423875331879, "logits/rejected": 0.36690762639045715, "logps/chosen": -229.5546875, "logps/rejected": -229.4326171875, "loss": 0.6904, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.00023253441031556576, "rewards/margins": 0.005891885608434677, "rewards/margins_max": 0.009354737587273121, "rewards/margins_min": 0.002429033163934946, "rewards/margins_std": 0.0048972126096487045, "rewards/rejected": -0.005659351125359535, "step": 190 }, { "epoch": 0.05, "grad_norm": 0.388671875, "learning_rate": 2.5125628140703517e-07, "logits/chosen": 0.016662485897541046, "logits/rejected": 0.5593847632408142, "logps/chosen": -223.0364990234375, "logps/rejected": -203.10812377929688, "loss": 0.69, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.002444048412144184, "rewards/margins": 0.006343179848045111, "rewards/margins_max": 0.009832089766860008, "rewards/margins_min": 0.0028542689979076385, "rewards/margins_std": 0.004934064578264952, "rewards/rejected": -0.0038991314359009266, "step": 200 }, { "epoch": 0.05, "grad_norm": 0.384765625, "learning_rate": 2.638190954773869e-07, "logits/chosen": 0.20249362289905548, "logits/rejected": 0.42898306250572205, "logps/chosen": -187.63648986816406, "logps/rejected": -216.4218292236328, "loss": 0.6899, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.00258094584569335, "rewards/margins": 0.006774174980819225, "rewards/margins_max": 0.01055647898465395, "rewards/margins_min": 0.0029918726067990065, "rewards/margins_std": 0.005348983686417341, "rewards/rejected": -0.0041932291351258755, "step": 210 }, { "epoch": 0.06, "grad_norm": 0.39453125, "learning_rate": 2.7638190954773865e-07, "logits/chosen": 0.11423423141241074, "logits/rejected": 0.49864324927330017, "logps/chosen": -226.0482177734375, "logps/rejected": -233.07608032226562, "loss": 0.6894, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.004007662646472454, "rewards/margins": 0.008532622829079628, "rewards/margins_max": 0.012825270183384418, "rewards/margins_min": 0.00423997500911355, "rewards/margins_std": 0.0060707200318574905, "rewards/rejected": -0.004524959716945887, "step": 220 }, { "epoch": 0.06, "grad_norm": 0.43359375, "learning_rate": 2.889447236180904e-07, "logits/chosen": 0.18874336779117584, "logits/rejected": 0.4362645745277405, "logps/chosen": -177.40353393554688, "logps/rejected": -181.6932373046875, "loss": 0.6895, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.001044024946168065, "rewards/margins": 0.0068617770448327065, "rewards/margins_max": 0.010246103629469872, "rewards/margins_min": 0.003477449994534254, "rewards/margins_std": 0.00478616077452898, "rewards/rejected": -0.005817751865833998, "step": 230 }, { "epoch": 0.06, "grad_norm": 0.38671875, "learning_rate": 3.015075376884422e-07, "logits/chosen": 0.12174437195062637, "logits/rejected": 0.5348376035690308, "logps/chosen": -227.04061889648438, "logps/rejected": -216.23678588867188, "loss": 0.6887, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.0027362615801393986, "rewards/margins": 0.008490202017128468, "rewards/margins_max": 0.012433262541890144, "rewards/margins_min": 0.004547140095382929, "rewards/margins_std": 0.005576332099735737, "rewards/rejected": -0.005753940436989069, "step": 240 }, { "epoch": 0.06, "grad_norm": 0.392578125, "learning_rate": 3.14070351758794e-07, "logits/chosen": 0.13310304284095764, "logits/rejected": 0.5597547888755798, "logps/chosen": -218.4144744873047, "logps/rejected": -211.48135375976562, "loss": 0.689, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.002688343869522214, "rewards/margins": 0.00865443330258131, "rewards/margins_max": 0.01382468082010746, "rewards/margins_min": 0.003484186250716448, "rewards/margins_std": 0.007311833556741476, "rewards/rejected": -0.005966088734567165, "step": 250 }, { "epoch": 0.07, "grad_norm": 0.376953125, "learning_rate": 3.2663316582914573e-07, "logits/chosen": 0.12969402968883514, "logits/rejected": 0.45209813117980957, "logps/chosen": -218.6748046875, "logps/rejected": -212.16696166992188, "loss": 0.6881, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.004049009643495083, "rewards/margins": 0.011781349778175354, "rewards/margins_max": 0.01769203506410122, "rewards/margins_min": 0.005870664492249489, "rewards/margins_std": 0.008358972147107124, "rewards/rejected": -0.0077323406003415585, "step": 260 }, { "epoch": 0.07, "grad_norm": 0.40625, "learning_rate": 3.3919597989949747e-07, "logits/chosen": 0.2151048183441162, "logits/rejected": 0.4109871983528137, "logps/chosen": -199.90518188476562, "logps/rejected": -220.63064575195312, "loss": 0.6873, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.0035167939495295286, "rewards/margins": 0.011875099502503872, "rewards/margins_max": 0.01781926490366459, "rewards/margins_min": 0.0059309350326657295, "rewards/margins_std": 0.008406318724155426, "rewards/rejected": -0.0083583053201437, "step": 270 }, { "epoch": 0.07, "grad_norm": 0.357421875, "learning_rate": 3.5175879396984927e-07, "logits/chosen": 0.018076254054903984, "logits/rejected": 0.2650180459022522, "logps/chosen": -199.3602752685547, "logps/rejected": -197.60350036621094, "loss": 0.6871, "rewards/accuracies": 0.9375, "rewards/chosen": 0.003981609828770161, "rewards/margins": 0.011555609293282032, "rewards/margins_max": 0.017135946080088615, "rewards/margins_min": 0.005975270643830299, "rewards/margins_std": 0.007891789078712463, "rewards/rejected": -0.007573998533189297, "step": 280 }, { "epoch": 0.07, "grad_norm": 0.423828125, "learning_rate": 3.64321608040201e-07, "logits/chosen": 0.021956929937005043, "logits/rejected": 0.4818328022956848, "logps/chosen": -227.0093231201172, "logps/rejected": -190.78150939941406, "loss": 0.6858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.004705543629825115, "rewards/margins": 0.013502237387001514, "rewards/margins_max": 0.020169807597994804, "rewards/margins_min": 0.0068346657790243626, "rewards/margins_std": 0.009429369121789932, "rewards/rejected": -0.0087966937571764, "step": 290 }, { "epoch": 0.08, "grad_norm": 0.375, "learning_rate": 3.7688442211055275e-07, "logits/chosen": 0.1162613034248352, "logits/rejected": 0.412226140499115, "logps/chosen": -189.89956665039062, "logps/rejected": -196.8206329345703, "loss": 0.6854, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.006150436587631702, "rewards/margins": 0.01643942855298519, "rewards/margins_max": 0.021918311715126038, "rewards/margins_min": 0.010960548184812069, "rewards/margins_std": 0.007748307194560766, "rewards/rejected": -0.010288992896676064, "step": 300 }, { "epoch": 0.08, "grad_norm": 0.3671875, "learning_rate": 3.8944723618090455e-07, "logits/chosen": 0.14860202372074127, "logits/rejected": 0.4366937577724457, "logps/chosen": -219.4801025390625, "logps/rejected": -228.6793212890625, "loss": 0.6847, "rewards/accuracies": 0.9375, "rewards/chosen": 0.005727228242903948, "rewards/margins": 0.017180735245347023, "rewards/margins_max": 0.02436363324522972, "rewards/margins_min": 0.009997835382819176, "rewards/margins_std": 0.010158153250813484, "rewards/rejected": -0.011453505605459213, "step": 310 }, { "epoch": 0.08, "grad_norm": 0.41015625, "learning_rate": 4.020100502512563e-07, "logits/chosen": 0.19813205301761627, "logits/rejected": 0.5719536542892456, "logps/chosen": -219.52560424804688, "logps/rejected": -206.9777374267578, "loss": 0.6844, "rewards/accuracies": 0.9375, "rewards/chosen": 0.00709904357790947, "rewards/margins": 0.018895188346505165, "rewards/margins_max": 0.026702377945184708, "rewards/margins_min": 0.011087999679148197, "rewards/margins_std": 0.011041032150387764, "rewards/rejected": -0.011796144768595695, "step": 320 }, { "epoch": 0.08, "grad_norm": 0.41796875, "learning_rate": 4.1457286432160803e-07, "logits/chosen": 0.1293250024318695, "logits/rejected": 0.44655877351760864, "logps/chosen": -232.55789184570312, "logps/rejected": -224.1845703125, "loss": 0.6845, "rewards/accuracies": 0.9375, "rewards/chosen": 0.008051590994000435, "rewards/margins": 0.018806347623467445, "rewards/margins_max": 0.02616736851632595, "rewards/margins_min": 0.011445323936641216, "rewards/margins_std": 0.010410057380795479, "rewards/rejected": -0.010754754766821861, "step": 330 }, { "epoch": 0.09, "grad_norm": 0.40234375, "learning_rate": 4.271356783919598e-07, "logits/chosen": 0.28589877486228943, "logits/rejected": 0.6117247939109802, "logps/chosen": -195.87106323242188, "logps/rejected": -194.0666046142578, "loss": 0.684, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.005987205076962709, "rewards/margins": 0.01625213399529457, "rewards/margins_max": 0.023097028955817223, "rewards/margins_min": 0.009407236240804195, "rewards/margins_std": 0.009680144488811493, "rewards/rejected": -0.010264927521348, "step": 340 }, { "epoch": 0.09, "grad_norm": 0.41796875, "learning_rate": 4.3969849246231157e-07, "logits/chosen": 0.10216917842626572, "logits/rejected": 0.4644095301628113, "logps/chosen": -192.9041748046875, "logps/rejected": -199.19735717773438, "loss": 0.6823, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.006600483320653439, "rewards/margins": 0.022701654583215714, "rewards/margins_max": 0.03271324187517166, "rewards/margins_min": 0.012690064497292042, "rewards/margins_std": 0.014158526435494423, "rewards/rejected": -0.0161011703312397, "step": 350 }, { "epoch": 0.09, "grad_norm": 0.349609375, "learning_rate": 4.522613065326633e-07, "logits/chosen": 0.11445100605487823, "logits/rejected": 0.25530433654785156, "logps/chosen": -200.52883911132812, "logps/rejected": -247.53128051757812, "loss": 0.6826, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.006267626769840717, "rewards/margins": 0.02223733812570572, "rewards/margins_max": 0.03066398575901985, "rewards/margins_min": 0.013810688629746437, "rewards/margins_std": 0.011917082592844963, "rewards/rejected": -0.015969710424542427, "step": 360 }, { "epoch": 0.09, "grad_norm": 0.3671875, "learning_rate": 4.6482412060301506e-07, "logits/chosen": 0.2603782117366791, "logits/rejected": 0.4916529059410095, "logps/chosen": -207.676025390625, "logps/rejected": -217.3939971923828, "loss": 0.6811, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.009192247875034809, "rewards/margins": 0.02688160166144371, "rewards/margins_max": 0.03934247046709061, "rewards/margins_min": 0.014420740306377411, "rewards/margins_std": 0.017622319981455803, "rewards/rejected": -0.017689354717731476, "step": 370 }, { "epoch": 0.1, "grad_norm": 0.404296875, "learning_rate": 4.773869346733669e-07, "logits/chosen": 0.04369829222559929, "logits/rejected": 0.31353959441185, "logps/chosen": -222.0448455810547, "logps/rejected": -220.94876098632812, "loss": 0.6802, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.00785902701318264, "rewards/margins": 0.02590133808553219, "rewards/margins_max": 0.03766251727938652, "rewards/margins_min": 0.014140157029032707, "rewards/margins_std": 0.016632821410894394, "rewards/rejected": -0.0180423092097044, "step": 380 }, { "epoch": 0.1, "grad_norm": 0.455078125, "learning_rate": 4.899497487437185e-07, "logits/chosen": 0.36211511492729187, "logits/rejected": 0.5968670845031738, "logps/chosen": -223.30615234375, "logps/rejected": -223.81729125976562, "loss": 0.6803, "rewards/accuracies": 0.9375, "rewards/chosen": 0.007726097013801336, "rewards/margins": 0.024434225633740425, "rewards/margins_max": 0.034294672310352325, "rewards/margins_min": 0.014573772437870502, "rewards/margins_std": 0.013944784179329872, "rewards/rejected": -0.016708126291632652, "step": 390 }, { "epoch": 0.1, "grad_norm": 0.42578125, "learning_rate": 4.999996141001429e-07, "logits/chosen": 0.1628507822751999, "logits/rejected": 0.6023589968681335, "logps/chosen": -249.71286010742188, "logps/rejected": -226.10037231445312, "loss": 0.6771, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.011553863994777203, "rewards/margins": 0.03338228166103363, "rewards/margins_max": 0.047240838408470154, "rewards/margins_min": 0.01952372118830681, "rewards/margins_std": 0.019598964601755142, "rewards/rejected": -0.021828416734933853, "step": 400 }, { "epoch": 0.1, "grad_norm": 0.4375, "learning_rate": 4.999861077302358e-07, "logits/chosen": 0.2815466821193695, "logits/rejected": 0.5784285664558411, "logps/chosen": -207.37551879882812, "logps/rejected": -233.9344940185547, "loss": 0.6754, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.011329596862196922, "rewards/margins": 0.03751852363348007, "rewards/margins_max": 0.051182061433792114, "rewards/margins_min": 0.023854978382587433, "rewards/margins_std": 0.019323166459798813, "rewards/rejected": -0.026188921183347702, "step": 410 }, { "epoch": 0.11, "grad_norm": 0.384765625, "learning_rate": 4.999533075588069e-07, "logits/chosen": 0.09976382553577423, "logits/rejected": 0.40051308274269104, "logps/chosen": -216.06954956054688, "logps/rejected": -214.60031127929688, "loss": 0.6766, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01043777447193861, "rewards/margins": 0.0323273167014122, "rewards/margins_max": 0.044423263520002365, "rewards/margins_min": 0.020231369882822037, "rewards/margins_std": 0.017106251791119576, "rewards/rejected": -0.021889541298151016, "step": 420 }, { "epoch": 0.11, "grad_norm": 0.373046875, "learning_rate": 4.99901216117357e-07, "logits/chosen": 0.22171469032764435, "logits/rejected": 0.5321763753890991, "logps/chosen": -213.97531127929688, "logps/rejected": -213.2228546142578, "loss": 0.6748, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01062067411839962, "rewards/margins": 0.03721512481570244, "rewards/margins_max": 0.055003441870212555, "rewards/margins_min": 0.01942681148648262, "rewards/margins_std": 0.025156473740935326, "rewards/rejected": -0.02659444883465767, "step": 430 }, { "epoch": 0.11, "grad_norm": 0.357421875, "learning_rate": 4.99829837426277e-07, "logits/chosen": 0.23026391863822937, "logits/rejected": 0.5481154322624207, "logps/chosen": -245.66995239257812, "logps/rejected": -234.21224975585938, "loss": 0.6751, "rewards/accuracies": 0.9375, "rewards/chosen": 0.011488174088299274, "rewards/margins": 0.03548017889261246, "rewards/margins_max": 0.05049975961446762, "rewards/margins_min": 0.020460600033402443, "rewards/margins_std": 0.02124089002609253, "rewards/rejected": -0.023992005735635757, "step": 440 }, { "epoch": 0.11, "grad_norm": 0.30859375, "learning_rate": 4.997391769945384e-07, "logits/chosen": 0.22155161201953888, "logits/rejected": 0.5006781220436096, "logps/chosen": -200.9005584716797, "logps/rejected": -212.7847900390625, "loss": 0.6705, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015426164492964745, "rewards/margins": 0.047034576535224915, "rewards/margins_max": 0.06396710872650146, "rewards/margins_min": 0.030102048069238663, "rewards/margins_std": 0.023946214467287064, "rewards/rejected": -0.03160841390490532, "step": 450 }, { "epoch": 0.12, "grad_norm": 0.376953125, "learning_rate": 4.996292418192676e-07, "logits/chosen": -0.06238239258527756, "logits/rejected": 0.22471682727336884, "logps/chosen": -202.37594604492188, "logps/rejected": -213.65774536132812, "loss": 0.6726, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.011132136918604374, "rewards/margins": 0.04166955500841141, "rewards/margins_max": 0.05782913416624069, "rewards/margins_min": 0.025509972125291824, "rewards/margins_std": 0.022853100672364235, "rewards/rejected": -0.03053741715848446, "step": 460 }, { "epoch": 0.12, "grad_norm": 0.4140625, "learning_rate": 4.995000403852057e-07, "logits/chosen": 0.12296873331069946, "logits/rejected": 0.4353713393211365, "logps/chosen": -213.2423095703125, "logps/rejected": -215.2025909423828, "loss": 0.6722, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.010903555899858475, "rewards/margins": 0.04033030942082405, "rewards/margins_max": 0.05910085514187813, "rewards/margins_min": 0.021559763699769974, "rewards/margins_std": 0.026545559987425804, "rewards/rejected": -0.029426757246255875, "step": 470 }, { "epoch": 0.12, "grad_norm": 0.443359375, "learning_rate": 4.993515826640541e-07, "logits/chosen": 0.05906867980957031, "logits/rejected": 0.2597416937351227, "logps/chosen": -205.0470733642578, "logps/rejected": -230.1743927001953, "loss": 0.6706, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.014294694177806377, "rewards/margins": 0.045732028782367706, "rewards/margins_max": 0.06336824595928192, "rewards/margins_min": 0.028095800429582596, "rewards/margins_std": 0.02494138851761818, "rewards/rejected": -0.031437329947948456, "step": 480 }, { "epoch": 0.12, "grad_norm": 0.380859375, "learning_rate": 4.99183880113705e-07, "logits/chosen": 0.033489521592855453, "logits/rejected": 0.49867790937423706, "logps/chosen": -212.2375946044922, "logps/rejected": -210.4663543701172, "loss": 0.6693, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.012695546261966228, "rewards/margins": 0.04652177169919014, "rewards/margins_max": 0.0679313912987709, "rewards/margins_min": 0.025112157687544823, "rewards/margins_std": 0.030277764424681664, "rewards/rejected": -0.03382622450590134, "step": 490 }, { "epoch": 0.13, "grad_norm": 0.49609375, "learning_rate": 4.989969456773562e-07, "logits/chosen": 0.37133318185806274, "logits/rejected": 0.7661579847335815, "logps/chosen": -238.337890625, "logps/rejected": -213.4925537109375, "loss": 0.6683, "rewards/accuracies": 0.9375, "rewards/chosen": 0.016709476709365845, "rewards/margins": 0.051804352551698685, "rewards/margins_max": 0.07735023647546768, "rewards/margins_min": 0.026258450001478195, "rewards/margins_std": 0.036127351224422455, "rewards/rejected": -0.03509486839175224, "step": 500 }, { "epoch": 0.13, "grad_norm": 0.40625, "learning_rate": 4.987907937825132e-07, "logits/chosen": 0.1539391726255417, "logits/rejected": 0.5792306661605835, "logps/chosen": -247.6181640625, "logps/rejected": -226.03036499023438, "loss": 0.6691, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.011538401246070862, "rewards/margins": 0.046936750411987305, "rewards/margins_max": 0.06901126354932785, "rewards/margins_min": 0.02486223541200161, "rewards/margins_std": 0.031218087300658226, "rewards/rejected": -0.035398345440626144, "step": 510 }, { "epoch": 0.13, "grad_norm": 0.4296875, "learning_rate": 4.985654403398752e-07, "logits/chosen": 0.1426265388727188, "logits/rejected": 0.6754364967346191, "logps/chosen": -241.33590698242188, "logps/rejected": -216.03286743164062, "loss": 0.6664, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.016581550240516663, "rewards/margins": 0.04815268889069557, "rewards/margins_max": 0.06514059007167816, "rewards/margins_min": 0.031164783984422684, "rewards/margins_std": 0.02402452565729618, "rewards/rejected": -0.03157113492488861, "step": 520 }, { "epoch": 0.13, "grad_norm": 0.408203125, "learning_rate": 4.983209027421071e-07, "logits/chosen": 0.04625128582119942, "logits/rejected": 0.3388972878456116, "logps/chosen": -194.6774139404297, "logps/rejected": -212.7203369140625, "loss": 0.6677, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.011985935270786285, "rewards/margins": 0.05305248498916626, "rewards/margins_max": 0.07709073275327682, "rewards/margins_min": 0.029014229774475098, "rewards/margins_std": 0.03399522230029106, "rewards/rejected": -0.041066545993089676, "step": 530 }, { "epoch": 0.14, "grad_norm": 0.4453125, "learning_rate": 4.980571998624969e-07, "logits/chosen": 0.27958276867866516, "logits/rejected": 0.670660138130188, "logps/chosen": -212.99832153320312, "logps/rejected": -199.5636444091797, "loss": 0.6692, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.009202254936099052, "rewards/margins": 0.049084585160017014, "rewards/margins_max": 0.07124610990285873, "rewards/margins_min": 0.02692306414246559, "rewards/margins_std": 0.031341131776571274, "rewards/rejected": -0.03988233581185341, "step": 540 }, { "epoch": 0.14, "grad_norm": 0.431640625, "learning_rate": 4.977743520535e-07, "logits/chosen": 0.10749062150716782, "logits/rejected": 0.4664621353149414, "logps/chosen": -252.5918426513672, "logps/rejected": -230.98580932617188, "loss": 0.6595, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01803063228726387, "rewards/margins": 0.07705044001340866, "rewards/margins_max": 0.10558941215276718, "rewards/margins_min": 0.04851147532463074, "rewards/margins_std": 0.04036019369959831, "rewards/rejected": -0.05901981145143509, "step": 550 }, { "epoch": 0.14, "grad_norm": 0.44921875, "learning_rate": 4.974723811451672e-07, "logits/chosen": 0.28724437952041626, "logits/rejected": 0.6529378890991211, "logps/chosen": -204.7720489501953, "logps/rejected": -201.96408081054688, "loss": 0.6617, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.015387463383376598, "rewards/margins": 0.06067556142807007, "rewards/margins_max": 0.08319296687841415, "rewards/margins_min": 0.038158148527145386, "rewards/margins_std": 0.031844429671764374, "rewards/rejected": -0.045288097113370895, "step": 560 }, { "epoch": 0.14, "grad_norm": 0.380859375, "learning_rate": 4.971513104434608e-07, "logits/chosen": 0.262315571308136, "logits/rejected": 0.40090543031692505, "logps/chosen": -177.6724853515625, "logps/rejected": -192.7591552734375, "loss": 0.6614, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.011687718331813812, "rewards/margins": 0.06579925119876862, "rewards/margins_max": 0.09133808314800262, "rewards/margins_min": 0.04026041179895401, "rewards/margins_std": 0.036117371171712875, "rewards/rejected": -0.0541115328669548, "step": 570 }, { "epoch": 0.15, "grad_norm": 0.640625, "learning_rate": 4.968111647284552e-07, "logits/chosen": 0.26100075244903564, "logits/rejected": 0.567363977432251, "logps/chosen": -223.25003051757812, "logps/rejected": -237.2589874267578, "loss": 0.6603, "rewards/accuracies": 0.9375, "rewards/chosen": 0.012476488947868347, "rewards/margins": 0.07055070996284485, "rewards/margins_max": 0.09941631555557251, "rewards/margins_min": 0.04168509691953659, "rewards/margins_std": 0.04082213342189789, "rewards/rejected": -0.0580742172896862, "step": 580 }, { "epoch": 0.15, "grad_norm": 0.42578125, "learning_rate": 4.96451970252425e-07, "logits/chosen": 0.3430723547935486, "logits/rejected": 0.5974612832069397, "logps/chosen": -208.9197540283203, "logps/rejected": -219.6869659423828, "loss": 0.6642, "rewards/accuracies": 0.9375, "rewards/chosen": 0.009925360791385174, "rewards/margins": 0.05603013187646866, "rewards/margins_max": 0.08645769953727722, "rewards/margins_min": 0.0256025530397892, "rewards/margins_std": 0.04303108900785446, "rewards/rejected": -0.04610477015376091, "step": 590 }, { "epoch": 0.15, "grad_norm": 0.458984375, "learning_rate": 4.960737547378185e-07, "logits/chosen": 0.15235112607479095, "logits/rejected": 0.5123030543327332, "logps/chosen": -209.2238006591797, "logps/rejected": -216.80374145507812, "loss": 0.6575, "rewards/accuracies": 0.9375, "rewards/chosen": 0.01144417840987444, "rewards/margins": 0.07184401899576187, "rewards/margins_max": 0.10671252012252808, "rewards/margins_min": 0.03697553277015686, "rewards/margins_std": 0.04931149631738663, "rewards/rejected": -0.06039984151721001, "step": 600 }, { "epoch": 0.15, "grad_norm": 0.392578125, "learning_rate": 4.956765473751179e-07, "logits/chosen": 0.20621080696582794, "logits/rejected": 0.5001572370529175, "logps/chosen": -206.8876190185547, "logps/rejected": -202.76043701171875, "loss": 0.6607, "rewards/accuracies": 0.9375, "rewards/chosen": 0.015829239040613174, "rewards/margins": 0.06330656260251999, "rewards/margins_max": 0.09255840629339218, "rewards/margins_min": 0.034054726362228394, "rewards/margins_std": 0.04136834666132927, "rewards/rejected": -0.04747732728719711, "step": 610 }, { "epoch": 0.16, "grad_norm": 0.43359375, "learning_rate": 4.952603788205869e-07, "logits/chosen": 0.16123279929161072, "logits/rejected": 0.2520465552806854, "logps/chosen": -200.49819946289062, "logps/rejected": -230.7953338623047, "loss": 0.6543, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.018199261277914047, "rewards/margins": 0.08130113780498505, "rewards/margins_max": 0.12046916782855988, "rewards/margins_min": 0.04213310033082962, "rewards/margins_std": 0.05539196729660034, "rewards/rejected": -0.0631018728017807, "step": 620 }, { "epoch": 0.16, "grad_norm": 0.373046875, "learning_rate": 4.948252811939043e-07, "logits/chosen": 0.26655691862106323, "logits/rejected": 0.5585820078849792, "logps/chosen": -223.76467895507812, "logps/rejected": -227.7748565673828, "loss": 0.6563, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.013967275619506836, "rewards/margins": 0.07716921716928482, "rewards/margins_max": 0.10888046026229858, "rewards/margins_min": 0.045457981526851654, "rewards/margins_std": 0.044846463948488235, "rewards/rejected": -0.06320194900035858, "step": 630 }, { "epoch": 0.16, "grad_norm": 0.37890625, "learning_rate": 4.943712880756853e-07, "logits/chosen": 0.2654273808002472, "logits/rejected": 0.417758047580719, "logps/chosen": -187.88235473632812, "logps/rejected": -205.4129638671875, "loss": 0.6574, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.009831604547798634, "rewards/margins": 0.07818244397640228, "rewards/margins_max": 0.11118423938751221, "rewards/margins_min": 0.04518064111471176, "rewards/margins_std": 0.04667159169912338, "rewards/rejected": -0.06835083663463593, "step": 640 }, { "epoch": 0.16, "grad_norm": 0.392578125, "learning_rate": 4.938984345048891e-07, "logits/chosen": 0.005340480245649815, "logits/rejected": 0.5790532827377319, "logps/chosen": -240.7495574951172, "logps/rejected": -196.30654907226562, "loss": 0.6563, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.012155624106526375, "rewards/margins": 0.07187846302986145, "rewards/margins_max": 0.11071207374334335, "rewards/margins_min": 0.03304483741521835, "rewards/margins_std": 0.054919034242630005, "rewards/rejected": -0.05972283333539963, "step": 650 }, { "epoch": 0.17, "grad_norm": 0.37109375, "learning_rate": 4.934067569761159e-07, "logits/chosen": 0.10061736404895782, "logits/rejected": 0.47449779510498047, "logps/chosen": -208.8763427734375, "logps/rejected": -224.052978515625, "loss": 0.6527, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.012553298845887184, "rewards/margins": 0.08297617733478546, "rewards/margins_max": 0.11780568212270737, "rewards/margins_min": 0.048146676272153854, "rewards/margins_std": 0.049256350845098495, "rewards/rejected": -0.07042287290096283, "step": 660 }, { "epoch": 0.17, "grad_norm": 0.400390625, "learning_rate": 4.928962934367886e-07, "logits/chosen": 0.14203248918056488, "logits/rejected": 0.39017271995544434, "logps/chosen": -194.6610870361328, "logps/rejected": -225.5804443359375, "loss": 0.6526, "rewards/accuracies": 0.9375, "rewards/chosen": 0.018391985446214676, "rewards/margins": 0.07769852131605148, "rewards/margins_max": 0.11252578347921371, "rewards/margins_min": 0.042871274054050446, "rewards/margins_std": 0.04925317317247391, "rewards/rejected": -0.059306539595127106, "step": 670 }, { "epoch": 0.17, "grad_norm": 0.41015625, "learning_rate": 4.923670832842255e-07, "logits/chosen": 0.18619462847709656, "logits/rejected": 0.5959904193878174, "logps/chosen": -208.16043090820312, "logps/rejected": -184.84901428222656, "loss": 0.6561, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.011603322811424732, "rewards/margins": 0.06881751120090485, "rewards/margins_max": 0.10558997094631195, "rewards/margins_min": 0.032045066356658936, "rewards/margins_std": 0.05200410634279251, "rewards/rejected": -0.05721420794725418, "step": 680 }, { "epoch": 0.17, "grad_norm": 0.427734375, "learning_rate": 4.918191673625989e-07, "logits/chosen": 0.2073175609111786, "logits/rejected": 0.5069489479064941, "logps/chosen": -209.69064331054688, "logps/rejected": -212.9167022705078, "loss": 0.6526, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.017011545598506927, "rewards/margins": 0.08031044900417328, "rewards/margins_max": 0.11243252456188202, "rewards/margins_min": 0.04818839207291603, "rewards/margins_std": 0.045427460223436356, "rewards/rejected": -0.06329891085624695, "step": 690 }, { "epoch": 0.18, "grad_norm": 0.39453125, "learning_rate": 4.912525879597829e-07, "logits/chosen": 0.03431146591901779, "logits/rejected": 0.4064474105834961, "logps/chosen": -212.8287353515625, "logps/rejected": -212.6048583984375, "loss": 0.6517, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.01717870496213436, "rewards/margins": 0.075754314661026, "rewards/margins_max": 0.10932576656341553, "rewards/margins_min": 0.04218285530805588, "rewards/margins_std": 0.04747720807790756, "rewards/rejected": -0.05857561156153679, "step": 700 }, { "epoch": 0.18, "grad_norm": 0.396484375, "learning_rate": 4.906673888040894e-07, "logits/chosen": 0.22898809611797333, "logits/rejected": 0.5808016657829285, "logps/chosen": -208.4955596923828, "logps/rejected": -211.0413818359375, "loss": 0.6532, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.012025387957692146, "rewards/margins": 0.08007804304361343, "rewards/margins_max": 0.12182573974132538, "rewards/margins_min": 0.03833033889532089, "rewards/margins_std": 0.059040166437625885, "rewards/rejected": -0.06805265694856644, "step": 710 }, { "epoch": 0.18, "grad_norm": 0.41796875, "learning_rate": 4.900636150608939e-07, "logits/chosen": 0.14877888560295105, "logits/rejected": 0.6197251677513123, "logps/chosen": -274.15960693359375, "logps/rejected": -230.4556121826172, "loss": 0.6558, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.015803003683686256, "rewards/margins": 0.08023526519536972, "rewards/margins_max": 0.11162183433771133, "rewards/margins_min": 0.04884869232773781, "rewards/margins_std": 0.044387321919202805, "rewards/rejected": -0.06443226337432861, "step": 720 }, { "epoch": 0.18, "grad_norm": 0.408203125, "learning_rate": 4.894413133291487e-07, "logits/chosen": 0.07972065359354019, "logits/rejected": 0.43912968039512634, "logps/chosen": -212.00863647460938, "logps/rejected": -220.36184692382812, "loss": 0.6493, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.02098964713513851, "rewards/margins": 0.10028757899999619, "rewards/margins_max": 0.14547555148601532, "rewards/margins_min": 0.05509962514042854, "rewards/margins_std": 0.06390543282032013, "rewards/rejected": -0.07929793000221252, "step": 730 }, { "epoch": 0.19, "grad_norm": 0.443359375, "learning_rate": 4.888005316377872e-07, "logits/chosen": 0.09715773165225983, "logits/rejected": 0.3823370039463043, "logps/chosen": -203.8870391845703, "logps/rejected": -232.822509765625, "loss": 0.6494, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.021012386307120323, "rewards/margins": 0.10091584920883179, "rewards/margins_max": 0.1349639594554901, "rewards/margins_min": 0.06686773151159286, "rewards/margins_std": 0.048151303082704544, "rewards/rejected": -0.07990345358848572, "step": 740 }, { "epoch": 0.19, "grad_norm": 0.357421875, "learning_rate": 4.881413194420168e-07, "logits/chosen": 0.27572113275527954, "logits/rejected": 0.5720052123069763, "logps/chosen": -226.29190063476562, "logps/rejected": -228.05880737304688, "loss": 0.6465, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.019430547952651978, "rewards/margins": 0.10530813038349152, "rewards/margins_max": 0.15189756453037262, "rewards/margins_min": 0.05871870368719101, "rewards/margins_std": 0.06588739156723022, "rewards/rejected": -0.08587757498025894, "step": 750 }, { "epoch": 0.19, "grad_norm": 0.4296875, "learning_rate": 4.874637276195017e-07, "logits/chosen": 0.28007036447525024, "logits/rejected": 0.5462180972099304, "logps/chosen": -191.09292602539062, "logps/rejected": -223.82144165039062, "loss": 0.643, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.01735042780637741, "rewards/margins": 0.10428867489099503, "rewards/margins_max": 0.1485210359096527, "rewards/margins_min": 0.06005631759762764, "rewards/margins_std": 0.0625540092587471, "rewards/rejected": -0.08693825453519821, "step": 760 }, { "epoch": 0.19, "grad_norm": 0.443359375, "learning_rate": 4.867678084664364e-07, "logits/chosen": 0.0784984901547432, "logits/rejected": 0.45213785767555237, "logps/chosen": -218.5895538330078, "logps/rejected": -232.12203979492188, "loss": 0.6458, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022956818342208862, "rewards/margins": 0.1018822193145752, "rewards/margins_max": 0.1473909467458725, "rewards/margins_min": 0.056373514235019684, "rewards/margins_std": 0.06435903906822205, "rewards/rejected": -0.07892540842294693, "step": 770 }, { "epoch": 0.2, "grad_norm": 0.45703125, "learning_rate": 4.860536156935097e-07, "logits/chosen": 0.3110271096229553, "logits/rejected": 0.4671865403652191, "logps/chosen": -208.2735137939453, "logps/rejected": -228.9082794189453, "loss": 0.642, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02270892634987831, "rewards/margins": 0.11020022630691528, "rewards/margins_max": 0.14996013045310974, "rewards/margins_min": 0.07044032961130142, "rewards/margins_std": 0.05622899532318115, "rewards/rejected": -0.08749129623174667, "step": 780 }, { "epoch": 0.2, "grad_norm": 0.427734375, "learning_rate": 4.853212044217591e-07, "logits/chosen": 0.15716281533241272, "logits/rejected": 0.501904308795929, "logps/chosen": -206.5747528076172, "logps/rejected": -224.3306427001953, "loss": 0.6455, "rewards/accuracies": 0.9375, "rewards/chosen": 0.015814324840903282, "rewards/margins": 0.08730728924274445, "rewards/margins_max": 0.1228957548737526, "rewards/margins_min": 0.0517188124358654, "rewards/margins_std": 0.050329696387052536, "rewards/rejected": -0.07149295508861542, "step": 790 }, { "epoch": 0.2, "grad_norm": 0.365234375, "learning_rate": 4.845706311783165e-07, "logits/chosen": 0.25198808312416077, "logits/rejected": 0.6890451908111572, "logps/chosen": -218.2187957763672, "logps/rejected": -209.95297241210938, "loss": 0.6471, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.014763091690838337, "rewards/margins": 0.09641151130199432, "rewards/margins_max": 0.14849857985973358, "rewards/margins_min": 0.04432445764541626, "rewards/margins_std": 0.07366223633289337, "rewards/rejected": -0.08164842426776886, "step": 800 }, { "epoch": 0.2, "grad_norm": 0.44140625, "learning_rate": 4.838019538920458e-07, "logits/chosen": 0.22046689689159393, "logits/rejected": 0.4486332833766937, "logps/chosen": -225.94906616210938, "logps/rejected": -219.116943359375, "loss": 0.6444, "rewards/accuracies": 0.9375, "rewards/chosen": 0.012000517919659615, "rewards/margins": 0.09707773476839066, "rewards/margins_max": 0.15281417965888977, "rewards/margins_min": 0.04134128615260124, "rewards/margins_std": 0.07882323861122131, "rewards/rejected": -0.0850772112607956, "step": 810 }, { "epoch": 0.21, "grad_norm": 0.388671875, "learning_rate": 4.830152318890716e-07, "logits/chosen": 0.1163412481546402, "logits/rejected": 0.48530465364456177, "logps/chosen": -205.08029174804688, "logps/rejected": -216.31167602539062, "loss": 0.642, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.008090192452073097, "rewards/margins": 0.10197190195322037, "rewards/margins_max": 0.15650735795497894, "rewards/margins_min": 0.04743645340204239, "rewards/margins_std": 0.07712477445602417, "rewards/rejected": -0.09388169646263123, "step": 820 }, { "epoch": 0.21, "grad_norm": 0.408203125, "learning_rate": 4.822105258882006e-07, "logits/chosen": 0.07750881463289261, "logits/rejected": 0.4977690279483795, "logps/chosen": -244.5646514892578, "logps/rejected": -249.2985076904297, "loss": 0.6457, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.009753966704010963, "rewards/margins": 0.10527770221233368, "rewards/margins_max": 0.1562924087047577, "rewards/margins_min": 0.05426298826932907, "rewards/margins_std": 0.07214570045471191, "rewards/rejected": -0.09552373737096786, "step": 830 }, { "epoch": 0.21, "grad_norm": 0.443359375, "learning_rate": 4.813878979962358e-07, "logits/chosen": 0.20875442028045654, "logits/rejected": 0.4509933590888977, "logps/chosen": -239.2003631591797, "logps/rejected": -256.7997131347656, "loss": 0.6398, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015841351822018623, "rewards/margins": 0.10631036758422852, "rewards/margins_max": 0.15342703461647034, "rewards/margins_min": 0.0591936893761158, "rewards/margins_std": 0.06663304567337036, "rewards/rejected": -0.09046901017427444, "step": 840 }, { "epoch": 0.21, "grad_norm": 0.37109375, "learning_rate": 4.805474117031821e-07, "logits/chosen": 0.15148243308067322, "logits/rejected": 0.5351013541221619, "logps/chosen": -213.7112274169922, "logps/rejected": -219.25759887695312, "loss": 0.6459, "rewards/accuracies": 0.9375, "rewards/chosen": 0.013252089731395245, "rewards/margins": 0.10840978473424911, "rewards/margins_max": 0.15735021233558655, "rewards/margins_min": 0.05946936458349228, "rewards/margins_std": 0.06921220570802689, "rewards/rejected": -0.095157690346241, "step": 850 }, { "epoch": 0.22, "grad_norm": 0.421875, "learning_rate": 4.796891318773472e-07, "logits/chosen": 0.1185070276260376, "logits/rejected": 0.6079737544059753, "logps/chosen": -235.15219116210938, "logps/rejected": -210.1244659423828, "loss": 0.6422, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.009891767986118793, "rewards/margins": 0.09201665967702866, "rewards/margins_max": 0.13068418204784393, "rewards/margins_min": 0.053349148482084274, "rewards/margins_std": 0.054684124886989594, "rewards/rejected": -0.08212489634752274, "step": 860 }, { "epoch": 0.22, "grad_norm": 0.412109375, "learning_rate": 4.788131247603344e-07, "logits/chosen": 0.1515607237815857, "logits/rejected": 0.39225345849990845, "logps/chosen": -203.7932891845703, "logps/rejected": -211.4970703125, "loss": 0.6424, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022816309705376625, "rewards/margins": 0.11361672729253769, "rewards/margins_max": 0.16148778796195984, "rewards/margins_min": 0.06574566662311554, "rewards/margins_std": 0.06769990921020508, "rewards/rejected": -0.09080041944980621, "step": 870 }, { "epoch": 0.22, "grad_norm": 0.4921875, "learning_rate": 4.779194579619306e-07, "logits/chosen": 0.09543349593877792, "logits/rejected": 0.3949804902076721, "logps/chosen": -227.0799560546875, "logps/rejected": -229.57510375976562, "loss": 0.6375, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.014561304822564125, "rewards/margins": 0.1107596904039383, "rewards/margins_max": 0.16909492015838623, "rewards/margins_min": 0.05242444947361946, "rewards/margins_std": 0.08249849081039429, "rewards/rejected": -0.09619838744401932, "step": 880 }, { "epoch": 0.22, "grad_norm": 0.375, "learning_rate": 4.770082004548878e-07, "logits/chosen": 0.18146385252475739, "logits/rejected": 0.6459946632385254, "logps/chosen": -248.006591796875, "logps/rejected": -206.70742797851562, "loss": 0.6383, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.020160971209406853, "rewards/margins": 0.12306095659732819, "rewards/margins_max": 0.17820948362350464, "rewards/margins_min": 0.06791240721940994, "rewards/margins_std": 0.07799182087182999, "rewards/rejected": -0.10289998352527618, "step": 890 }, { "epoch": 0.23, "grad_norm": 0.50390625, "learning_rate": 4.7607942256960015e-07, "logits/chosen": 0.12000491470098495, "logits/rejected": 0.5695959329605103, "logps/chosen": -241.203369140625, "logps/rejected": -230.412109375, "loss": 0.6387, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.012702536769211292, "rewards/margins": 0.11182727664709091, "rewards/margins_max": 0.15916919708251953, "rewards/margins_min": 0.06448537111282349, "rewards/margins_std": 0.06695158779621124, "rewards/rejected": -0.09912474453449249, "step": 900 }, { "epoch": 0.23, "grad_norm": 0.47265625, "learning_rate": 4.7513319598867575e-07, "logits/chosen": -0.11135046184062958, "logits/rejected": 0.19874481856822968, "logps/chosen": -204.3514862060547, "logps/rejected": -230.79074096679688, "loss": 0.6381, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.020336521789431572, "rewards/margins": 0.12428624927997589, "rewards/margins_max": 0.17267899215221405, "rewards/margins_min": 0.07589351385831833, "rewards/margins_std": 0.06843766570091248, "rewards/rejected": -0.10394972562789917, "step": 910 }, { "epoch": 0.23, "grad_norm": 0.4375, "learning_rate": 4.7416959374140404e-07, "logits/chosen": 0.2369326651096344, "logits/rejected": 0.43091505765914917, "logps/chosen": -203.09075927734375, "logps/rejected": -236.984619140625, "loss": 0.6418, "rewards/accuracies": 0.9375, "rewards/chosen": 0.007758637424558401, "rewards/margins": 0.10979725420475006, "rewards/margins_max": 0.1725243777036667, "rewards/margins_min": 0.04707012698054314, "rewards/margins_std": 0.08870954811573029, "rewards/rejected": -0.10203862190246582, "step": 920 }, { "epoch": 0.23, "grad_norm": 0.435546875, "learning_rate": 4.731886901981198e-07, "logits/chosen": 0.17712104320526123, "logits/rejected": 0.4920099675655365, "logps/chosen": -230.0107421875, "logps/rejected": -229.26318359375, "loss": 0.6362, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.009715641848742962, "rewards/margins": 0.1110834851861, "rewards/margins_max": 0.158487468957901, "rewards/margins_min": 0.06367949396371841, "rewards/margins_std": 0.0670393630862236, "rewards/rejected": -0.10136783123016357, "step": 930 }, { "epoch": 0.24, "grad_norm": 0.443359375, "learning_rate": 4.721905610644631e-07, "logits/chosen": 0.20608584582805634, "logits/rejected": 0.6499342918395996, "logps/chosen": -209.8543701171875, "logps/rejected": -222.61795043945312, "loss": 0.6383, "rewards/accuracies": 0.9375, "rewards/chosen": 0.009353621862828732, "rewards/margins": 0.11810240894556046, "rewards/margins_max": 0.16636307537555695, "rewards/margins_min": 0.06984174996614456, "rewards/margins_std": 0.0682508796453476, "rewards/rejected": -0.10874877870082855, "step": 940 }, { "epoch": 0.24, "grad_norm": 0.439453125, "learning_rate": 4.7117528337553614e-07, "logits/chosen": 0.25856292247772217, "logits/rejected": 0.6672132015228271, "logps/chosen": -221.52072143554688, "logps/rejected": -213.8031768798828, "loss": 0.6355, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.017076514661312103, "rewards/margins": 0.11997471004724503, "rewards/margins_max": 0.17075984179973602, "rewards/margins_min": 0.06918957084417343, "rewards/margins_std": 0.07182103395462036, "rewards/rejected": -0.10289819538593292, "step": 950 }, { "epoch": 0.24, "grad_norm": 0.451171875, "learning_rate": 4.701429354899582e-07, "logits/chosen": 0.19641758501529694, "logits/rejected": 0.5728838443756104, "logps/chosen": -223.01187133789062, "logps/rejected": -236.22396850585938, "loss": 0.6363, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022213028743863106, "rewards/margins": 0.12641343474388123, "rewards/margins_max": 0.18326741456985474, "rewards/margins_min": 0.06955946236848831, "rewards/margins_std": 0.08040366321802139, "rewards/rejected": -0.10420040041208267, "step": 960 }, { "epoch": 0.24, "grad_norm": 0.470703125, "learning_rate": 4.6909359708381767e-07, "logits/chosen": 0.2699734568595886, "logits/rejected": 0.7172943353652954, "logps/chosen": -220.46945190429688, "logps/rejected": -221.70779418945312, "loss": 0.6369, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.009590959176421165, "rewards/margins": 0.11778979003429413, "rewards/margins_max": 0.16688065230846405, "rewards/margins_min": 0.0686989426612854, "rewards/margins_std": 0.06942495703697205, "rewards/rejected": -0.10819883644580841, "step": 970 }, { "epoch": 0.25, "grad_norm": 0.388671875, "learning_rate": 4.680273491445226e-07, "logits/chosen": 0.26696455478668213, "logits/rejected": 0.5499181747436523, "logps/chosen": -207.48355102539062, "logps/rejected": -224.742431640625, "loss": 0.6326, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.28259003860876e-05, "rewards/margins": 0.12857209146022797, "rewards/margins_max": 0.19557307660579681, "rewards/margins_min": 0.061571113765239716, "rewards/margins_std": 0.0947537049651146, "rewards/rejected": -0.12863494455814362, "step": 980 }, { "epoch": 0.25, "grad_norm": 0.44140625, "learning_rate": 4.6694427396455047e-07, "logits/chosen": 0.25230395793914795, "logits/rejected": 0.5381912589073181, "logps/chosen": -208.3915557861328, "logps/rejected": -212.63623046875, "loss": 0.638, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011081243865191936, "rewards/margins": 0.12333667278289795, "rewards/margins_max": 0.18219420313835144, "rewards/margins_min": 0.06447914242744446, "rewards/margins_std": 0.08323711901903152, "rewards/rejected": -0.11225543171167374, "step": 990 }, { "epoch": 0.25, "grad_norm": 0.40625, "learning_rate": 4.658444551350965e-07, "logits/chosen": 0.15902924537658691, "logits/rejected": 0.507973313331604, "logps/chosen": -226.2003936767578, "logps/rejected": -217.74850463867188, "loss": 0.6364, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.008196848444640636, "rewards/margins": 0.11990328133106232, "rewards/margins_max": 0.17256389558315277, "rewards/margins_min": 0.06724263727664948, "rewards/margins_std": 0.07447338849306107, "rewards/rejected": -0.11170642077922821, "step": 1000 }, { "epoch": 0.25, "grad_norm": 0.3515625, "learning_rate": 4.6472797753962243e-07, "logits/chosen": 0.14983822405338287, "logits/rejected": 0.3576236367225647, "logps/chosen": -200.7541961669922, "logps/rejected": -233.3649139404297, "loss": 0.6404, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.005986286327242851, "rewards/margins": 0.10681941360235214, "rewards/margins_max": 0.15522412955760956, "rewards/margins_min": 0.05841469764709473, "rewards/margins_std": 0.06845460832118988, "rewards/rejected": -0.10083313286304474, "step": 1010 }, { "epoch": 0.26, "grad_norm": 0.41015625, "learning_rate": 4.6359492734730523e-07, "logits/chosen": 0.04730183631181717, "logits/rejected": 0.522533118724823, "logps/chosen": -224.05599975585938, "logps/rejected": -228.7908172607422, "loss": 0.6423, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0031752493232488632, "rewards/margins": 0.10396416485309601, "rewards/margins_max": 0.14557965099811554, "rewards/margins_min": 0.062348686158657074, "rewards/margins_std": 0.05885317921638489, "rewards/rejected": -0.10713942348957062, "step": 1020 }, { "epoch": 0.26, "grad_norm": 0.43359375, "learning_rate": 4.6244539200638623e-07, "logits/chosen": 0.2216562032699585, "logits/rejected": 0.6307166814804077, "logps/chosen": -229.996337890625, "logps/rejected": -215.51840209960938, "loss": 0.6361, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.002399321412667632, "rewards/margins": 0.11388808488845825, "rewards/margins_max": 0.16466781497001648, "rewards/margins_min": 0.06310835480690002, "rewards/margins_std": 0.07181338220834732, "rewards/rejected": -0.11148877441883087, "step": 1030 }, { "epoch": 0.26, "grad_norm": 0.392578125, "learning_rate": 4.612794602374225e-07, "logits/chosen": 0.01616589166224003, "logits/rejected": 0.4125541150569916, "logps/chosen": -239.0602569580078, "logps/rejected": -232.44058227539062, "loss": 0.6369, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.010500753298401833, "rewards/margins": 0.12285512685775757, "rewards/margins_max": 0.1805230677127838, "rewards/margins_min": 0.06518720090389252, "rewards/margins_std": 0.08155475556850433, "rewards/rejected": -0.11235438287258148, "step": 1040 }, { "epoch": 0.26, "grad_norm": 0.40234375, "learning_rate": 4.6009722202643895e-07, "logits/chosen": 0.18480125069618225, "logits/rejected": 0.6441187858581543, "logps/chosen": -234.36904907226562, "logps/rejected": -218.74832153320312, "loss": 0.637, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.002767058089375496, "rewards/margins": 0.10596774518489838, "rewards/margins_max": 0.1534162163734436, "rewards/margins_min": 0.05851926654577255, "rewards/margins_std": 0.06710227578878403, "rewards/rejected": -0.10320068895816803, "step": 1050 }, { "epoch": 0.27, "grad_norm": 0.404296875, "learning_rate": 4.5889876861798364e-07, "logits/chosen": 0.2748715877532959, "logits/rejected": 0.5483818054199219, "logps/chosen": -208.03842163085938, "logps/rejected": -232.2275390625, "loss": 0.6354, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011083085089921951, "rewards/margins": 0.11929738521575928, "rewards/margins_max": 0.17670035362243652, "rewards/margins_min": 0.06189443916082382, "rewards/margins_std": 0.08118002861738205, "rewards/rejected": -0.10821430385112762, "step": 1060 }, { "epoch": 0.27, "grad_norm": 0.423828125, "learning_rate": 4.5768419250808527e-07, "logits/chosen": 0.2828555703163147, "logits/rejected": 0.48374295234680176, "logps/chosen": -198.8870391845703, "logps/rejected": -238.9009246826172, "loss": 0.6348, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.004056218545883894, "rewards/margins": 0.13111725449562073, "rewards/margins_max": 0.18967494368553162, "rewards/margins_min": 0.07255958020687103, "rewards/margins_std": 0.0828130692243576, "rewards/rejected": -0.12706103920936584, "step": 1070 }, { "epoch": 0.27, "grad_norm": 0.400390625, "learning_rate": 4.5645358743711445e-07, "logits/chosen": 0.04673098772764206, "logits/rejected": 0.4958357810974121, "logps/chosen": -221.9709014892578, "logps/rejected": -207.37271118164062, "loss": 0.6328, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.010686990804970264, "rewards/margins": 0.1280701458454132, "rewards/margins_max": 0.19120752811431885, "rewards/margins_min": 0.06493276357650757, "rewards/margins_std": 0.08928973972797394, "rewards/rejected": -0.11738315969705582, "step": 1080 }, { "epoch": 0.27, "grad_norm": 0.38671875, "learning_rate": 4.552070483825489e-07, "logits/chosen": 0.2870774567127228, "logits/rejected": 0.6332607865333557, "logps/chosen": -218.626220703125, "logps/rejected": -226.4871368408203, "loss": 0.6372, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.008383207023143768, "rewards/margins": 0.1219739317893982, "rewards/margins_max": 0.18133440613746643, "rewards/margins_min": 0.06261344254016876, "rewards/margins_std": 0.08394841104745865, "rewards/rejected": -0.11359071731567383, "step": 1090 }, { "epoch": 0.28, "grad_norm": 0.52734375, "learning_rate": 4.539446715516433e-07, "logits/chosen": 0.2350023239850998, "logits/rejected": 0.6149531602859497, "logps/chosen": -247.345703125, "logps/rejected": -232.6597137451172, "loss": 0.6435, "rewards/accuracies": 0.9375, "rewards/chosen": 0.005494078621268272, "rewards/margins": 0.12571188807487488, "rewards/margins_max": 0.18273046612739563, "rewards/margins_min": 0.06869328022003174, "rewards/margins_std": 0.08063645660877228, "rewards/rejected": -0.12021778523921967, "step": 1100 }, { "epoch": 0.28, "grad_norm": 0.42578125, "learning_rate": 4.5266655437400373e-07, "logits/chosen": 0.3315362334251404, "logits/rejected": 0.7173594832420349, "logps/chosen": -217.39102172851562, "logps/rejected": -223.75289916992188, "loss": 0.6386, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0021430067718029022, "rewards/margins": 0.11483003199100494, "rewards/margins_max": 0.17373351752758026, "rewards/margins_min": 0.05592656135559082, "rewards/margins_std": 0.0833020955324173, "rewards/rejected": -0.11697304248809814, "step": 1110 }, { "epoch": 0.28, "grad_norm": 0.3984375, "learning_rate": 4.5137279549406847e-07, "logits/chosen": 0.13369938731193542, "logits/rejected": 0.5289446115493774, "logps/chosen": -188.17312622070312, "logps/rejected": -191.70199584960938, "loss": 0.6418, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.008846143260598183, "rewards/margins": 0.09507354348897934, "rewards/margins_max": 0.1387803554534912, "rewards/margins_min": 0.05136672407388687, "rewards/margins_std": 0.061810772866010666, "rewards/rejected": -0.0862274020910263, "step": 1120 }, { "epoch": 0.28, "grad_norm": 0.4296875, "learning_rate": 4.5006349476349423e-07, "logits/chosen": 0.16019530594348907, "logits/rejected": 0.4802624583244324, "logps/chosen": -225.89627075195312, "logps/rejected": -232.423095703125, "loss": 0.6351, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011177858337759972, "rewards/margins": 0.13784930109977722, "rewards/margins_max": 0.19627857208251953, "rewards/margins_min": 0.07942002266645432, "rewards/margins_std": 0.08263148367404938, "rewards/rejected": -0.1266714483499527, "step": 1130 }, { "epoch": 0.29, "grad_norm": 0.421875, "learning_rate": 4.487387532334501e-07, "logits/chosen": 0.059213705360889435, "logits/rejected": 0.46955880522727966, "logps/chosen": -235.9820556640625, "logps/rejected": -246.6029815673828, "loss": 0.6349, "rewards/accuracies": 0.9375, "rewards/chosen": 0.00040880925371311605, "rewards/margins": 0.12326528877019882, "rewards/margins_max": 0.18753501772880554, "rewards/margins_min": 0.0589955635368824, "rewards/margins_std": 0.09089111536741257, "rewards/rejected": -0.12285648286342621, "step": 1140 }, { "epoch": 0.29, "grad_norm": 0.431640625, "learning_rate": 4.473986731468183e-07, "logits/chosen": 0.253683865070343, "logits/rejected": 0.555522084236145, "logps/chosen": -200.7284698486328, "logps/rejected": -225.14419555664062, "loss": 0.6284, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.015354807488620281, "rewards/margins": 0.13844987750053406, "rewards/margins_max": 0.19759990274906158, "rewards/margins_min": 0.07929985970258713, "rewards/margins_std": 0.08365076035261154, "rewards/rejected": -0.1230950728058815, "step": 1150 }, { "epoch": 0.29, "grad_norm": 0.404296875, "learning_rate": 4.460433579303029e-07, "logits/chosen": 0.08314850181341171, "logits/rejected": 0.5074546933174133, "logps/chosen": -222.1901092529297, "logps/rejected": -228.58462524414062, "loss": 0.633, "rewards/accuracies": 0.9375, "rewards/chosen": 0.004076335579156876, "rewards/margins": 0.12874075770378113, "rewards/margins_max": 0.18448428809642792, "rewards/margins_min": 0.07299719750881195, "rewards/margins_std": 0.07883326709270477, "rewards/rejected": -0.12466441094875336, "step": 1160 }, { "epoch": 0.29, "grad_norm": 0.41015625, "learning_rate": 4.446729121864478e-07, "logits/chosen": 0.10272110998630524, "logits/rejected": 0.5856494903564453, "logps/chosen": -220.8333740234375, "logps/rejected": -219.6571044921875, "loss": 0.6291, "rewards/accuracies": 0.9375, "rewards/chosen": 0.012485465034842491, "rewards/margins": 0.13196933269500732, "rewards/margins_max": 0.19599154591560364, "rewards/margins_min": 0.0679471343755722, "rewards/margins_std": 0.09054107218980789, "rewards/rejected": -0.11948386579751968, "step": 1170 }, { "epoch": 0.3, "grad_norm": 0.421875, "learning_rate": 4.4328744168556314e-07, "logits/chosen": 0.1438855081796646, "logits/rejected": 0.46074891090393066, "logps/chosen": -206.9652099609375, "logps/rejected": -220.42764282226562, "loss": 0.626, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.012216602452099323, "rewards/margins": 0.14871208369731903, "rewards/margins_max": 0.20919618010520935, "rewards/margins_min": 0.08822795748710632, "rewards/margins_std": 0.08553745597600937, "rewards/rejected": -0.1364954710006714, "step": 1180 }, { "epoch": 0.3, "grad_norm": 0.451171875, "learning_rate": 4.4188705335756246e-07, "logits/chosen": 0.25555095076560974, "logits/rejected": 0.5125433206558228, "logps/chosen": -202.17300415039062, "logps/rejected": -208.05978393554688, "loss": 0.633, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.004283900372684002, "rewards/margins": 0.11717404425144196, "rewards/margins_max": 0.1711912453174591, "rewards/margins_min": 0.06315682828426361, "rewards/margins_std": 0.07639187574386597, "rewards/rejected": -0.11289013922214508, "step": 1190 }, { "epoch": 0.3, "grad_norm": 0.427734375, "learning_rate": 4.4047185528370954e-07, "logits/chosen": 0.19039386510849, "logits/rejected": 0.5899905562400818, "logps/chosen": -216.4933624267578, "logps/rejected": -217.4136199951172, "loss": 0.6359, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.009880407713353634, "rewards/margins": 0.12567076086997986, "rewards/margins_max": 0.17633716762065887, "rewards/margins_min": 0.07500435411930084, "rewards/margins_std": 0.07165311276912689, "rewards/rejected": -0.13555116951465607, "step": 1200 }, { "epoch": 0.3, "grad_norm": 0.455078125, "learning_rate": 4.3904195668827694e-07, "logits/chosen": 0.1970784068107605, "logits/rejected": 0.541928768157959, "logps/chosen": -220.466552734375, "logps/rejected": -214.4608612060547, "loss": 0.6321, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.004724076017737389, "rewards/margins": 0.12655866146087646, "rewards/margins_max": 0.1726570725440979, "rewards/margins_min": 0.08046025037765503, "rewards/margins_std": 0.06519299000501633, "rewards/rejected": -0.12183459848165512, "step": 1210 }, { "epoch": 0.31, "grad_norm": 0.427734375, "learning_rate": 4.375974679301158e-07, "logits/chosen": 0.1839190125465393, "logits/rejected": 0.6055052876472473, "logps/chosen": -211.6237030029297, "logps/rejected": -211.16397094726562, "loss": 0.6347, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0040855249390006065, "rewards/margins": 0.13839025795459747, "rewards/margins_max": 0.18665581941604614, "rewards/margins_min": 0.09012471139431, "rewards/margins_std": 0.06825779378414154, "rewards/rejected": -0.13430474698543549, "step": 1220 }, { "epoch": 0.31, "grad_norm": 0.408203125, "learning_rate": 4.3613850049413896e-07, "logits/chosen": 0.1453840434551239, "logits/rejected": 0.4979386329650879, "logps/chosen": -194.91259765625, "logps/rejected": -222.55825805664062, "loss": 0.6314, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.009354735724627972, "rewards/margins": 0.1309475600719452, "rewards/margins_max": 0.19592466950416565, "rewards/margins_min": 0.06597042083740234, "rewards/margins_std": 0.09189152717590332, "rewards/rejected": -0.12159280478954315, "step": 1230 }, { "epoch": 0.31, "grad_norm": 0.44140625, "learning_rate": 4.346651669827162e-07, "logits/chosen": 0.2608383595943451, "logits/rejected": 0.5879455804824829, "logps/chosen": -231.02267456054688, "logps/rejected": -247.4701690673828, "loss": 0.6338, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.006313489284366369, "rewards/margins": 0.12605717778205872, "rewards/margins_max": 0.18184205889701843, "rewards/margins_min": 0.07027232646942139, "rewards/margins_std": 0.07889171689748764, "rewards/rejected": -0.11974368989467621, "step": 1240 }, { "epoch": 0.31, "grad_norm": 0.412109375, "learning_rate": 4.331775811069837e-07, "logits/chosen": 0.19402122497558594, "logits/rejected": 0.5059689283370972, "logps/chosen": -217.4102020263672, "logps/rejected": -245.49765014648438, "loss": 0.6316, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.005672194994986057, "rewards/margins": 0.13400006294250488, "rewards/margins_max": 0.18662145733833313, "rewards/margins_min": 0.08137868344783783, "rewards/margins_std": 0.07441785931587219, "rewards/rejected": -0.13967224955558777, "step": 1250 }, { "epoch": 0.32, "grad_norm": 0.546875, "learning_rate": 4.316758576780679e-07, "logits/chosen": 0.15562015771865845, "logits/rejected": 0.4818592965602875, "logps/chosen": -225.107666015625, "logps/rejected": -238.4374542236328, "loss": 0.6316, "rewards/accuracies": 0.9375, "rewards/chosen": -0.006252510007470846, "rewards/margins": 0.1173849105834961, "rewards/margins_max": 0.17692282795906067, "rewards/margins_min": 0.0578470341861248, "rewards/margins_std": 0.08419928699731827, "rewards/rejected": -0.12363742291927338, "step": 1260 }, { "epoch": 0.32, "grad_norm": 0.462890625, "learning_rate": 4.301601125982245e-07, "logits/chosen": 0.2666797935962677, "logits/rejected": 0.6440590620040894, "logps/chosen": -212.3495330810547, "logps/rejected": -222.29611206054688, "loss": 0.6274, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.009626020677387714, "rewards/margins": 0.1582595556974411, "rewards/margins_max": 0.22133047878742218, "rewards/margins_min": 0.09518863260746002, "rewards/margins_std": 0.08919575065374374, "rewards/rejected": -0.14863352477550507, "step": 1270 }, { "epoch": 0.32, "grad_norm": 0.412109375, "learning_rate": 4.286304628518932e-07, "logits/chosen": 0.06284558773040771, "logits/rejected": 0.4226767122745514, "logps/chosen": -202.75430297851562, "logps/rejected": -211.32473754882812, "loss": 0.6352, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.00244440627284348, "rewards/margins": 0.11437270790338516, "rewards/margins_max": 0.1629493087530136, "rewards/margins_min": 0.06579609215259552, "rewards/margins_std": 0.06869770586490631, "rewards/rejected": -0.11681709438562393, "step": 1280 }, { "epoch": 0.32, "grad_norm": 0.47265625, "learning_rate": 4.270870264966687e-07, "logits/chosen": 0.15754824876785278, "logits/rejected": 0.6121476888656616, "logps/chosen": -210.3895721435547, "logps/rejected": -220.9415283203125, "loss": 0.6248, "rewards/accuracies": 0.9375, "rewards/chosen": 0.011729549616575241, "rewards/margins": 0.1439180225133896, "rewards/margins_max": 0.20768460631370544, "rewards/margins_min": 0.08015145361423492, "rewards/margins_std": 0.09017957001924515, "rewards/rejected": -0.13218848407268524, "step": 1290 }, { "epoch": 0.33, "grad_norm": 0.4375, "learning_rate": 4.255299226541893e-07, "logits/chosen": 0.194209024310112, "logits/rejected": 0.6800872087478638, "logps/chosen": -227.7884979248047, "logps/rejected": -227.42794799804688, "loss": 0.6271, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.007912559434771538, "rewards/margins": 0.14282198250293732, "rewards/margins_max": 0.19915294647216797, "rewards/margins_min": 0.08649100363254547, "rewards/margins_std": 0.07966402173042297, "rewards/rejected": -0.13490942120552063, "step": 1300 }, { "epoch": 0.33, "grad_norm": 0.4375, "learning_rate": 4.239592715009429e-07, "logits/chosen": 0.044777266681194305, "logits/rejected": 0.4025615155696869, "logps/chosen": -208.9937744140625, "logps/rejected": -205.27285766601562, "loss": 0.6322, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.004150100983679295, "rewards/margins": 0.1141623854637146, "rewards/margins_max": 0.16347596049308777, "rewards/margins_min": 0.06484884023666382, "rewards/margins_std": 0.06973989307880402, "rewards/rejected": -0.11831249296665192, "step": 1310 }, { "epoch": 0.33, "grad_norm": 0.4140625, "learning_rate": 4.2237519425899243e-07, "logits/chosen": 0.1900787055492401, "logits/rejected": 0.49922212958335876, "logps/chosen": -235.10543823242188, "logps/rejected": -251.2314910888672, "loss": 0.6227, "rewards/accuracies": 0.9375, "rewards/chosen": -0.007621604949235916, "rewards/margins": 0.15270861983299255, "rewards/margins_max": 0.21048466861248016, "rewards/margins_min": 0.09493254870176315, "rewards/margins_std": 0.08170770108699799, "rewards/rejected": -0.16033020615577698, "step": 1320 }, { "epoch": 0.33, "grad_norm": 0.4921875, "learning_rate": 4.207778131866191e-07, "logits/chosen": 0.2092297077178955, "logits/rejected": 0.43651413917541504, "logps/chosen": -217.12350463867188, "logps/rejected": -222.86318969726562, "loss": 0.6292, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.005055149085819721, "rewards/margins": 0.12460893392562866, "rewards/margins_max": 0.18898361921310425, "rewards/margins_min": 0.060234226286411285, "rewards/margins_std": 0.09103957563638687, "rewards/rejected": -0.12966406345367432, "step": 1330 }, { "epoch": 0.34, "grad_norm": 0.423828125, "learning_rate": 4.191672515688872e-07, "logits/chosen": 0.14932546019554138, "logits/rejected": 0.4373621344566345, "logps/chosen": -208.5403594970703, "logps/rejected": -203.9051971435547, "loss": 0.6269, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0020638906862586737, "rewards/margins": 0.13373130559921265, "rewards/margins_max": 0.19542917609214783, "rewards/margins_min": 0.07203344255685806, "rewards/margins_std": 0.08725395798683167, "rewards/rejected": -0.13166742026805878, "step": 1340 }, { "epoch": 0.34, "grad_norm": 0.46484375, "learning_rate": 4.175436337081288e-07, "logits/chosen": 0.23434293270111084, "logits/rejected": 0.5381620526313782, "logps/chosen": -192.01748657226562, "logps/rejected": -231.3257598876953, "loss": 0.622, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0012298275250941515, "rewards/margins": 0.15280774235725403, "rewards/margins_max": 0.21529679000377655, "rewards/margins_min": 0.0903187245130539, "rewards/margins_std": 0.08837283402681351, "rewards/rejected": -0.1515779197216034, "step": 1350 }, { "epoch": 0.34, "grad_norm": 0.453125, "learning_rate": 4.1590708491435045e-07, "logits/chosen": 0.11555041372776031, "logits/rejected": 0.5687001943588257, "logps/chosen": -221.37313842773438, "logps/rejected": -210.83468627929688, "loss": 0.6277, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.005098782479763031, "rewards/margins": 0.1396380364894867, "rewards/margins_max": 0.19858750700950623, "rewards/margins_min": 0.08068858087062836, "rewards/margins_std": 0.08336714655160904, "rewards/rejected": -0.14473682641983032, "step": 1360 }, { "epoch": 0.34, "grad_norm": 0.44140625, "learning_rate": 4.142577314955613e-07, "logits/chosen": 0.17060637474060059, "logits/rejected": 0.5141999125480652, "logps/chosen": -206.1536102294922, "logps/rejected": -233.5289306640625, "loss": 0.6258, "rewards/accuracies": 0.9375, "rewards/chosen": -0.00100190460216254, "rewards/margins": 0.14934705197811127, "rewards/margins_max": 0.2153509110212326, "rewards/margins_min": 0.08334319293498993, "rewards/margins_std": 0.0933435708284378, "rewards/rejected": -0.150348961353302, "step": 1370 }, { "epoch": 0.35, "grad_norm": 0.408203125, "learning_rate": 4.1259570074802506e-07, "logits/chosen": 0.4289766252040863, "logits/rejected": 0.7257094383239746, "logps/chosen": -226.4285888671875, "logps/rejected": -213.3216552734375, "loss": 0.628, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0056352317333221436, "rewards/margins": 0.12981966137886047, "rewards/margins_max": 0.18293872475624084, "rewards/margins_min": 0.07670056819915771, "rewards/margins_std": 0.07512171566486359, "rewards/rejected": -0.13545487821102142, "step": 1380 }, { "epoch": 0.35, "grad_norm": 0.380859375, "learning_rate": 4.109211209464354e-07, "logits/chosen": 0.17007596790790558, "logits/rejected": 0.38958263397216797, "logps/chosen": -210.254150390625, "logps/rejected": -224.582275390625, "loss": 0.6273, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0023724420461803675, "rewards/margins": 0.13513801991939545, "rewards/margins_max": 0.19057369232177734, "rewards/margins_min": 0.07970234006643295, "rewards/margins_std": 0.07839788496494293, "rewards/rejected": -0.13276559114456177, "step": 1390 }, { "epoch": 0.35, "grad_norm": 0.466796875, "learning_rate": 4.0923412133401546e-07, "logits/chosen": 0.24399927258491516, "logits/rejected": 0.4683291018009186, "logps/chosen": -213.03842163085938, "logps/rejected": -266.65850830078125, "loss": 0.627, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.002091284841299057, "rewards/margins": 0.13846664130687714, "rewards/margins_max": 0.19727608561515808, "rewards/margins_min": 0.079657182097435, "rewards/margins_std": 0.0831691175699234, "rewards/rejected": -0.1363753378391266, "step": 1400 }, { "epoch": 0.35, "grad_norm": 0.40625, "learning_rate": 4.0753483211254326e-07, "logits/chosen": 0.10242321342229843, "logits/rejected": 0.42057886719703674, "logps/chosen": -199.49154663085938, "logps/rejected": -224.8021697998047, "loss": 0.6254, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.0012313395272940397, "rewards/margins": 0.15778854489326477, "rewards/margins_max": 0.22822323441505432, "rewards/margins_min": 0.08735384047031403, "rewards/margins_std": 0.09960971772670746, "rewards/rejected": -0.15901990234851837, "step": 1410 }, { "epoch": 0.36, "grad_norm": 0.439453125, "learning_rate": 4.0582338443230244e-07, "logits/chosen": -0.06408219784498215, "logits/rejected": 0.3505231738090515, "logps/chosen": -211.2571258544922, "logps/rejected": -215.91519165039062, "loss": 0.6322, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.004530089441686869, "rewards/margins": 0.13423126935958862, "rewards/margins_max": 0.18881027400493622, "rewards/margins_min": 0.07965225726366043, "rewards/margins_std": 0.07718638330698013, "rewards/rejected": -0.13876134157180786, "step": 1420 }, { "epoch": 0.36, "grad_norm": 0.341796875, "learning_rate": 4.040999103819606e-07, "logits/chosen": 0.22336915135383606, "logits/rejected": 0.41135939955711365, "logps/chosen": -195.76156616210938, "logps/rejected": -216.2519073486328, "loss": 0.6291, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.001288919011130929, "rewards/margins": 0.13204926252365112, "rewards/margins_max": 0.18971675634384155, "rewards/margins_min": 0.0743817538022995, "rewards/margins_std": 0.08155416697263718, "rewards/rejected": -0.1307603418827057, "step": 1430 }, { "epoch": 0.36, "grad_norm": 0.439453125, "learning_rate": 4.023645429783743e-07, "logits/chosen": 0.14188432693481445, "logits/rejected": 0.5488114356994629, "logps/chosen": -248.9943389892578, "logps/rejected": -237.12417602539062, "loss": 0.6285, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.000656818097922951, "rewards/margins": 0.14548137784004211, "rewards/margins_max": 0.20429666340351105, "rewards/margins_min": 0.08666609972715378, "rewards/margins_std": 0.0831773653626442, "rewards/rejected": -0.1448245495557785, "step": 1440 }, { "epoch": 0.36, "grad_norm": 0.443359375, "learning_rate": 4.0061741615632326e-07, "logits/chosen": 0.14987316727638245, "logits/rejected": 0.3361620306968689, "logps/chosen": -204.991943359375, "logps/rejected": -235.5940704345703, "loss": 0.6253, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.003025595098733902, "rewards/margins": 0.14620764553546906, "rewards/margins_max": 0.21231038868427277, "rewards/margins_min": 0.08010489493608475, "rewards/margins_std": 0.09348339587450027, "rewards/rejected": -0.14318202435970306, "step": 1450 }, { "epoch": 0.37, "grad_norm": 0.419921875, "learning_rate": 3.988586647581731e-07, "logits/chosen": 0.03841057047247887, "logits/rejected": 0.23656149208545685, "logps/chosen": -197.7430877685547, "logps/rejected": -218.1037139892578, "loss": 0.6278, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.004657519515603781, "rewards/margins": 0.13344493508338928, "rewards/margins_max": 0.19703121483325958, "rewards/margins_min": 0.06985869258642197, "rewards/margins_std": 0.08992455154657364, "rewards/rejected": -0.1381024718284607, "step": 1460 }, { "epoch": 0.37, "grad_norm": 0.375, "learning_rate": 3.970884245234683e-07, "logits/chosen": 0.1984628140926361, "logits/rejected": 0.4503410756587982, "logps/chosen": -202.2525177001953, "logps/rejected": -230.85122680664062, "loss": 0.6287, "rewards/accuracies": 0.9375, "rewards/chosen": -0.004781616386026144, "rewards/margins": 0.15508830547332764, "rewards/margins_max": 0.22772547602653503, "rewards/margins_min": 0.08245115727186203, "rewards/margins_std": 0.10272447764873505, "rewards/rejected": -0.1598699539899826, "step": 1470 }, { "epoch": 0.37, "grad_norm": 0.388671875, "learning_rate": 3.9530683207845596e-07, "logits/chosen": 0.2983730435371399, "logits/rejected": 0.37755218148231506, "logps/chosen": -223.62875366210938, "logps/rejected": -270.2645263671875, "loss": 0.624, "rewards/accuracies": 0.9375, "rewards/chosen": -0.015195081941783428, "rewards/margins": 0.13946563005447388, "rewards/margins_max": 0.20698609948158264, "rewards/margins_min": 0.07194516807794571, "rewards/margins_std": 0.09548836201429367, "rewards/rejected": -0.15466073155403137, "step": 1480 }, { "epoch": 0.37, "grad_norm": 0.40234375, "learning_rate": 3.9351402492554117e-07, "logits/chosen": 0.225258469581604, "logits/rejected": 0.5460050702095032, "logps/chosen": -218.29244995117188, "logps/rejected": -226.8291473388672, "loss": 0.624, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.01104720775038004, "rewards/margins": 0.14948385953903198, "rewards/margins_max": 0.21099543571472168, "rewards/margins_min": 0.08797230571508408, "rewards/margins_std": 0.08699048310518265, "rewards/rejected": -0.16053107380867004, "step": 1490 }, { "epoch": 0.38, "grad_norm": 0.51953125, "learning_rate": 3.917101414326743e-07, "logits/chosen": 0.30243515968322754, "logits/rejected": 0.555859386920929, "logps/chosen": -192.96510314941406, "logps/rejected": -226.6133575439453, "loss": 0.621, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.0012192493304610252, "rewards/margins": 0.16557331383228302, "rewards/margins_max": 0.23345895111560822, "rewards/margins_min": 0.09768766164779663, "rewards/margins_std": 0.09600480645895004, "rewards/rejected": -0.16435407102108002, "step": 1500 }, { "epoch": 0.38, "grad_norm": 0.38671875, "learning_rate": 3.8989532082267226e-07, "logits/chosen": 0.18203946948051453, "logits/rejected": 0.5518532395362854, "logps/chosen": -213.8331756591797, "logps/rejected": -230.1101837158203, "loss": 0.6186, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0075533525086939335, "rewards/margins": 0.17561054229736328, "rewards/margins_max": 0.2519463300704956, "rewards/margins_min": 0.09927478432655334, "rewards/margins_std": 0.10795507580041885, "rewards/rejected": -0.16805720329284668, "step": 1510 }, { "epoch": 0.38, "grad_norm": 0.443359375, "learning_rate": 3.8806970316247287e-07, "logits/chosen": 0.1255679875612259, "logits/rejected": 0.49999529123306274, "logps/chosen": -255.6925048828125, "logps/rejected": -236.8339080810547, "loss": 0.6362, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.017898105084896088, "rewards/margins": 0.11929675191640854, "rewards/margins_max": 0.18690548837184906, "rewards/margins_min": 0.05168800801038742, "rewards/margins_std": 0.09561319649219513, "rewards/rejected": -0.13719485700130463, "step": 1520 }, { "epoch": 0.38, "grad_norm": 0.416015625, "learning_rate": 3.8623342935232524e-07, "logits/chosen": 0.1429055631160736, "logits/rejected": 0.5958294868469238, "logps/chosen": -209.48355102539062, "logps/rejected": -229.17324829101562, "loss": 0.6257, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 5.754493031417951e-05, "rewards/margins": 0.1530180275440216, "rewards/margins_max": 0.21532678604125977, "rewards/margins_min": 0.09070924669504166, "rewards/margins_std": 0.08811791241168976, "rewards/rejected": -0.15296046435832977, "step": 1530 }, { "epoch": 0.39, "grad_norm": 0.35546875, "learning_rate": 3.8438664111491427e-07, "logits/chosen": 0.15772657096385956, "logits/rejected": 0.5604976415634155, "logps/chosen": -214.9139862060547, "logps/rejected": -206.59896850585938, "loss": 0.6304, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01669057086110115, "rewards/margins": 0.11732876300811768, "rewards/margins_max": 0.1783764511346817, "rewards/margins_min": 0.05628107860684395, "rewards/margins_std": 0.0863344669342041, "rewards/rejected": -0.13401933014392853, "step": 1540 }, { "epoch": 0.39, "grad_norm": 0.474609375, "learning_rate": 3.825294809844234e-07, "logits/chosen": -0.007230892777442932, "logits/rejected": 0.21603484451770782, "logps/chosen": -198.36126708984375, "logps/rejected": -217.1881561279297, "loss": 0.6281, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.010291590355336666, "rewards/margins": 0.1217050701379776, "rewards/margins_max": 0.17377857863903046, "rewards/margins_min": 0.06963153928518295, "rewards/margins_std": 0.07364308834075928, "rewards/rejected": -0.13199666142463684, "step": 1550 }, { "epoch": 0.39, "grad_norm": 0.5078125, "learning_rate": 3.806620922955334e-07, "logits/chosen": 0.27010488510131836, "logits/rejected": 0.5648306012153625, "logps/chosen": -216.64016723632812, "logps/rejected": -225.8220977783203, "loss": 0.6313, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.008960830047726631, "rewards/margins": 0.14888259768486023, "rewards/margins_max": 0.2264583855867386, "rewards/margins_min": 0.07130680233240128, "rewards/margins_std": 0.1097087413072586, "rewards/rejected": -0.1578434407711029, "step": 1560 }, { "epoch": 0.4, "grad_norm": 0.396484375, "learning_rate": 3.7878461917235986e-07, "logits/chosen": 0.07090526819229126, "logits/rejected": 0.42235785722732544, "logps/chosen": -222.59506225585938, "logps/rejected": -230.4830322265625, "loss": 0.6285, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.005762896034866571, "rewards/margins": 0.13526439666748047, "rewards/margins_max": 0.19429653882980347, "rewards/margins_min": 0.07623222470283508, "rewards/margins_std": 0.08348408341407776, "rewards/rejected": -0.1410272866487503, "step": 1570 }, { "epoch": 0.4, "grad_norm": 0.419921875, "learning_rate": 3.768972065173303e-07, "logits/chosen": -0.006213393993675709, "logits/rejected": 0.1914052963256836, "logps/chosen": -186.24368286132812, "logps/rejected": -217.54421997070312, "loss": 0.6242, "rewards/accuracies": 0.9375, "rewards/chosen": -0.016393940895795822, "rewards/margins": 0.14732101559638977, "rewards/margins_max": 0.2080848515033722, "rewards/margins_min": 0.08655720949172974, "rewards/margins_std": 0.08593300729990005, "rewards/rejected": -0.16371497511863708, "step": 1580 }, { "epoch": 0.4, "grad_norm": 0.4296875, "learning_rate": 3.75e-07, "logits/chosen": 0.12396266311407089, "logits/rejected": 0.5255619287490845, "logps/chosen": -210.00131225585938, "logps/rejected": -199.7407989501953, "loss": 0.6285, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.002386572305113077, "rewards/margins": 0.12589840590953827, "rewards/margins_max": 0.18197199702262878, "rewards/margins_min": 0.06982481479644775, "rewards/margins_std": 0.07930003106594086, "rewards/rejected": -0.1282849758863449, "step": 1590 }, { "epoch": 0.4, "grad_norm": 0.4375, "learning_rate": 3.7309314604580954e-07, "logits/chosen": 0.09753980487585068, "logits/rejected": 0.460299015045166, "logps/chosen": -229.70718383789062, "logps/rejected": -244.01565551757812, "loss": 0.627, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0024743343237787485, "rewards/margins": 0.13972654938697815, "rewards/margins_max": 0.20034757256507874, "rewards/margins_min": 0.07910553365945816, "rewards/margins_std": 0.08573105186223984, "rewards/rejected": -0.13725218176841736, "step": 1600 }, { "epoch": 0.41, "grad_norm": 0.42578125, "learning_rate": 3.711767918247841e-07, "logits/chosen": 0.14264684915542603, "logits/rejected": 0.29743626713752747, "logps/chosen": -193.8705596923828, "logps/rejected": -256.1227722167969, "loss": 0.6177, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.0023327013477683067, "rewards/margins": 0.18759290874004364, "rewards/margins_max": 0.2546507716178894, "rewards/margins_min": 0.12053501605987549, "rewards/margins_std": 0.09483416378498077, "rewards/rejected": -0.18992561101913452, "step": 1610 }, { "epoch": 0.41, "grad_norm": 0.388671875, "learning_rate": 3.6925108524017446e-07, "logits/chosen": 0.2644719183444977, "logits/rejected": 0.40883946418762207, "logps/chosen": -188.80471801757812, "logps/rejected": -216.24081420898438, "loss": 0.6284, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.004674865864217281, "rewards/margins": 0.14658722281455994, "rewards/margins_max": 0.210347980260849, "rewards/margins_min": 0.08282643556594849, "rewards/margins_std": 0.09017135202884674, "rewards/rejected": -0.1512620747089386, "step": 1620 }, { "epoch": 0.41, "grad_norm": 0.451171875, "learning_rate": 3.6731617491704194e-07, "logits/chosen": 0.25777751207351685, "logits/rejected": 0.3899232745170593, "logps/chosen": -190.67169189453125, "logps/rejected": -219.5882110595703, "loss": 0.6247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0033685460221022367, "rewards/margins": 0.1534571498632431, "rewards/margins_max": 0.21911291778087616, "rewards/margins_min": 0.08780137449502945, "rewards/margins_std": 0.09285128116607666, "rewards/rejected": -0.1568256914615631, "step": 1630 }, { "epoch": 0.41, "grad_norm": 0.466796875, "learning_rate": 3.6537221019078794e-07, "logits/chosen": 0.12218916416168213, "logits/rejected": 0.6251595616340637, "logps/chosen": -229.55642700195312, "logps/rejected": -223.08151245117188, "loss": 0.6212, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.010088443756103516, "rewards/margins": 0.14635497331619263, "rewards/margins_max": 0.22119002044200897, "rewards/margins_min": 0.07151994109153748, "rewards/margins_std": 0.10583271831274033, "rewards/rejected": -0.15644341707229614, "step": 1640 }, { "epoch": 0.42, "grad_norm": 0.455078125, "learning_rate": 3.634193410956282e-07, "logits/chosen": 0.18125967681407928, "logits/rejected": 0.4244523048400879, "logps/chosen": -222.82119750976562, "logps/rejected": -237.586181640625, "loss": 0.6287, "rewards/accuracies": 0.9375, "rewards/chosen": -0.012667866423726082, "rewards/margins": 0.12644030153751373, "rewards/margins_max": 0.18080410361289978, "rewards/margins_min": 0.07207650691270828, "rewards/margins_std": 0.0768820196390152, "rewards/rejected": -0.13910818099975586, "step": 1650 }, { "epoch": 0.42, "grad_norm": 0.35546875, "learning_rate": 3.614577183530131e-07, "logits/chosen": 0.16024748980998993, "logits/rejected": 0.5728967785835266, "logps/chosen": -198.57723999023438, "logps/rejected": -216.2920379638672, "loss": 0.6288, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.015681054443120956, "rewards/margins": 0.12363159656524658, "rewards/margins_max": 0.1898706704378128, "rewards/margins_min": 0.057392507791519165, "rewards/margins_std": 0.09367620199918747, "rewards/rejected": -0.13931265473365784, "step": 1660 }, { "epoch": 0.42, "grad_norm": 0.5, "learning_rate": 3.594874933599949e-07, "logits/chosen": 0.11142469942569733, "logits/rejected": 0.40762606263160706, "logps/chosen": -197.2635955810547, "logps/rejected": -232.860595703125, "loss": 0.6177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.002786412835121155, "rewards/margins": 0.16630126535892487, "rewards/margins_max": 0.24014155566692352, "rewards/margins_min": 0.09246097505092621, "rewards/margins_std": 0.10442592948675156, "rewards/rejected": -0.16351483762264252, "step": 1670 }, { "epoch": 0.42, "grad_norm": 0.412109375, "learning_rate": 3.5750881817754343e-07, "logits/chosen": 0.161602184176445, "logits/rejected": 0.4142238199710846, "logps/chosen": -207.36044311523438, "logps/rejected": -231.387939453125, "loss": 0.628, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.0025730610359460115, "rewards/margins": 0.13976158201694489, "rewards/margins_max": 0.20258326828479767, "rewards/margins_min": 0.0769398957490921, "rewards/margins_std": 0.08884327113628387, "rewards/rejected": -0.14233465492725372, "step": 1680 }, { "epoch": 0.43, "grad_norm": 0.45703125, "learning_rate": 3.5552184551880987e-07, "logits/chosen": 0.1704731285572052, "logits/rejected": 0.553528904914856, "logps/chosen": -235.5526580810547, "logps/rejected": -258.637939453125, "loss": 0.625, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.004888847935944796, "rewards/margins": 0.1633296012878418, "rewards/margins_max": 0.24000592529773712, "rewards/margins_min": 0.08665328472852707, "rewards/margins_std": 0.1084367036819458, "rewards/rejected": -0.1682184487581253, "step": 1690 }, { "epoch": 0.43, "grad_norm": 0.45703125, "learning_rate": 3.5352672873734023e-07, "logits/chosen": 0.09287004917860031, "logits/rejected": 0.45186567306518555, "logps/chosen": -231.8868865966797, "logps/rejected": -242.67031860351562, "loss": 0.6225, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0019060630584135652, "rewards/margins": 0.14974726736545563, "rewards/margins_max": 0.2069702446460724, "rewards/margins_min": 0.09252426773309708, "rewards/margins_std": 0.08092552423477173, "rewards/rejected": -0.15165331959724426, "step": 1700 }, { "epoch": 0.43, "grad_norm": 0.439453125, "learning_rate": 3.515236218152401e-07, "logits/chosen": 0.15407420694828033, "logits/rejected": 0.6246528029441833, "logps/chosen": -222.09487915039062, "logps/rejected": -205.3578643798828, "loss": 0.6203, "rewards/accuracies": 0.9375, "rewards/chosen": -0.00915017444640398, "rewards/margins": 0.14655938744544983, "rewards/margins_max": 0.21362081170082092, "rewards/margins_min": 0.07949795573949814, "rewards/margins_std": 0.0948391705751419, "rewards/rejected": -0.15570956468582153, "step": 1710 }, { "epoch": 0.43, "grad_norm": 0.5078125, "learning_rate": 3.495126793512898e-07, "logits/chosen": 0.13046178221702576, "logits/rejected": 0.5708819627761841, "logps/chosen": -216.9124298095703, "logps/rejected": -213.406982421875, "loss": 0.6227, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.0025185600388795137, "rewards/margins": 0.14821955561637878, "rewards/margins_max": 0.20639987289905548, "rewards/margins_min": 0.09003923833370209, "rewards/margins_std": 0.08227940648794174, "rewards/rejected": -0.15073810517787933, "step": 1720 }, { "epoch": 0.44, "grad_norm": 0.46484375, "learning_rate": 3.474940565490129e-07, "logits/chosen": 0.21661195158958435, "logits/rejected": 0.42825061082839966, "logps/chosen": -222.0463104248047, "logps/rejected": -248.99337768554688, "loss": 0.6217, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.010068004950881004, "rewards/margins": 0.13712318241596222, "rewards/margins_max": 0.20092687010765076, "rewards/margins_min": 0.0733194500207901, "rewards/margins_std": 0.09023208916187286, "rewards/rejected": -0.14719118177890778, "step": 1730 }, { "epoch": 0.44, "grad_norm": 0.41015625, "learning_rate": 3.4546790920469776e-07, "logits/chosen": 0.12336118519306183, "logits/rejected": 0.5848550796508789, "logps/chosen": -216.853271484375, "logps/rejected": -224.48001098632812, "loss": 0.6279, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.003088909201323986, "rewards/margins": 0.13839459419250488, "rewards/margins_max": 0.19858820736408234, "rewards/margins_min": 0.07820095121860504, "rewards/margins_std": 0.08512666076421738, "rewards/rejected": -0.13530568778514862, "step": 1740 }, { "epoch": 0.44, "grad_norm": 0.49609375, "learning_rate": 3.4343439369537286e-07, "logits/chosen": 0.03644455224275589, "logits/rejected": 0.3748028576374054, "logps/chosen": -199.61643981933594, "logps/rejected": -217.1728973388672, "loss": 0.6235, "rewards/accuracies": 0.9375, "rewards/chosen": 0.002527992706745863, "rewards/margins": 0.1366109549999237, "rewards/margins_max": 0.19345729053020477, "rewards/margins_min": 0.07976466417312622, "rewards/margins_std": 0.08039282262325287, "rewards/rejected": -0.13408295810222626, "step": 1750 }, { "epoch": 0.44, "grad_norm": 0.4375, "learning_rate": 3.413936669667381e-07, "logits/chosen": 0.16939976811408997, "logits/rejected": 0.5194543600082397, "logps/chosen": -208.93807983398438, "logps/rejected": -214.12088012695312, "loss": 0.63, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.017858784645795822, "rewards/margins": 0.13456907868385315, "rewards/margins_max": 0.19640877842903137, "rewards/margins_min": 0.07272940874099731, "rewards/margins_std": 0.08745451271533966, "rewards/rejected": -0.15242788195610046, "step": 1760 }, { "epoch": 0.45, "grad_norm": 0.52734375, "learning_rate": 3.3934588652105156e-07, "logits/chosen": 0.07584407180547714, "logits/rejected": 0.3485907018184662, "logps/chosen": -200.30685424804688, "logps/rejected": -230.2808380126953, "loss": 0.6216, "rewards/accuracies": 1.0, "rewards/chosen": 0.0006271469173952937, "rewards/margins": 0.1628343164920807, "rewards/margins_max": 0.23816709220409393, "rewards/margins_min": 0.08750150352716446, "rewards/margins_std": 0.10653666406869888, "rewards/rejected": -0.16220712661743164, "step": 1770 }, { "epoch": 0.45, "grad_norm": 0.49609375, "learning_rate": 3.3729121040497367e-07, "logits/chosen": 0.10274849832057953, "logits/rejected": 0.3027471601963043, "logps/chosen": -197.2198486328125, "logps/rejected": -214.23056030273438, "loss": 0.6339, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.020425861701369286, "rewards/margins": 0.1282966136932373, "rewards/margins_max": 0.19665293395519257, "rewards/margins_min": 0.05994029715657234, "rewards/margins_std": 0.09667042642831802, "rewards/rejected": -0.14872248470783234, "step": 1780 }, { "epoch": 0.45, "grad_norm": 0.4609375, "learning_rate": 3.3522979719736923e-07, "logits/chosen": 0.2011840045452118, "logits/rejected": 0.5494887232780457, "logps/chosen": -202.08560180664062, "logps/rejected": -204.0021514892578, "loss": 0.6328, "rewards/accuracies": 0.9375, "rewards/chosen": -0.012764686718583107, "rewards/margins": 0.11872999370098114, "rewards/margins_max": 0.1699032038450241, "rewards/margins_min": 0.06755679100751877, "rewards/margins_std": 0.07236983627080917, "rewards/rejected": -0.1314946711063385, "step": 1790 }, { "epoch": 0.45, "grad_norm": 0.3984375, "learning_rate": 3.3316180599706835e-07, "logits/chosen": 0.25245994329452515, "logits/rejected": 0.7181536555290222, "logps/chosen": -203.9959716796875, "logps/rejected": -217.80441284179688, "loss": 0.6242, "rewards/accuracies": 0.9375, "rewards/chosen": -0.006771632935851812, "rewards/margins": 0.14376261830329895, "rewards/margins_max": 0.20835721492767334, "rewards/margins_min": 0.07916799187660217, "rewards/margins_std": 0.09135057032108307, "rewards/rejected": -0.1505342423915863, "step": 1800 }, { "epoch": 0.46, "grad_norm": 0.412109375, "learning_rate": 3.310873964105872e-07, "logits/chosen": 0.14832933247089386, "logits/rejected": 0.4920567572116852, "logps/chosen": -221.96011352539062, "logps/rejected": -249.03427124023438, "loss": 0.6202, "rewards/accuracies": 1.0, "rewards/chosen": -0.010531350038945675, "rewards/margins": 0.1801469624042511, "rewards/margins_max": 0.24239560961723328, "rewards/margins_min": 0.11789830029010773, "rewards/margins_std": 0.08803291618824005, "rewards/rejected": -0.19067831337451935, "step": 1810 }, { "epoch": 0.46, "grad_norm": 0.408203125, "learning_rate": 3.290067285398099e-07, "logits/chosen": 0.20503363013267517, "logits/rejected": 0.44941800832748413, "logps/chosen": -214.3133087158203, "logps/rejected": -250.54025268554688, "loss": 0.6259, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0005943391588516533, "rewards/margins": 0.1370176076889038, "rewards/margins_max": 0.19939222931861877, "rewards/margins_min": 0.07464297860860825, "rewards/margins_std": 0.0882110595703125, "rewards/rejected": -0.13761194050312042, "step": 1820 }, { "epoch": 0.46, "grad_norm": 0.478515625, "learning_rate": 3.269199629696318e-07, "logits/chosen": 0.1035645380616188, "logits/rejected": 0.4018480181694031, "logps/chosen": -194.89292907714844, "logps/rejected": -207.0443878173828, "loss": 0.6233, "rewards/accuracies": 0.9375, "rewards/chosen": -0.005496837664395571, "rewards/margins": 0.13553115725517273, "rewards/margins_max": 0.21054013073444366, "rewards/margins_min": 0.0605221763253212, "rewards/margins_std": 0.10607870668172836, "rewards/rejected": -0.14102798700332642, "step": 1830 }, { "epoch": 0.46, "grad_norm": 0.4765625, "learning_rate": 3.2482726075556545e-07, "logits/chosen": 0.2008371353149414, "logits/rejected": 0.43435138463974, "logps/chosen": -222.0498504638672, "logps/rejected": -257.41876220703125, "loss": 0.6257, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01451225858181715, "rewards/margins": 0.1478424370288849, "rewards/margins_max": 0.2117810696363449, "rewards/margins_min": 0.08390381187200546, "rewards/margins_std": 0.0904228687286377, "rewards/rejected": -0.16235469281673431, "step": 1840 }, { "epoch": 0.47, "grad_norm": 0.431640625, "learning_rate": 3.227287834113107e-07, "logits/chosen": 0.17866966128349304, "logits/rejected": 0.6629277467727661, "logps/chosen": -250.87997436523438, "logps/rejected": -231.09695434570312, "loss": 0.6303, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0061906189657747746, "rewards/margins": 0.12782703340053558, "rewards/margins_max": 0.1987806260585785, "rewards/margins_min": 0.05687345191836357, "rewards/margins_std": 0.10034352540969849, "rewards/rejected": -0.134017676115036, "step": 1850 }, { "epoch": 0.47, "grad_norm": 0.43359375, "learning_rate": 3.2062469289628897e-07, "logits/chosen": 0.14636272192001343, "logits/rejected": 0.557874321937561, "logps/chosen": -214.5691680908203, "logps/rejected": -208.2883758544922, "loss": 0.6274, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.00484920060262084, "rewards/margins": 0.13341280817985535, "rewards/margins_max": 0.19893401861190796, "rewards/margins_min": 0.06789158284664154, "rewards/margins_std": 0.09266099333763123, "rewards/rejected": -0.13826200366020203, "step": 1860 }, { "epoch": 0.47, "grad_norm": 0.4453125, "learning_rate": 3.185151516031434e-07, "logits/chosen": -0.03240922838449478, "logits/rejected": 0.3734849691390991, "logps/chosen": -212.96359252929688, "logps/rejected": -236.880126953125, "loss": 0.6162, "rewards/accuracies": 1.0, "rewards/chosen": 0.0027457866817712784, "rewards/margins": 0.16785992681980133, "rewards/margins_max": 0.2289036512374878, "rewards/margins_min": 0.10681621730327606, "rewards/margins_std": 0.08632884919643402, "rewards/rejected": -0.1651141345500946, "step": 1870 }, { "epoch": 0.47, "grad_norm": 0.423828125, "learning_rate": 3.164003223452055e-07, "logits/chosen": 0.0975664034485817, "logits/rejected": 0.36212414503097534, "logps/chosen": -202.66549682617188, "logps/rejected": -209.4398651123047, "loss": 0.6255, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.003101496724411845, "rewards/margins": 0.15433330833911896, "rewards/margins_max": 0.22387921810150146, "rewards/margins_min": 0.08478739112615585, "rewards/margins_std": 0.09835276752710342, "rewards/rejected": -0.15743482112884521, "step": 1880 }, { "epoch": 0.48, "grad_norm": 0.484375, "learning_rate": 3.14280368343929e-07, "logits/chosen": 0.11557137966156006, "logits/rejected": 0.5097672939300537, "logps/chosen": -207.88168334960938, "logps/rejected": -230.6980438232422, "loss": 0.6281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.005800972692668438, "rewards/margins": 0.15571673214435577, "rewards/margins_max": 0.22469434142112732, "rewards/margins_min": 0.08673910796642303, "rewards/margins_std": 0.09754908829927444, "rewards/rejected": -0.16151770949363708, "step": 1890 }, { "epoch": 0.48, "grad_norm": 0.46484375, "learning_rate": 3.12155453216293e-07, "logits/chosen": 0.1869688332080841, "logits/rejected": 0.41782325506210327, "logps/chosen": -228.55056762695312, "logps/rejected": -251.06576538085938, "loss": 0.6186, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.013970062136650085, "rewards/margins": 0.17064602673053741, "rewards/margins_max": 0.25282150506973267, "rewards/margins_min": 0.08847051858901978, "rewards/margins_std": 0.11621370166540146, "rewards/rejected": -0.1846160888671875, "step": 1900 }, { "epoch": 0.48, "grad_norm": 0.59765625, "learning_rate": 3.1002574096217377e-07, "logits/chosen": 0.1901499629020691, "logits/rejected": 0.4289619028568268, "logps/chosen": -188.7092742919922, "logps/rejected": -201.12625122070312, "loss": 0.6361, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.02309594675898552, "rewards/margins": 0.1068769097328186, "rewards/margins_max": 0.15343698859214783, "rewards/margins_min": 0.06031683087348938, "rewards/margins_std": 0.06584589183330536, "rewards/rejected": -0.12997284531593323, "step": 1910 }, { "epoch": 0.48, "grad_norm": 0.458984375, "learning_rate": 3.0789139595168717e-07, "logits/chosen": 0.10028757899999619, "logits/rejected": 0.4567994177341461, "logps/chosen": -206.2977752685547, "logps/rejected": -217.33621215820312, "loss": 0.6253, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.008093327283859253, "rewards/margins": 0.15557749569416046, "rewards/margins_max": 0.22069554030895233, "rewards/margins_min": 0.090459443628788, "rewards/margins_std": 0.09209083020687103, "rewards/rejected": -0.16367082297801971, "step": 1920 }, { "epoch": 0.49, "grad_norm": 0.396484375, "learning_rate": 3.057525829125032e-07, "logits/chosen": 0.11732654273509979, "logits/rejected": 0.41394370794296265, "logps/chosen": -217.79763793945312, "logps/rejected": -224.2724151611328, "loss": 0.6217, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0056742457672953606, "rewards/margins": 0.15555034577846527, "rewards/margins_max": 0.2168913632631302, "rewards/margins_min": 0.09420934319496155, "rewards/margins_std": 0.08674929291009903, "rewards/rejected": -0.1612246036529541, "step": 1930 }, { "epoch": 0.49, "grad_norm": 0.41015625, "learning_rate": 3.0360946691713157e-07, "logits/chosen": 0.12568379938602448, "logits/rejected": 0.3682595193386078, "logps/chosen": -211.91970825195312, "logps/rejected": -226.7738037109375, "loss": 0.6328, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.007700534071773291, "rewards/margins": 0.14159148931503296, "rewards/margins_max": 0.19709011912345886, "rewards/margins_min": 0.08609285950660706, "rewards/margins_std": 0.07848692685365677, "rewards/rejected": -0.14929203689098358, "step": 1940 }, { "epoch": 0.49, "grad_norm": 0.419921875, "learning_rate": 3.0146221337018255e-07, "logits/chosen": 0.15395836532115936, "logits/rejected": 0.44430074095726013, "logps/chosen": -235.67892456054688, "logps/rejected": -231.19473266601562, "loss": 0.6308, "rewards/accuracies": 0.9375, "rewards/chosen": -0.025711605325341225, "rewards/margins": 0.11922027915716171, "rewards/margins_max": 0.17725497484207153, "rewards/margins_min": 0.06118558719754219, "rewards/margins_std": 0.08207345753908157, "rewards/rejected": -0.1449318826198578, "step": 1950 }, { "epoch": 0.49, "grad_norm": 0.3671875, "learning_rate": 2.9931098799560023e-07, "logits/chosen": 0.1437145471572876, "logits/rejected": 0.6458569169044495, "logps/chosen": -221.72555541992188, "logps/rejected": -233.5401153564453, "loss": 0.6267, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.008342169225215912, "rewards/margins": 0.13210944831371307, "rewards/margins_max": 0.18978825211524963, "rewards/margins_min": 0.0744306668639183, "rewards/margins_std": 0.08157012611627579, "rewards/rejected": -0.14045162498950958, "step": 1960 }, { "epoch": 0.5, "grad_norm": 0.494140625, "learning_rate": 2.971559568238724e-07, "logits/chosen": 0.10921628773212433, "logits/rejected": 0.4569002091884613, "logps/chosen": -233.3131561279297, "logps/rejected": -221.300537109375, "loss": 0.6314, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.01218502875417471, "rewards/margins": 0.1277925670146942, "rewards/margins_max": 0.19912521541118622, "rewards/margins_min": 0.05645991116762161, "rewards/margins_std": 0.10087960958480835, "rewards/rejected": -0.1399776041507721, "step": 1970 }, { "epoch": 0.5, "grad_norm": 0.40625, "learning_rate": 2.9499728617921617e-07, "logits/chosen": 0.19204050302505493, "logits/rejected": 0.4214607775211334, "logps/chosen": -234.29531860351562, "logps/rejected": -251.43264770507812, "loss": 0.6249, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01504532527178526, "rewards/margins": 0.15309670567512512, "rewards/margins_max": 0.2328995168209076, "rewards/margins_min": 0.07329384982585907, "rewards/margins_std": 0.11285825818777084, "rewards/rejected": -0.16814202070236206, "step": 1980 }, { "epoch": 0.5, "grad_norm": 0.408203125, "learning_rate": 2.9283514266674164e-07, "logits/chosen": 0.03331400826573372, "logits/rejected": 0.42242470383644104, "logps/chosen": -241.34912109375, "logps/rejected": -230.1846923828125, "loss": 0.6234, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.009459172375500202, "rewards/margins": 0.14525583386421204, "rewards/margins_max": 0.21799802780151367, "rewards/margins_min": 0.07251361012458801, "rewards/margins_std": 0.10287301242351532, "rewards/rejected": -0.15471498668193817, "step": 1990 }, { "epoch": 0.5, "grad_norm": 0.384765625, "learning_rate": 2.90669693159593e-07, "logits/chosen": 0.1895698606967926, "logits/rejected": 0.5809098482131958, "logps/chosen": -232.75515747070312, "logps/rejected": -226.44052124023438, "loss": 0.629, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02089577354490757, "rewards/margins": 0.1445704698562622, "rewards/margins_max": 0.20795254409313202, "rewards/margins_min": 0.08118841797113419, "rewards/margins_std": 0.08963577449321747, "rewards/rejected": -0.16546623408794403, "step": 2000 }, { "epoch": 0.51, "grad_norm": 0.41796875, "learning_rate": 2.885011047860694e-07, "logits/chosen": 0.169859379529953, "logits/rejected": 0.6314293146133423, "logps/chosen": -234.74581909179688, "logps/rejected": -229.83444213867188, "loss": 0.6316, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.012991060502827168, "rewards/margins": 0.13140568137168884, "rewards/margins_max": 0.19197922945022583, "rewards/margins_min": 0.07083216309547424, "rewards/margins_std": 0.08566389977931976, "rewards/rejected": -0.14439675211906433, "step": 2010 }, { "epoch": 0.51, "grad_norm": 0.41796875, "learning_rate": 2.86329544916726e-07, "logits/chosen": 0.08864578604698181, "logits/rejected": 0.5466204881668091, "logps/chosen": -210.2231903076172, "logps/rejected": -234.4009246826172, "loss": 0.6185, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0054148463532328606, "rewards/margins": 0.15986548364162445, "rewards/margins_max": 0.24291566014289856, "rewards/margins_min": 0.07681533694267273, "rewards/margins_std": 0.11745065450668335, "rewards/rejected": -0.16528034210205078, "step": 2020 }, { "epoch": 0.51, "grad_norm": 0.50390625, "learning_rate": 2.841551811514567e-07, "logits/chosen": 0.3782397508621216, "logits/rejected": 0.7252362370491028, "logps/chosen": -239.1989288330078, "logps/rejected": -259.85247802734375, "loss": 0.6264, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.018043052405118942, "rewards/margins": 0.1520715057849884, "rewards/margins_max": 0.23350951075553894, "rewards/margins_min": 0.07063348591327667, "rewards/margins_std": 0.11517073959112167, "rewards/rejected": -0.17011454701423645, "step": 2030 }, { "epoch": 0.51, "grad_norm": 0.39453125, "learning_rate": 2.819781813065586e-07, "logits/chosen": 0.14642193913459778, "logits/rejected": 0.4363443851470947, "logps/chosen": -222.1160125732422, "logps/rejected": -231.21261596679688, "loss": 0.6291, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.02110998146235943, "rewards/margins": 0.12621168792247772, "rewards/margins_max": 0.18353228271007538, "rewards/margins_min": 0.06889110803604126, "rewards/margins_std": 0.08106354624032974, "rewards/rejected": -0.1473216712474823, "step": 2040 }, { "epoch": 0.52, "grad_norm": 0.44140625, "learning_rate": 2.7979871340178e-07, "logits/chosen": 0.2349119633436203, "logits/rejected": 0.44057101011276245, "logps/chosen": -213.79025268554688, "logps/rejected": -249.9253692626953, "loss": 0.6314, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.017500776797533035, "rewards/margins": 0.14498502016067505, "rewards/margins_max": 0.2182203084230423, "rewards/margins_min": 0.0717497318983078, "rewards/margins_std": 0.10357034206390381, "rewards/rejected": -0.16248580813407898, "step": 2050 }, { "epoch": 0.52, "grad_norm": 0.43359375, "learning_rate": 2.77616945647353e-07, "logits/chosen": 0.13186123967170715, "logits/rejected": 0.5581918954849243, "logps/chosen": -250.9673309326172, "logps/rejected": -255.6508331298828, "loss": 0.6251, "rewards/accuracies": 0.9375, "rewards/chosen": -0.013809965923428535, "rewards/margins": 0.1398267298936844, "rewards/margins_max": 0.19955222308635712, "rewards/margins_min": 0.08010122179985046, "rewards/margins_std": 0.08446462452411652, "rewards/rejected": -0.15363669395446777, "step": 2060 }, { "epoch": 0.52, "grad_norm": 0.482421875, "learning_rate": 2.7543304643101077e-07, "logits/chosen": 0.3025481104850769, "logits/rejected": 0.5707412958145142, "logps/chosen": -192.90701293945312, "logps/rejected": -219.3175048828125, "loss": 0.6278, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.003879230935126543, "rewards/margins": 0.14587515592575073, "rewards/margins_max": 0.2113659679889679, "rewards/margins_min": 0.08038434386253357, "rewards/margins_std": 0.09261800348758698, "rewards/rejected": -0.1497543752193451, "step": 2070 }, { "epoch": 0.52, "grad_norm": 0.46484375, "learning_rate": 2.7324718430499183e-07, "logits/chosen": 0.142515629529953, "logits/rejected": 0.5404806137084961, "logps/chosen": -191.1787872314453, "logps/rejected": -201.55758666992188, "loss": 0.6256, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.00013065226085018367, "rewards/margins": 0.15328006446361542, "rewards/margins_max": 0.22164976596832275, "rewards/margins_min": 0.08491034060716629, "rewards/margins_std": 0.0966893881559372, "rewards/rejected": -0.15341070294380188, "step": 2080 }, { "epoch": 0.53, "grad_norm": 0.44140625, "learning_rate": 2.7105952797303076e-07, "logits/chosen": 0.3479451835155487, "logits/rejected": 0.5935274362564087, "logps/chosen": -204.52516174316406, "logps/rejected": -225.3500213623047, "loss": 0.619, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.010690455324947834, "rewards/margins": 0.15082670748233795, "rewards/margins_max": 0.20945756137371063, "rewards/margins_min": 0.09219582378864288, "rewards/margins_std": 0.08291657269001007, "rewards/rejected": -0.1615171581506729, "step": 2090 }, { "epoch": 0.53, "grad_norm": 0.466796875, "learning_rate": 2.6887024627733827e-07, "logits/chosen": 0.08297687768936157, "logits/rejected": 0.4158407747745514, "logps/chosen": -237.61325073242188, "logps/rejected": -236.90185546875, "loss": 0.6203, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.012562384828925133, "rewards/margins": 0.15429750084877014, "rewards/margins_max": 0.220596581697464, "rewards/margins_min": 0.08799847960472107, "rewards/margins_std": 0.09376100450754166, "rewards/rejected": -0.16685989499092102, "step": 2100 }, { "epoch": 0.53, "grad_norm": 0.46484375, "learning_rate": 2.666795081855699e-07, "logits/chosen": 0.09937963634729385, "logits/rejected": 0.45777082443237305, "logps/chosen": -245.4235382080078, "logps/rejected": -227.5722198486328, "loss": 0.6319, "rewards/accuracies": 0.9375, "rewards/chosen": -0.027121981605887413, "rewards/margins": 0.10453277826309204, "rewards/margins_max": 0.154584139585495, "rewards/margins_min": 0.05448141694068909, "rewards/margins_std": 0.07078330963850021, "rewards/rejected": -0.1316547691822052, "step": 2110 }, { "epoch": 0.53, "grad_norm": 0.47265625, "learning_rate": 2.6448748277778486e-07, "logits/chosen": 0.1422795057296753, "logits/rejected": 0.47126078605651855, "logps/chosen": -237.8463897705078, "logps/rejected": -243.663330078125, "loss": 0.6226, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.007196065969765186, "rewards/margins": 0.16027355194091797, "rewards/margins_max": 0.2328023612499237, "rewards/margins_min": 0.08774472773075104, "rewards/margins_std": 0.10257124900817871, "rewards/rejected": -0.16746962070465088, "step": 2120 }, { "epoch": 0.54, "grad_norm": 0.44921875, "learning_rate": 2.6229433923339693e-07, "logits/chosen": 0.13821318745613098, "logits/rejected": 0.5182450413703918, "logps/chosen": -209.3953857421875, "logps/rejected": -205.0969696044922, "loss": 0.6279, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.012799051590263844, "rewards/margins": 0.12856367230415344, "rewards/margins_max": 0.18207482993602753, "rewards/margins_min": 0.07505248486995697, "rewards/margins_std": 0.07567623257637024, "rewards/rejected": -0.14136271178722382, "step": 2130 }, { "epoch": 0.54, "grad_norm": 0.458984375, "learning_rate": 2.6010024681811715e-07, "logits/chosen": 0.1384851336479187, "logits/rejected": 0.5728726387023926, "logps/chosen": -232.31570434570312, "logps/rejected": -245.65493774414062, "loss": 0.6244, "rewards/accuracies": 0.9375, "rewards/chosen": -0.007313706912100315, "rewards/margins": 0.1552475243806839, "rewards/margins_max": 0.21772119402885437, "rewards/margins_min": 0.09277385473251343, "rewards/margins_std": 0.08835110813379288, "rewards/rejected": -0.16256123781204224, "step": 2140 }, { "epoch": 0.54, "grad_norm": 0.431640625, "learning_rate": 2.579053748708897e-07, "logits/chosen": 0.05881236866116524, "logits/rejected": 0.5870357751846313, "logps/chosen": -222.7379608154297, "logps/rejected": -209.11807250976562, "loss": 0.626, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.005922857206314802, "rewards/margins": 0.14325572550296783, "rewards/margins_max": 0.20521852374076843, "rewards/margins_min": 0.08129292726516724, "rewards/margins_std": 0.08762861788272858, "rewards/rejected": -0.14917859435081482, "step": 2150 }, { "epoch": 0.54, "grad_norm": 0.412109375, "learning_rate": 2.5570989279082287e-07, "logits/chosen": 0.042133577167987823, "logits/rejected": 0.3915202021598816, "logps/chosen": -219.47879028320312, "logps/rejected": -245.1941680908203, "loss": 0.6219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.00811375305056572, "rewards/margins": 0.16366888582706451, "rewards/margins_max": 0.23381967842578888, "rewards/margins_min": 0.09351803362369537, "rewards/margins_std": 0.09920825064182281, "rewards/rejected": -0.17178264260292053, "step": 2160 }, { "epoch": 0.55, "grad_norm": 0.41796875, "learning_rate": 2.535139700241147e-07, "logits/chosen": 0.13664479553699493, "logits/rejected": 0.4714391827583313, "logps/chosen": -241.48922729492188, "logps/rejected": -236.61392211914062, "loss": 0.6221, "rewards/accuracies": 0.9375, "rewards/chosen": -0.007417517714202404, "rewards/margins": 0.14727117121219635, "rewards/margins_max": 0.22314317524433136, "rewards/margins_min": 0.07139919698238373, "rewards/margins_std": 0.10729918628931046, "rewards/rejected": -0.15468870103359222, "step": 2170 }, { "epoch": 0.55, "grad_norm": 0.44921875, "learning_rate": 2.51317776050975e-07, "logits/chosen": 0.1303066909313202, "logits/rejected": 0.35825151205062866, "logps/chosen": -195.5194549560547, "logps/rejected": -243.30276489257812, "loss": 0.6174, "rewards/accuracies": 0.9375, "rewards/chosen": -0.005396073218435049, "rewards/margins": 0.1572524756193161, "rewards/margins_max": 0.22743459045886993, "rewards/margins_min": 0.08707039058208466, "rewards/margins_std": 0.09925246238708496, "rewards/rejected": -0.16264855861663818, "step": 2180 }, { "epoch": 0.55, "grad_norm": 0.40234375, "learning_rate": 2.4912148037254533e-07, "logits/chosen": -0.10321170091629028, "logits/rejected": 0.1964869350194931, "logps/chosen": -227.9014892578125, "logps/rejected": -248.35763549804688, "loss": 0.6188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.019046422094106674, "rewards/margins": 0.1504187434911728, "rewards/margins_max": 0.22531433403491974, "rewards/margins_min": 0.07552315294742584, "rewards/margins_std": 0.10591837018728256, "rewards/rejected": -0.16946516931056976, "step": 2190 }, { "epoch": 0.55, "grad_norm": 0.5, "learning_rate": 2.4692525249781676e-07, "logits/chosen": 0.23636607825756073, "logits/rejected": 0.5955804586410522, "logps/chosen": -241.3157501220703, "logps/rejected": -221.0636444091797, "loss": 0.6236, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.016589036211371422, "rewards/margins": 0.16155707836151123, "rewards/margins_max": 0.22862081229686737, "rewards/margins_min": 0.0944933295249939, "rewards/margins_std": 0.09484247118234634, "rewards/rejected": -0.1781461089849472, "step": 2200 }, { "epoch": 0.56, "grad_norm": 0.416015625, "learning_rate": 2.447292619305473e-07, "logits/chosen": 0.1862422525882721, "logits/rejected": 0.5260340571403503, "logps/chosen": -202.081298828125, "logps/rejected": -230.32275390625, "loss": 0.6161, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0024939021095633507, "rewards/margins": 0.16634412109851837, "rewards/margins_max": 0.24052074551582336, "rewards/margins_min": 0.09216747432947159, "rewards/margins_std": 0.10490158945322037, "rewards/rejected": -0.1688380092382431, "step": 2210 }, { "epoch": 0.56, "grad_norm": 0.44921875, "learning_rate": 2.425336781561796e-07, "logits/chosen": 0.06229435279965401, "logits/rejected": 0.35382336378097534, "logps/chosen": -203.84588623046875, "logps/rejected": -219.60073852539062, "loss": 0.6276, "rewards/accuracies": 0.9375, "rewards/chosen": -0.020327067002654076, "rewards/margins": 0.1277579963207245, "rewards/margins_max": 0.18487441539764404, "rewards/margins_min": 0.07064155489206314, "rewards/margins_std": 0.08077484369277954, "rewards/rejected": -0.14808505773544312, "step": 2220 }, { "epoch": 0.56, "grad_norm": 0.419921875, "learning_rate": 2.403386706287605e-07, "logits/chosen": 0.24842128157615662, "logits/rejected": 0.5870705842971802, "logps/chosen": -191.80935668945312, "logps/rejected": -215.96127319335938, "loss": 0.6217, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.011121408082544804, "rewards/margins": 0.15986418724060059, "rewards/margins_max": 0.24271711707115173, "rewards/margins_min": 0.07701121270656586, "rewards/margins_std": 0.11717178672552109, "rewards/rejected": -0.170985609292984, "step": 2230 }, { "epoch": 0.56, "grad_norm": 0.3828125, "learning_rate": 2.381444087578621e-07, "logits/chosen": 0.1613490730524063, "logits/rejected": 0.5991760492324829, "logps/chosen": -229.7702178955078, "logps/rejected": -240.04299926757812, "loss": 0.6296, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01214107871055603, "rewards/margins": 0.13114643096923828, "rewards/margins_max": 0.19676503539085388, "rewards/margins_min": 0.06552781164646149, "rewards/margins_std": 0.09279872477054596, "rewards/rejected": -0.14328749477863312, "step": 2240 }, { "epoch": 0.57, "grad_norm": 0.5078125, "learning_rate": 2.359510618955073e-07, "logits/chosen": 0.13083569705486298, "logits/rejected": 0.42787185311317444, "logps/chosen": -243.2305450439453, "logps/rejected": -244.7343292236328, "loss": 0.629, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.009732378646731377, "rewards/margins": 0.13312295079231262, "rewards/margins_max": 0.19658634066581726, "rewards/margins_min": 0.06965956836938858, "rewards/margins_std": 0.08975076675415039, "rewards/rejected": -0.14285531640052795, "step": 2250 }, { "epoch": 0.57, "grad_norm": 0.490234375, "learning_rate": 2.3375879932309908e-07, "logits/chosen": 0.09023120254278183, "logits/rejected": 0.4491159915924072, "logps/chosen": -221.2506103515625, "logps/rejected": -248.62643432617188, "loss": 0.6169, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.004972636234015226, "rewards/margins": 0.16423256695270538, "rewards/margins_max": 0.21688561141490936, "rewards/margins_min": 0.1115795224905014, "rewards/margins_std": 0.0744626373052597, "rewards/rejected": -0.1692052036523819, "step": 2260 }, { "epoch": 0.57, "grad_norm": 0.37109375, "learning_rate": 2.3156779023835524e-07, "logits/chosen": 0.2503266930580139, "logits/rejected": 0.43795377016067505, "logps/chosen": -203.3564910888672, "logps/rejected": -243.43563842773438, "loss": 0.6217, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.011418869718909264, "rewards/margins": 0.15341442823410034, "rewards/margins_max": 0.22445444762706757, "rewards/margins_min": 0.08237439393997192, "rewards/margins_std": 0.1004657968878746, "rewards/rejected": -0.16483330726623535, "step": 2270 }, { "epoch": 0.57, "grad_norm": 0.431640625, "learning_rate": 2.2937820374225044e-07, "logits/chosen": 0.09318618476390839, "logits/rejected": 0.34944021701812744, "logps/chosen": -208.92123413085938, "logps/rejected": -221.4180145263672, "loss": 0.6273, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.016211625188589096, "rewards/margins": 0.12588536739349365, "rewards/margins_max": 0.20268261432647705, "rewards/margins_min": 0.04908811300992966, "rewards/margins_std": 0.1086077094078064, "rewards/rejected": -0.14209699630737305, "step": 2280 }, { "epoch": 0.58, "grad_norm": 0.51171875, "learning_rate": 2.2719020882596427e-07, "logits/chosen": 0.1537734419107437, "logits/rejected": 0.46534472703933716, "logps/chosen": -228.1218719482422, "logps/rejected": -241.26083374023438, "loss": 0.6237, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.020388314500451088, "rewards/margins": 0.15441298484802246, "rewards/margins_max": 0.23009231686592102, "rewards/margins_min": 0.0787336677312851, "rewards/margins_std": 0.10702673345804214, "rewards/rejected": -0.174801304936409, "step": 2290 }, { "epoch": 0.58, "grad_norm": 0.435546875, "learning_rate": 2.2500397435783924e-07, "logits/chosen": 0.09081272035837173, "logits/rejected": 0.47422558069229126, "logps/chosen": -238.5783233642578, "logps/rejected": -222.4462127685547, "loss": 0.6283, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.023588668555021286, "rewards/margins": 0.1449275016784668, "rewards/margins_max": 0.2159515917301178, "rewards/margins_min": 0.0739034041762352, "rewards/margins_std": 0.10044324398040771, "rewards/rejected": -0.16851617395877838, "step": 2300 }, { "epoch": 0.58, "grad_norm": 0.59375, "learning_rate": 2.2281966907034733e-07, "logits/chosen": 0.11909373849630356, "logits/rejected": 0.42936739325523376, "logps/chosen": -230.6641387939453, "logps/rejected": -226.5247344970703, "loss": 0.623, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0029804124496877193, "rewards/margins": 0.1596844494342804, "rewards/margins_max": 0.2344544231891632, "rewards/margins_min": 0.08491448312997818, "rewards/margins_std": 0.10574068874120712, "rewards/rejected": -0.16266486048698425, "step": 2310 }, { "epoch": 0.58, "grad_norm": 0.48046875, "learning_rate": 2.2063746154706724e-07, "logits/chosen": 0.2687230706214905, "logits/rejected": 0.4621661305427551, "logps/chosen": -194.59451293945312, "logps/rejected": -234.8601837158203, "loss": 0.6146, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.001996729988604784, "rewards/margins": 0.14776286482810974, "rewards/margins_max": 0.22188358008861542, "rewards/margins_min": 0.07364213466644287, "rewards/margins_std": 0.10482251644134521, "rewards/rejected": -0.14975957572460175, "step": 2320 }, { "epoch": 0.59, "grad_norm": 0.486328125, "learning_rate": 2.1845752020967333e-07, "logits/chosen": 0.01649840548634529, "logits/rejected": 0.33088165521621704, "logps/chosen": -212.7630157470703, "logps/rejected": -221.63339233398438, "loss": 0.6211, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.017026465386152267, "rewards/margins": 0.15200331807136536, "rewards/margins_max": 0.21017222106456757, "rewards/margins_min": 0.09383439272642136, "rewards/margins_std": 0.08226325362920761, "rewards/rejected": -0.16902975738048553, "step": 2330 }, { "epoch": 0.59, "grad_norm": 0.412109375, "learning_rate": 2.16280013304937e-07, "logits/chosen": 0.16655944287776947, "logits/rejected": 0.4885994791984558, "logps/chosen": -201.47377014160156, "logps/rejected": -217.09274291992188, "loss": 0.6269, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.007077026180922985, "rewards/margins": 0.1284123957157135, "rewards/margins_max": 0.18359172344207764, "rewards/margins_min": 0.07323308289051056, "rewards/margins_std": 0.07803533971309662, "rewards/rejected": -0.13548941910266876, "step": 2340 }, { "epoch": 0.59, "grad_norm": 1.9296875, "learning_rate": 2.1410510889174109e-07, "logits/chosen": 0.142364963889122, "logits/rejected": 0.4045885503292084, "logps/chosen": -251.03823852539062, "logps/rejected": -238.9752655029297, "loss": 0.6209, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.010029856115579605, "rewards/margins": 0.167240172624588, "rewards/margins_max": 0.22913317382335663, "rewards/margins_min": 0.10534713417291641, "rewards/margins_std": 0.08752994984388351, "rewards/rejected": -0.17727001011371613, "step": 2350 }, { "epoch": 0.59, "grad_norm": 0.51171875, "learning_rate": 2.119329748281098e-07, "logits/chosen": 0.12697362899780273, "logits/rejected": 0.5167545080184937, "logps/chosen": -191.67379760742188, "logps/rejected": -210.9341278076172, "loss": 0.6289, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.009999660775065422, "rewards/margins": 0.14249001443386078, "rewards/margins_max": 0.20294538140296936, "rewards/margins_min": 0.08203467726707458, "rewards/margins_std": 0.08549676090478897, "rewards/rejected": -0.15248967707157135, "step": 2360 }, { "epoch": 0.6, "grad_norm": 0.4609375, "learning_rate": 2.0976377875825282e-07, "logits/chosen": 0.10468962043523788, "logits/rejected": 0.4102189540863037, "logps/chosen": -221.7640380859375, "logps/rejected": -239.70553588867188, "loss": 0.6293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.017431270331144333, "rewards/margins": 0.16169613599777222, "rewards/margins_max": 0.24615326523780823, "rewards/margins_min": 0.07723899185657501, "rewards/margins_std": 0.11944042146205902, "rewards/rejected": -0.17912741005420685, "step": 2370 }, { "epoch": 0.6, "grad_norm": 0.4140625, "learning_rate": 2.0759768809962713e-07, "logits/chosen": 0.03870842233300209, "logits/rejected": 0.4031105041503906, "logps/chosen": -211.58126831054688, "logps/rejected": -222.51828002929688, "loss": 0.622, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01689000055193901, "rewards/margins": 0.14617034792900085, "rewards/margins_max": 0.20459063351154327, "rewards/margins_min": 0.08775003999471664, "rewards/margins_std": 0.08261878788471222, "rewards/rejected": -0.16306033730506897, "step": 2380 }, { "epoch": 0.6, "grad_norm": 0.359375, "learning_rate": 2.0543487003001577e-07, "logits/chosen": 0.2453078031539917, "logits/rejected": 0.45263537764549255, "logps/chosen": -196.93557739257812, "logps/rejected": -228.71194458007812, "loss": 0.6227, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.010591455735266209, "rewards/margins": 0.15100930631160736, "rewards/margins_max": 0.21889753639698029, "rewards/margins_min": 0.08312106132507324, "rewards/margins_std": 0.09600846469402313, "rewards/rejected": -0.1616007387638092, "step": 2390 }, { "epoch": 0.6, "grad_norm": 0.486328125, "learning_rate": 2.032754914746247e-07, "logits/chosen": 0.11738073825836182, "logits/rejected": 0.5080969929695129, "logps/chosen": -226.0777130126953, "logps/rejected": -235.65225219726562, "loss": 0.6192, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.00305022019892931, "rewards/margins": 0.15616664290428162, "rewards/margins_max": 0.23057445883750916, "rewards/margins_min": 0.08175883442163467, "rewards/margins_std": 0.10522852838039398, "rewards/rejected": -0.15921685099601746, "step": 2400 }, { "epoch": 0.61, "grad_norm": 0.455078125, "learning_rate": 2.0111971909320026e-07, "logits/chosen": 0.14011964201927185, "logits/rejected": 0.6768498420715332, "logps/chosen": -256.1953125, "logps/rejected": -231.5035400390625, "loss": 0.6248, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.010894227772951126, "rewards/margins": 0.15551438927650452, "rewards/margins_max": 0.22847270965576172, "rewards/margins_min": 0.08255606144666672, "rewards/margins_std": 0.10317866504192352, "rewards/rejected": -0.16640862822532654, "step": 2410 }, { "epoch": 0.61, "grad_norm": 0.490234375, "learning_rate": 1.989677192671657e-07, "logits/chosen": 0.044994331896305084, "logits/rejected": 0.5023723840713501, "logps/chosen": -223.20223999023438, "logps/rejected": -244.1636505126953, "loss": 0.6191, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.015944605693221092, "rewards/margins": 0.1615656316280365, "rewards/margins_max": 0.23082807660102844, "rewards/margins_min": 0.09230320155620575, "rewards/margins_std": 0.09795187413692474, "rewards/rejected": -0.17751023173332214, "step": 2420 }, { "epoch": 0.61, "grad_norm": 0.357421875, "learning_rate": 1.9681965808678076e-07, "logits/chosen": 0.20567622780799866, "logits/rejected": 0.5247214436531067, "logps/chosen": -236.1908721923828, "logps/rejected": -243.5137939453125, "loss": 0.6264, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.003678310662508011, "rewards/margins": 0.133884459733963, "rewards/margins_max": 0.19526854157447815, "rewards/margins_min": 0.07250036299228668, "rewards/margins_std": 0.08681021630764008, "rewards/rejected": -0.13756278157234192, "step": 2430 }, { "epoch": 0.61, "grad_norm": 0.423828125, "learning_rate": 1.9467570133832234e-07, "logits/chosen": 0.17644178867340088, "logits/rejected": 0.6248446702957153, "logps/chosen": -246.7824249267578, "logps/rejected": -237.9423065185547, "loss": 0.6264, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.019817547872662544, "rewards/margins": 0.12975750863552094, "rewards/margins_max": 0.1915426254272461, "rewards/margins_min": 0.06797240674495697, "rewards/margins_std": 0.08737734705209732, "rewards/rejected": -0.14957503974437714, "step": 2440 }, { "epoch": 0.62, "grad_norm": 0.40234375, "learning_rate": 1.925360144912891e-07, "logits/chosen": 0.09493163973093033, "logits/rejected": 0.4174177050590515, "logps/chosen": -229.5768585205078, "logps/rejected": -237.2871856689453, "loss": 0.6252, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.009837731719017029, "rewards/margins": 0.13863086700439453, "rewards/margins_max": 0.2090788185596466, "rewards/margins_min": 0.06818293035030365, "rewards/margins_std": 0.09962843358516693, "rewards/rejected": -0.14846859872341156, "step": 2450 }, { "epoch": 0.62, "grad_norm": 0.4921875, "learning_rate": 1.9040076268563126e-07, "logits/chosen": 0.24845823645591736, "logits/rejected": 0.43799370527267456, "logps/chosen": -226.14871215820312, "logps/rejected": -268.9622802734375, "loss": 0.6247, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.02408982627093792, "rewards/margins": 0.14104606211185455, "rewards/margins_max": 0.20654428005218506, "rewards/margins_min": 0.07554782927036285, "rewards/margins_std": 0.09262847900390625, "rewards/rejected": -0.16513587534427643, "step": 2460 }, { "epoch": 0.62, "grad_norm": 0.419921875, "learning_rate": 1.8827011071900472e-07, "logits/chosen": 0.07511536031961441, "logits/rejected": 0.3902598023414612, "logps/chosen": -206.7931365966797, "logps/rejected": -221.56466674804688, "loss": 0.6204, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0016159467631950974, "rewards/margins": 0.16660144925117493, "rewards/margins_max": 0.22700098156929016, "rewards/margins_min": 0.10620193183422089, "rewards/margins_std": 0.08541782200336456, "rewards/rejected": -0.16821739077568054, "step": 2470 }, { "epoch": 0.62, "grad_norm": 0.421875, "learning_rate": 1.8614422303405188e-07, "logits/chosen": 0.12508592009544373, "logits/rejected": 0.4025653898715973, "logps/chosen": -206.9744415283203, "logps/rejected": -217.2001495361328, "loss": 0.6316, "rewards/accuracies": 0.9375, "rewards/chosen": -0.014698827639222145, "rewards/margins": 0.12224705517292023, "rewards/margins_max": 0.1787988245487213, "rewards/margins_min": 0.06569530814886093, "rewards/margins_std": 0.07997626066207886, "rewards/rejected": -0.13694588840007782, "step": 2480 }, { "epoch": 0.63, "grad_norm": 0.447265625, "learning_rate": 1.8402326370571054e-07, "logits/chosen": 0.2565688490867615, "logits/rejected": 0.4014928936958313, "logps/chosen": -207.6375274658203, "logps/rejected": -228.64892578125, "loss": 0.6285, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.010557899251580238, "rewards/margins": 0.16835328936576843, "rewards/margins_max": 0.24136631190776825, "rewards/margins_min": 0.09534025937318802, "rewards/margins_std": 0.10325602442026138, "rewards/rejected": -0.17891117930412292, "step": 2490 }, { "epoch": 0.63, "grad_norm": 0.431640625, "learning_rate": 1.819073964285501e-07, "logits/chosen": 0.011672258377075195, "logits/rejected": 0.426652729511261, "logps/chosen": -232.6514129638672, "logps/rejected": -222.7469940185547, "loss": 0.6276, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.014878949150443077, "rewards/margins": 0.13608737289905548, "rewards/margins_max": 0.1932559311389923, "rewards/margins_min": 0.07891880720853806, "rewards/margins_std": 0.08084855228662491, "rewards/rejected": -0.1509663164615631, "step": 2500 }, { "epoch": 0.63, "grad_norm": 0.51953125, "learning_rate": 1.7979678450413844e-07, "logits/chosen": 0.24352100491523743, "logits/rejected": 0.520950198173523, "logps/chosen": -211.32107543945312, "logps/rejected": -226.19711303710938, "loss": 0.6193, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.007636167109012604, "rewards/margins": 0.16029870510101318, "rewards/margins_max": 0.22749435901641846, "rewards/margins_min": 0.09310305863618851, "rewards/margins_std": 0.09502898901700974, "rewards/rejected": -0.1679348647594452, "step": 2510 }, { "epoch": 0.63, "grad_norm": 0.427734375, "learning_rate": 1.7769159082843782e-07, "logits/chosen": 0.2498486489057541, "logits/rejected": 0.4882238507270813, "logps/chosen": -213.7360076904297, "logps/rejected": -230.9351806640625, "loss": 0.6231, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.010208885185420513, "rewards/margins": 0.15363340079784393, "rewards/margins_max": 0.21247386932373047, "rewards/margins_min": 0.09479296207427979, "rewards/margins_std": 0.0832129567861557, "rewards/rejected": -0.16384229063987732, "step": 2520 }, { "epoch": 0.64, "grad_norm": 0.42578125, "learning_rate": 1.755919778792326e-07, "logits/chosen": 0.14641539752483368, "logits/rejected": 0.47882094979286194, "logps/chosen": -232.2136993408203, "logps/rejected": -249.48605346679688, "loss": 0.6291, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.014472192153334618, "rewards/margins": 0.14982035756111145, "rewards/margins_max": 0.2209053337574005, "rewards/margins_min": 0.0787353515625, "rewards/margins_std": 0.10052935034036636, "rewards/rejected": -0.16429252922534943, "step": 2530 }, { "epoch": 0.64, "grad_norm": 0.443359375, "learning_rate": 1.7349810770358974e-07, "logits/chosen": 0.09130740910768509, "logits/rejected": 0.4923134446144104, "logps/chosen": -230.8465576171875, "logps/rejected": -241.65554809570312, "loss": 0.6211, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.014047250151634216, "rewards/margins": 0.16473481059074402, "rewards/margins_max": 0.23630110919475555, "rewards/margins_min": 0.09316851198673248, "rewards/margins_std": 0.10121002048254013, "rewards/rejected": -0.17878206074237823, "step": 2540 }, { "epoch": 0.64, "grad_norm": 0.5, "learning_rate": 1.714101419053518e-07, "logits/chosen": 0.2340908944606781, "logits/rejected": 0.5541288256645203, "logps/chosen": -231.4524383544922, "logps/rejected": -257.86279296875, "loss": 0.6131, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.008820734918117523, "rewards/margins": 0.1457538902759552, "rewards/margins_max": 0.2088393270969391, "rewards/margins_min": 0.0826684758067131, "rewards/margins_std": 0.08921625465154648, "rewards/rejected": -0.15457461774349213, "step": 2550 }, { "epoch": 0.64, "grad_norm": 0.455078125, "learning_rate": 1.6932824163266423e-07, "logits/chosen": 0.09834763407707214, "logits/rejected": 0.4759771227836609, "logps/chosen": -213.1902313232422, "logps/rejected": -246.94180297851562, "loss": 0.6251, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.009206411428749561, "rewards/margins": 0.14650623500347137, "rewards/margins_max": 0.20215961337089539, "rewards/margins_min": 0.09085287898778915, "rewards/margins_std": 0.07870575040578842, "rewards/rejected": -0.15571266412734985, "step": 2560 }, { "epoch": 0.65, "grad_norm": 0.447265625, "learning_rate": 1.6725256756553869e-07, "logits/chosen": 0.03130980581045151, "logits/rejected": 0.4244791865348816, "logps/chosen": -204.5883026123047, "logps/rejected": -228.83706665039062, "loss": 0.6232, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.008677573874592781, "rewards/margins": 0.1341676414012909, "rewards/margins_max": 0.18546664714813232, "rewards/margins_min": 0.08286865055561066, "rewards/margins_std": 0.07254774123430252, "rewards/rejected": -0.14284522831439972, "step": 2570 }, { "epoch": 0.65, "grad_norm": 0.416015625, "learning_rate": 1.65183279903451e-07, "logits/chosen": 0.21596547961235046, "logits/rejected": 0.44787946343421936, "logps/chosen": -216.00198364257812, "logps/rejected": -244.8545379638672, "loss": 0.6202, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.010507548227906227, "rewards/margins": 0.15654310584068298, "rewards/margins_max": 0.23176832497119904, "rewards/margins_min": 0.08131789416074753, "rewards/margins_std": 0.1063845157623291, "rewards/rejected": -0.16705067455768585, "step": 2580 }, { "epoch": 0.65, "grad_norm": 0.470703125, "learning_rate": 1.631205383529778e-07, "logits/chosen": 0.23015046119689941, "logits/rejected": 0.5896509885787964, "logps/chosen": -225.5498046875, "logps/rejected": -242.12826538085938, "loss": 0.6187, "rewards/accuracies": 0.9375, "rewards/chosen": -0.013665847480297089, "rewards/margins": 0.1573982536792755, "rewards/margins_max": 0.23688676953315735, "rewards/margins_min": 0.07790976017713547, "rewards/margins_std": 0.11241370439529419, "rewards/rejected": -0.171064093708992, "step": 2590 }, { "epoch": 0.65, "grad_norm": 0.486328125, "learning_rate": 1.610645021154701e-07, "logits/chosen": 0.19087959825992584, "logits/rejected": 0.6289721131324768, "logps/chosen": -221.5736846923828, "logps/rejected": -236.9660186767578, "loss": 0.622, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.009838482365012169, "rewards/margins": 0.16315440833568573, "rewards/margins_max": 0.21977396309375763, "rewards/margins_min": 0.10653485357761383, "rewards/margins_std": 0.08007214963436127, "rewards/rejected": -0.17299290001392365, "step": 2600 }, { "epoch": 0.66, "grad_norm": 0.38671875, "learning_rate": 1.5901532987476594e-07, "logits/chosen": 0.031648941338062286, "logits/rejected": 0.3961068391799927, "logps/chosen": -210.3728485107422, "logps/rejected": -235.66720581054688, "loss": 0.6253, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.007695254869759083, "rewards/margins": 0.14787700772285461, "rewards/margins_max": 0.21479801833629608, "rewards/margins_min": 0.08095600455999374, "rewards/margins_std": 0.09464059770107269, "rewards/rejected": -0.15557226538658142, "step": 2610 }, { "epoch": 0.66, "grad_norm": 0.482421875, "learning_rate": 1.5697317978494406e-07, "logits/chosen": 0.09580625593662262, "logits/rejected": 0.4831513464450836, "logps/chosen": -236.427001953125, "logps/rejected": -228.22958374023438, "loss": 0.6243, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.009196283295750618, "rewards/margins": 0.14667744934558868, "rewards/margins_max": 0.22402644157409668, "rewards/margins_min": 0.0693284273147583, "rewards/margins_std": 0.10938803106546402, "rewards/rejected": -0.15587374567985535, "step": 2620 }, { "epoch": 0.66, "grad_norm": 0.458984375, "learning_rate": 1.5493820945811658e-07, "logits/chosen": 0.15679766237735748, "logits/rejected": 0.5111496448516846, "logps/chosen": -208.1581268310547, "logps/rejected": -225.63211059570312, "loss": 0.6284, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.014764217659831047, "rewards/margins": 0.14576368033885956, "rewards/margins_max": 0.2189546376466751, "rewards/margins_min": 0.072572723031044, "rewards/margins_std": 0.10350766032934189, "rewards/rejected": -0.16052789986133575, "step": 2630 }, { "epoch": 0.66, "grad_norm": 0.40625, "learning_rate": 1.5291057595226557e-07, "logits/chosen": 0.11481879651546478, "logits/rejected": 0.4119172990322113, "logps/chosen": -182.10604858398438, "logps/rejected": -214.1801300048828, "loss": 0.624, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.003746766597032547, "rewards/margins": 0.15647880733013153, "rewards/margins_max": 0.21717135608196259, "rewards/margins_min": 0.09578627347946167, "rewards/margins_std": 0.08583220094442368, "rewards/rejected": -0.16022558510303497, "step": 2640 }, { "epoch": 0.67, "grad_norm": 0.44140625, "learning_rate": 1.5089043575912097e-07, "logits/chosen": 0.24892990291118622, "logits/rejected": 0.5904571413993835, "logps/chosen": -211.7594757080078, "logps/rejected": -226.3749542236328, "loss": 0.6325, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01375389564782381, "rewards/margins": 0.1372380554676056, "rewards/margins_max": 0.19826175272464752, "rewards/margins_min": 0.07621435821056366, "rewards/margins_std": 0.08630053699016571, "rewards/rejected": -0.15099194645881653, "step": 2650 }, { "epoch": 0.67, "grad_norm": 0.48046875, "learning_rate": 1.4887794479208221e-07, "logits/chosen": 0.1940927356481552, "logits/rejected": 0.3191990554332733, "logps/chosen": -218.141357421875, "logps/rejected": -264.9046630859375, "loss": 0.6247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.010482221841812134, "rewards/margins": 0.14843745529651642, "rewards/margins_max": 0.2098129540681839, "rewards/margins_min": 0.08706195652484894, "rewards/margins_std": 0.0867980569601059, "rewards/rejected": -0.15891966223716736, "step": 2660 }, { "epoch": 0.67, "grad_norm": 0.53515625, "learning_rate": 1.4687325837418562e-07, "logits/chosen": 0.12689308822155, "logits/rejected": 0.5191614031791687, "logps/chosen": -247.96890258789062, "logps/rejected": -245.7145233154297, "loss": 0.6163, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.008867842145264149, "rewards/margins": 0.17783989012241364, "rewards/margins_max": 0.24966421723365784, "rewards/margins_min": 0.10601554065942764, "rewards/margins_std": 0.10157494246959686, "rewards/rejected": -0.1867077350616455, "step": 2670 }, { "epoch": 0.67, "grad_norm": 0.44921875, "learning_rate": 1.4487653122611642e-07, "logits/chosen": 0.07228230684995651, "logits/rejected": 0.5454927682876587, "logps/chosen": -224.2992401123047, "logps/rejected": -219.3193359375, "loss": 0.6244, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.009803833439946175, "rewards/margins": 0.13594987988471985, "rewards/margins_max": 0.20596370100975037, "rewards/margins_min": 0.06593604385852814, "rewards/margins_std": 0.0990145206451416, "rewards/rejected": -0.14575372636318207, "step": 2680 }, { "epoch": 0.68, "grad_norm": 0.412109375, "learning_rate": 1.4288791745426736e-07, "logits/chosen": 0.15894924104213715, "logits/rejected": 0.49075669050216675, "logps/chosen": -209.205078125, "logps/rejected": -214.1329345703125, "loss": 0.626, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.02186177298426628, "rewards/margins": 0.13367237150669098, "rewards/margins_max": 0.20408746600151062, "rewards/margins_min": 0.06325726211071014, "rewards/margins_std": 0.0995819941163063, "rewards/rejected": -0.15553416311740875, "step": 2690 }, { "epoch": 0.68, "grad_norm": 0.4453125, "learning_rate": 1.4090757053884478e-07, "logits/chosen": 0.20218896865844727, "logits/rejected": 0.4428800940513611, "logps/chosen": -200.88058471679688, "logps/rejected": -213.5211181640625, "loss": 0.6191, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01311523001641035, "rewards/margins": 0.15484270453453064, "rewards/margins_max": 0.2205415964126587, "rewards/margins_min": 0.08914382755756378, "rewards/margins_std": 0.09291225671768188, "rewards/rejected": -0.16795794665813446, "step": 2700 }, { "epoch": 0.68, "grad_norm": 0.455078125, "learning_rate": 1.3893564332202318e-07, "logits/chosen": 0.07864736765623093, "logits/rejected": 0.4497681260108948, "logps/chosen": -209.11807250976562, "logps/rejected": -229.51058959960938, "loss": 0.6281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.0004749842919409275, "rewards/margins": 0.1542075276374817, "rewards/margins_max": 0.21607331931591034, "rewards/margins_min": 0.09234174340963364, "rewards/margins_std": 0.08749144524335861, "rewards/rejected": -0.15468251705169678, "step": 2710 }, { "epoch": 0.68, "grad_norm": 0.37890625, "learning_rate": 1.3697228799614923e-07, "logits/chosen": 0.18281707167625427, "logits/rejected": 0.6343039274215698, "logps/chosen": -205.13455200195312, "logps/rejected": -213.9533233642578, "loss": 0.6247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.011571279726922512, "rewards/margins": 0.14571483433246613, "rewards/margins_max": 0.21802254021167755, "rewards/margins_min": 0.07340715825557709, "rewards/margins_std": 0.10225850343704224, "rewards/rejected": -0.1572861224412918, "step": 2720 }, { "epoch": 0.69, "grad_norm": 0.4609375, "learning_rate": 1.3501765609199534e-07, "logits/chosen": 0.11079691350460052, "logits/rejected": 0.49163714051246643, "logps/chosen": -213.44705200195312, "logps/rejected": -220.19482421875, "loss": 0.6263, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.016141904518008232, "rewards/margins": 0.1408129781484604, "rewards/margins_max": 0.20387931168079376, "rewards/margins_min": 0.0777466669678688, "rewards/margins_std": 0.08918922394514084, "rewards/rejected": -0.15695486962795258, "step": 2730 }, { "epoch": 0.69, "grad_norm": 0.5078125, "learning_rate": 1.3307189846706436e-07, "logits/chosen": 0.176819309592247, "logits/rejected": 0.49666061997413635, "logps/chosen": -232.3857421875, "logps/rejected": -229.29183959960938, "loss": 0.6301, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.004111931659281254, "rewards/margins": 0.14815407991409302, "rewards/margins_max": 0.2107928991317749, "rewards/margins_min": 0.08551524579524994, "rewards/margins_std": 0.08858468383550644, "rewards/rejected": -0.1522659957408905, "step": 2740 }, { "epoch": 0.69, "grad_norm": 0.392578125, "learning_rate": 1.3113516529394701e-07, "logits/chosen": 0.0033464699517935514, "logits/rejected": 0.1263490915298462, "logps/chosen": -178.94464111328125, "logps/rejected": -209.9365692138672, "loss": 0.6226, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.014974281191825867, "rewards/margins": 0.13582995533943176, "rewards/margins_max": 0.20509657263755798, "rewards/margins_min": 0.06656330823898315, "rewards/margins_std": 0.09795782715082169, "rewards/rejected": -0.15080423653125763, "step": 2750 }, { "epoch": 0.69, "grad_norm": 0.470703125, "learning_rate": 1.2920760604873145e-07, "logits/chosen": 0.2543647587299347, "logits/rejected": 0.5973013639450073, "logps/chosen": -209.7562255859375, "logps/rejected": -238.90591430664062, "loss": 0.6254, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.013892347924411297, "rewards/margins": 0.14194358885288239, "rewards/margins_max": 0.20539668202400208, "rewards/margins_min": 0.07849050313234329, "rewards/margins_std": 0.089736208319664, "rewards/rejected": -0.15583592653274536, "step": 2760 }, { "epoch": 0.7, "grad_norm": 0.455078125, "learning_rate": 1.2728936949946636e-07, "logits/chosen": 0.2741765081882477, "logits/rejected": 0.5410383939743042, "logps/chosen": -210.25143432617188, "logps/rejected": -233.1810302734375, "loss": 0.6214, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0035478256177157164, "rewards/margins": 0.16275660693645477, "rewards/margins_max": 0.23406746983528137, "rewards/margins_min": 0.09144572168588638, "rewards/margins_std": 0.10084881633520126, "rewards/rejected": -0.16630443930625916, "step": 2770 }, { "epoch": 0.7, "grad_norm": 0.48046875, "learning_rate": 1.2538060369467988e-07, "logits/chosen": 0.11184243857860565, "logits/rejected": 0.4886614680290222, "logps/chosen": -213.7685546875, "logps/rejected": -214.91598510742188, "loss": 0.6292, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.006969447247684002, "rewards/margins": 0.14751985669136047, "rewards/margins_max": 0.22655411064624786, "rewards/margins_min": 0.0684855580329895, "rewards/margins_std": 0.1117713451385498, "rewards/rejected": -0.15448927879333496, "step": 2780 }, { "epoch": 0.7, "grad_norm": 0.474609375, "learning_rate": 1.2348145595195246e-07, "logits/chosen": 0.1253242939710617, "logits/rejected": 0.6095115542411804, "logps/chosen": -233.3427276611328, "logps/rejected": -240.261962890625, "loss": 0.6236, "rewards/accuracies": 0.9375, "rewards/chosen": -0.018260497599840164, "rewards/margins": 0.1275160014629364, "rewards/margins_max": 0.1807311624288559, "rewards/margins_min": 0.07430081814527512, "rewards/margins_std": 0.07525762170553207, "rewards/rejected": -0.14577649533748627, "step": 2790 }, { "epoch": 0.7, "grad_norm": 0.4921875, "learning_rate": 1.2159207284654777e-07, "logits/chosen": 0.06922309100627899, "logits/rejected": 0.24848881363868713, "logps/chosen": -206.8855743408203, "logps/rejected": -250.9455108642578, "loss": 0.6307, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.014282561838626862, "rewards/margins": 0.14171846210956573, "rewards/margins_max": 0.21836943924427032, "rewards/margins_min": 0.06506749242544174, "rewards/margins_std": 0.10840083658695221, "rewards/rejected": -0.1560010313987732, "step": 2800 }, { "epoch": 0.71, "grad_norm": 0.5703125, "learning_rate": 1.1971260020009942e-07, "logits/chosen": 0.06933801621198654, "logits/rejected": 0.3677939474582672, "logps/chosen": -235.5343017578125, "logps/rejected": -252.79782104492188, "loss": 0.6252, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.021200697869062424, "rewards/margins": 0.1525932401418686, "rewards/margins_max": 0.23009681701660156, "rewards/margins_min": 0.07508967816829681, "rewards/margins_std": 0.10960660129785538, "rewards/rejected": -0.1737939417362213, "step": 2810 }, { "epoch": 0.71, "grad_norm": 0.392578125, "learning_rate": 1.1784318306935686e-07, "logits/chosen": 0.0961008220911026, "logits/rejected": 0.5717853903770447, "logps/chosen": -214.80746459960938, "logps/rejected": -221.073486328125, "loss": 0.6292, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.012031609192490578, "rewards/margins": 0.13416241109371185, "rewards/margins_max": 0.2018776684999466, "rewards/margins_min": 0.0664471685886383, "rewards/margins_std": 0.09576381742954254, "rewards/rejected": -0.14619402587413788, "step": 2820 }, { "epoch": 0.71, "grad_norm": 0.40234375, "learning_rate": 1.1598396573499003e-07, "logits/chosen": 0.09464100748300552, "logits/rejected": 0.4134500026702881, "logps/chosen": -215.2247772216797, "logps/rejected": -224.3886260986328, "loss": 0.6317, "rewards/accuracies": 1.0, "rewards/chosen": -0.01749451458454132, "rewards/margins": 0.13466881215572357, "rewards/margins_max": 0.1939276158809662, "rewards/margins_min": 0.07541001588106155, "rewards/margins_std": 0.08380459249019623, "rewards/rejected": -0.1521633267402649, "step": 2830 }, { "epoch": 0.71, "grad_norm": 0.390625, "learning_rate": 1.1413509169045374e-07, "logits/chosen": 0.24158377945423126, "logits/rejected": 0.49442940950393677, "logps/chosen": -239.3939666748047, "logps/rejected": -247.13577270507812, "loss": 0.6295, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.017084117978811264, "rewards/margins": 0.132806658744812, "rewards/margins_max": 0.18559898436069489, "rewards/margins_min": 0.08001436293125153, "rewards/margins_std": 0.07465962320566177, "rewards/rejected": -0.14989081025123596, "step": 2840 }, { "epoch": 0.72, "grad_norm": 0.51953125, "learning_rate": 1.122967036309127e-07, "logits/chosen": 0.07201902568340302, "logits/rejected": 0.4692930579185486, "logps/chosen": -231.50064086914062, "logps/rejected": -236.1966552734375, "loss": 0.6217, "rewards/accuracies": 0.9375, "rewards/chosen": -0.018202459439635277, "rewards/margins": 0.1421966552734375, "rewards/margins_max": 0.188903346657753, "rewards/margins_min": 0.0954899713397026, "rewards/margins_std": 0.06605321913957596, "rewards/rejected": -0.16039910912513733, "step": 2850 }, { "epoch": 0.72, "grad_norm": 0.412109375, "learning_rate": 1.104689434422289e-07, "logits/chosen": 0.026485636830329895, "logits/rejected": 0.3804229497909546, "logps/chosen": -220.963623046875, "logps/rejected": -273.83428955078125, "loss": 0.6242, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.003993537276983261, "rewards/margins": 0.15278157591819763, "rewards/margins_max": 0.221563458442688, "rewards/margins_min": 0.08399970829486847, "rewards/margins_std": 0.09727227687835693, "rewards/rejected": -0.15677510201931, "step": 2860 }, { "epoch": 0.72, "grad_norm": 0.384765625, "learning_rate": 1.0865195219001028e-07, "logits/chosen": 0.17775173485279083, "logits/rejected": 0.5558250546455383, "logps/chosen": -228.0889892578125, "logps/rejected": -228.9232177734375, "loss": 0.6315, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.02530711516737938, "rewards/margins": 0.12181045114994049, "rewards/margins_max": 0.17721505463123322, "rewards/margins_min": 0.06640583276748657, "rewards/margins_std": 0.07835394889116287, "rewards/rejected": -0.14711755514144897, "step": 2870 }, { "epoch": 0.72, "grad_norm": 0.49609375, "learning_rate": 1.0684587010872398e-07, "logits/chosen": 0.13486912846565247, "logits/rejected": 0.46685299277305603, "logps/chosen": -209.0098419189453, "logps/rejected": -205.53067016601562, "loss": 0.6227, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.010718288831412792, "rewards/margins": 0.1312871277332306, "rewards/margins_max": 0.1951644867658615, "rewards/margins_min": 0.06740979105234146, "rewards/margins_std": 0.09033621847629547, "rewards/rejected": -0.14200544357299805, "step": 2880 }, { "epoch": 0.73, "grad_norm": 0.451171875, "learning_rate": 1.0505083659087283e-07, "logits/chosen": 0.002250906778499484, "logits/rejected": 0.3973466157913208, "logps/chosen": -240.7798309326172, "logps/rejected": -244.6177215576172, "loss": 0.6282, "rewards/accuracies": 0.9375, "rewards/chosen": -0.027314025908708572, "rewards/margins": 0.12634828686714172, "rewards/margins_max": 0.18633539974689484, "rewards/margins_min": 0.06636115163564682, "rewards/margins_std": 0.08483459800481796, "rewards/rejected": -0.1536622941493988, "step": 2890 }, { "epoch": 0.73, "grad_norm": 0.44140625, "learning_rate": 1.0326699017623689e-07, "logits/chosen": 0.28300029039382935, "logits/rejected": 0.6444196701049805, "logps/chosen": -212.2546844482422, "logps/rejected": -225.2244873046875, "loss": 0.6227, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0028811150696128607, "rewards/margins": 0.15493568778038025, "rewards/margins_max": 0.217520073056221, "rewards/margins_min": 0.0923512876033783, "rewards/margins_std": 0.08850769698619843, "rewards/rejected": -0.1578167974948883, "step": 2900 }, { "epoch": 0.73, "grad_norm": 0.443359375, "learning_rate": 1.0149446854118151e-07, "logits/chosen": 0.04816162586212158, "logits/rejected": 0.4773538112640381, "logps/chosen": -224.32205200195312, "logps/rejected": -226.0942840576172, "loss": 0.6186, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.01020828541368246, "rewards/margins": 0.1651410609483719, "rewards/margins_max": 0.2378121316432953, "rewards/margins_min": 0.0924699530005455, "rewards/margins_std": 0.10277243703603745, "rewards/rejected": -0.17534933984279633, "step": 2910 }, { "epoch": 0.73, "grad_norm": 0.48046875, "learning_rate": 9.97334084880311e-08, "logits/chosen": 0.21521815657615662, "logits/rejected": 0.6471478343009949, "logps/chosen": -220.90487670898438, "logps/rejected": -232.4788055419922, "loss": 0.6258, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.006086512468755245, "rewards/margins": 0.14669203758239746, "rewards/margins_max": 0.21843501925468445, "rewards/margins_min": 0.07494904100894928, "rewards/margins_std": 0.10145990550518036, "rewards/rejected": -0.1527785360813141, "step": 2920 }, { "epoch": 0.74, "grad_norm": 0.376953125, "learning_rate": 9.79839459345109e-08, "logits/chosen": 0.2569006085395813, "logits/rejected": 0.5799908638000488, "logps/chosen": -223.657958984375, "logps/rejected": -234.80221557617188, "loss": 0.6208, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.02363382652401924, "rewards/margins": 0.1418285369873047, "rewards/margins_max": 0.2157059907913208, "rewards/margins_min": 0.06795106828212738, "rewards/margins_std": 0.10447851568460464, "rewards/rejected": -0.16546235978603363, "step": 2930 }, { "epoch": 0.74, "grad_norm": 0.484375, "learning_rate": 9.624621590325733e-08, "logits/chosen": 0.034563831984996796, "logits/rejected": 0.3041626214981079, "logps/chosen": -221.3046112060547, "logps/rejected": -246.3438262939453, "loss": 0.6225, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.02380397915840149, "rewards/margins": 0.13136173784732819, "rewards/margins_max": 0.19247514009475708, "rewards/margins_min": 0.0702483206987381, "rewards/margins_std": 0.08642742037773132, "rewards/rejected": -0.15516570210456848, "step": 2940 }, { "epoch": 0.74, "grad_norm": 0.42578125, "learning_rate": 9.45203525113962e-08, "logits/chosen": 0.12874826788902283, "logits/rejected": 0.38895314931869507, "logps/chosen": -197.7905731201172, "logps/rejected": -234.9347686767578, "loss": 0.6257, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.02550782822072506, "rewards/margins": 0.12670674920082092, "rewards/margins_max": 0.19691213965415955, "rewards/margins_min": 0.0565013512969017, "rewards/margins_std": 0.09928543120622635, "rewards/rejected": -0.15221455693244934, "step": 2950 }, { "epoch": 0.74, "grad_norm": 0.435546875, "learning_rate": 9.280648896019245e-08, "logits/chosen": 0.22306354343891144, "logits/rejected": 0.593471884727478, "logps/chosen": -233.04739379882812, "logps/rejected": -237.7582244873047, "loss": 0.6162, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.010324180126190186, "rewards/margins": 0.16291800141334534, "rewards/margins_max": 0.23097142577171326, "rewards/margins_min": 0.09486456960439682, "rewards/margins_std": 0.09624208509922028, "rewards/rejected": -0.17324218153953552, "step": 2960 }, { "epoch": 0.75, "grad_norm": 0.46484375, "learning_rate": 9.110475752476935e-08, "logits/chosen": 0.3003462553024292, "logits/rejected": 0.6558989882469177, "logps/chosen": -224.33920288085938, "logps/rejected": -225.9638671875, "loss": 0.6306, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.004842034541070461, "rewards/margins": 0.14018483459949493, "rewards/margins_max": 0.2194204032421112, "rewards/margins_min": 0.06094926595687866, "rewards/margins_std": 0.11205601692199707, "rewards/rejected": -0.14502686262130737, "step": 2970 }, { "epoch": 0.75, "grad_norm": 0.462890625, "learning_rate": 8.94152895438993e-08, "logits/chosen": 0.10406837612390518, "logits/rejected": 0.2556748688220978, "logps/chosen": -193.1667938232422, "logps/rejected": -241.8805694580078, "loss": 0.616, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.002372722839936614, "rewards/margins": 0.16095606982707977, "rewards/margins_max": 0.2316565066576004, "rewards/margins_min": 0.09025563299655914, "rewards/margins_std": 0.0999855026602745, "rewards/rejected": -0.16332878172397614, "step": 2980 }, { "epoch": 0.75, "grad_norm": 0.44140625, "learning_rate": 8.773821540986789e-08, "logits/chosen": 0.24239897727966309, "logits/rejected": 0.6253222227096558, "logps/chosen": -214.51638793945312, "logps/rejected": -229.56784057617188, "loss": 0.6244, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.003825037507340312, "rewards/margins": 0.15542516112327576, "rewards/margins_max": 0.21181336045265198, "rewards/margins_min": 0.09903697669506073, "rewards/margins_std": 0.07974494993686676, "rewards/rejected": -0.15925021469593048, "step": 2990 }, { "epoch": 0.75, "grad_norm": 0.482421875, "learning_rate": 8.607366455840948e-08, "logits/chosen": 0.16232073307037354, "logits/rejected": 0.5660216808319092, "logps/chosen": -219.9098663330078, "logps/rejected": -229.86160278320312, "loss": 0.6209, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.0017680373275652528, "rewards/margins": 0.17724210023880005, "rewards/margins_max": 0.26198625564575195, "rewards/margins_min": 0.09249792248010635, "rewards/margins_std": 0.11984635889530182, "rewards/rejected": -0.17901012301445007, "step": 3000 }, { "epoch": 0.76, "grad_norm": 0.392578125, "learning_rate": 8.442176545871805e-08, "logits/chosen": 0.19031231105327606, "logits/rejected": 0.45045891404151917, "logps/chosen": -189.07440185546875, "logps/rejected": -220.3076171875, "loss": 0.6294, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.017957186326384544, "rewards/margins": 0.127852663397789, "rewards/margins_max": 0.1860080063343048, "rewards/margins_min": 0.0696973130106926, "rewards/margins_std": 0.08224406093358994, "rewards/rejected": -0.1458098441362381, "step": 3010 }, { "epoch": 0.76, "grad_norm": 0.408203125, "learning_rate": 8.278264560353182e-08, "logits/chosen": 0.00534349400550127, "logits/rejected": 0.34737536311149597, "logps/chosen": -198.40512084960938, "logps/rejected": -209.44314575195312, "loss": 0.6232, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.014808463864028454, "rewards/margins": 0.14278842508792877, "rewards/margins_max": 0.21879836916923523, "rewards/margins_min": 0.06677846610546112, "rewards/margins_std": 0.10749431699514389, "rewards/rejected": -0.1575968861579895, "step": 3020 }, { "epoch": 0.76, "grad_norm": 0.42578125, "learning_rate": 8.115643149929316e-08, "logits/chosen": 0.2704222798347473, "logits/rejected": 0.425426721572876, "logps/chosen": -191.9525146484375, "logps/rejected": -234.10140991210938, "loss": 0.6175, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.011862250044941902, "rewards/margins": 0.16936099529266357, "rewards/margins_max": 0.24441039562225342, "rewards/margins_min": 0.09431157261133194, "rewards/margins_std": 0.10613591969013214, "rewards/rejected": -0.18122324347496033, "step": 3030 }, { "epoch": 0.76, "grad_norm": 0.435546875, "learning_rate": 7.954324865638515e-08, "logits/chosen": 0.1512942612171173, "logits/rejected": 0.4919726252555847, "logps/chosen": -253.28726196289062, "logps/rejected": -237.4526824951172, "loss": 0.6229, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.015799174085259438, "rewards/margins": 0.16401445865631104, "rewards/margins_max": 0.23614898324012756, "rewards/margins_min": 0.0918799415230751, "rewards/margins_std": 0.10201362520456314, "rewards/rejected": -0.17981365323066711, "step": 3040 }, { "epoch": 0.77, "grad_norm": 0.404296875, "learning_rate": 7.794322157944488e-08, "logits/chosen": 0.1862577199935913, "logits/rejected": 0.5486984848976135, "logps/chosen": -192.9154510498047, "logps/rejected": -210.54116821289062, "loss": 0.624, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01078636385500431, "rewards/margins": 0.14916051924228668, "rewards/margins_max": 0.21048632264137268, "rewards/margins_min": 0.08783473819494247, "rewards/margins_std": 0.08672776818275452, "rewards/rejected": -0.15994688868522644, "step": 3050 }, { "epoch": 0.77, "grad_norm": 0.41015625, "learning_rate": 7.63564737577538e-08, "logits/chosen": 0.18104226887226105, "logits/rejected": 0.46887174248695374, "logps/chosen": -207.02169799804688, "logps/rejected": -221.676025390625, "loss": 0.626, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.015064379200339317, "rewards/margins": 0.1299816220998764, "rewards/margins_max": 0.18708129227161407, "rewards/margins_min": 0.07288195937871933, "rewards/margins_std": 0.08075112104415894, "rewards/rejected": -0.14504601061344147, "step": 3060 }, { "epoch": 0.77, "grad_norm": 0.46484375, "learning_rate": 7.478312765570746e-08, "logits/chosen": 0.32956480979919434, "logits/rejected": 0.5469153523445129, "logps/chosen": -198.80450439453125, "logps/rejected": -222.2632598876953, "loss": 0.6249, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.01266825757920742, "rewards/margins": 0.14634735882282257, "rewards/margins_max": 0.2088189572095871, "rewards/margins_min": 0.08387576043605804, "rewards/margins_std": 0.08834818005561829, "rewards/rejected": -0.15901562571525574, "step": 3070 }, { "epoch": 0.77, "grad_norm": 0.41796875, "learning_rate": 7.322330470336313e-08, "logits/chosen": 0.2729640305042267, "logits/rejected": 0.5077253580093384, "logps/chosen": -216.9774627685547, "logps/rejected": -219.729736328125, "loss": 0.6198, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.007791057229042053, "rewards/margins": 0.16394273936748505, "rewards/margins_max": 0.21674828231334686, "rewards/margins_min": 0.11113719642162323, "rewards/margins_std": 0.07467831671237946, "rewards/rejected": -0.1717337965965271, "step": 3080 }, { "epoch": 0.78, "grad_norm": 0.412109375, "learning_rate": 7.167712528706843e-08, "logits/chosen": 0.1362188160419464, "logits/rejected": 0.40041661262512207, "logps/chosen": -236.1621856689453, "logps/rejected": -222.1932830810547, "loss": 0.6194, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.012866659089922905, "rewards/margins": 0.15987268090248108, "rewards/margins_max": 0.21934957802295685, "rewards/margins_min": 0.1003957986831665, "rewards/margins_std": 0.08411302417516708, "rewards/rejected": -0.17273934185504913, "step": 3090 }, { "epoch": 0.78, "grad_norm": 0.416015625, "learning_rate": 7.014470874016981e-08, "logits/chosen": 0.24159541726112366, "logits/rejected": 0.4540305733680725, "logps/chosen": -198.06089782714844, "logps/rejected": -231.21859741210938, "loss": 0.623, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.00808451697230339, "rewards/margins": 0.163883239030838, "rewards/margins_max": 0.23084087669849396, "rewards/margins_min": 0.09692560136318207, "rewards/margins_std": 0.0946924015879631, "rewards/rejected": -0.1719677448272705, "step": 3100 }, { "epoch": 0.78, "grad_norm": 0.474609375, "learning_rate": 6.862617333380213e-08, "logits/chosen": 0.04986204952001572, "logits/rejected": 0.22950272262096405, "logps/chosen": -193.28591918945312, "logps/rejected": -225.59683227539062, "loss": 0.6264, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.006926494650542736, "rewards/margins": 0.1489885151386261, "rewards/margins_max": 0.21327200531959534, "rewards/margins_min": 0.08470507711172104, "rewards/margins_std": 0.09091053903102875, "rewards/rejected": -0.1559150218963623, "step": 3110 }, { "epoch": 0.79, "grad_norm": 0.451171875, "learning_rate": 6.712163626776116e-08, "logits/chosen": 0.051450151950120926, "logits/rejected": 0.40259242057800293, "logps/chosen": -231.8721923828125, "logps/rejected": -251.7601776123047, "loss": 0.6208, "rewards/accuracies": 1.0, "rewards/chosen": -0.005096064880490303, "rewards/margins": 0.16292552649974823, "rewards/margins_max": 0.22777943313121796, "rewards/margins_min": 0.0980716347694397, "rewards/margins_std": 0.09171726554632187, "rewards/rejected": -0.16802158951759338, "step": 3120 }, { "epoch": 0.79, "grad_norm": 0.4453125, "learning_rate": 6.563121366145758e-08, "logits/chosen": 0.11294182389974594, "logits/rejected": 0.45181140303611755, "logps/chosen": -219.5288848876953, "logps/rejected": -231.9810333251953, "loss": 0.6276, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.014740655198693275, "rewards/margins": 0.14601239562034607, "rewards/margins_max": 0.20798468589782715, "rewards/margins_min": 0.08404012024402618, "rewards/margins_std": 0.08764202892780304, "rewards/rejected": -0.1607530415058136, "step": 3130 }, { "epoch": 0.79, "grad_norm": 0.478515625, "learning_rate": 6.415502054495539e-08, "logits/chosen": 0.14309485256671906, "logits/rejected": 0.45276278257369995, "logps/chosen": -228.79171752929688, "logps/rejected": -233.21902465820312, "loss": 0.6181, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.005436851177364588, "rewards/margins": 0.17040897905826569, "rewards/margins_max": 0.23482480645179749, "rewards/margins_min": 0.10599315166473389, "rewards/margins_std": 0.09109772741794586, "rewards/rejected": -0.17584583163261414, "step": 3140 }, { "epoch": 0.79, "grad_norm": 0.435546875, "learning_rate": 6.269317085009362e-08, "logits/chosen": 0.2758491635322571, "logits/rejected": 0.6270065307617188, "logps/chosen": -210.78067016601562, "logps/rejected": -217.84274291992188, "loss": 0.6191, "rewards/accuracies": 0.9375, "rewards/chosen": -0.012884063646197319, "rewards/margins": 0.14818808436393738, "rewards/margins_max": 0.2214541733264923, "rewards/margins_min": 0.07492204010486603, "rewards/margins_std": 0.10361386835575104, "rewards/rejected": -0.16107216477394104, "step": 3150 }, { "epoch": 0.8, "grad_norm": 0.65625, "learning_rate": 6.124577740169326e-08, "logits/chosen": 0.12650275230407715, "logits/rejected": 0.2816252112388611, "logps/chosen": -215.2202911376953, "logps/rejected": -250.96603393554688, "loss": 0.6226, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.015245141461491585, "rewards/margins": 0.144632026553154, "rewards/margins_max": 0.20244213938713074, "rewards/margins_min": 0.08682187646627426, "rewards/margins_std": 0.08175589144229889, "rewards/rejected": -0.15987718105316162, "step": 3160 }, { "epoch": 0.8, "grad_norm": 0.443359375, "learning_rate": 5.981295190884961e-08, "logits/chosen": 0.29551905393600464, "logits/rejected": 0.6196221113204956, "logps/chosen": -226.7298583984375, "logps/rejected": -248.25308227539062, "loss": 0.6324, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.012598914094269276, "rewards/margins": 0.12753096222877502, "rewards/margins_max": 0.18140225112438202, "rewards/margins_min": 0.07365966588258743, "rewards/margins_std": 0.07618552446365356, "rewards/rejected": -0.14012984931468964, "step": 3170 }, { "epoch": 0.8, "grad_norm": 0.515625, "learning_rate": 5.839480495631049e-08, "logits/chosen": 0.21646694839000702, "logits/rejected": 0.46308571100234985, "logps/chosen": -215.9964141845703, "logps/rejected": -265.9897155761719, "loss": 0.6248, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.020272348076105118, "rewards/margins": 0.14989283680915833, "rewards/margins_max": 0.22070348262786865, "rewards/margins_min": 0.07908222824335098, "rewards/margins_std": 0.10014134645462036, "rewards/rejected": -0.17016521096229553, "step": 3180 }, { "epoch": 0.8, "grad_norm": 0.361328125, "learning_rate": 5.69914459959413e-08, "logits/chosen": 0.13232514262199402, "logits/rejected": 0.4126739501953125, "logps/chosen": -204.1754913330078, "logps/rejected": -213.9013671875, "loss": 0.6236, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.002058145124465227, "rewards/margins": 0.14735762774944305, "rewards/margins_max": 0.22531196475028992, "rewards/margins_min": 0.0694032534956932, "rewards/margins_std": 0.11024411767721176, "rewards/rejected": -0.1494157761335373, "step": 3190 }, { "epoch": 0.81, "grad_norm": 0.373046875, "learning_rate": 5.560298333827782e-08, "logits/chosen": 0.1216355413198471, "logits/rejected": 0.43593940138816833, "logps/chosen": -198.83663940429688, "logps/rejected": -205.1917724609375, "loss": 0.6283, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.024115536361932755, "rewards/margins": 0.1255904883146286, "rewards/margins_max": 0.1869795024394989, "rewards/margins_min": 0.06420144438743591, "rewards/margins_std": 0.0868171975016594, "rewards/rejected": -0.14970602095127106, "step": 3200 }, { "epoch": 0.81, "grad_norm": 0.3828125, "learning_rate": 5.422952414416648e-08, "logits/chosen": 0.12433646619319916, "logits/rejected": 0.5170078277587891, "logps/chosen": -200.92434692382812, "logps/rejected": -191.41696166992188, "loss": 0.6272, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.018672823905944824, "rewards/margins": 0.1293351650238037, "rewards/margins_max": 0.2056567668914795, "rewards/margins_min": 0.05301358178257942, "rewards/margins_std": 0.10793503373861313, "rewards/rejected": -0.14800798892974854, "step": 3210 }, { "epoch": 0.81, "grad_norm": 0.5546875, "learning_rate": 5.2871174416494246e-08, "logits/chosen": 0.26231053471565247, "logits/rejected": 0.5368086099624634, "logps/chosen": -216.56167602539062, "logps/rejected": -235.6735382080078, "loss": 0.6283, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.02093801461160183, "rewards/margins": 0.15397703647613525, "rewards/margins_max": 0.2333345115184784, "rewards/margins_min": 0.07461957633495331, "rewards/margins_std": 0.1122283935546875, "rewards/rejected": -0.17491504549980164, "step": 3220 }, { "epoch": 0.81, "grad_norm": 0.48046875, "learning_rate": 5.1528038992007e-08, "logits/chosen": 0.05730471760034561, "logits/rejected": 0.34825384616851807, "logps/chosen": -207.71719360351562, "logps/rejected": -233.8785400390625, "loss": 0.6201, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.015006333589553833, "rewards/margins": 0.1500559151172638, "rewards/margins_max": 0.23772725462913513, "rewards/margins_min": 0.062384605407714844, "rewards/margins_std": 0.12398598343133926, "rewards/rejected": -0.16506226360797882, "step": 3230 }, { "epoch": 0.82, "grad_norm": 0.392578125, "learning_rate": 5.020022153321823e-08, "logits/chosen": 0.09996681660413742, "logits/rejected": 0.4557221829891205, "logps/chosen": -195.9675750732422, "logps/rejected": -204.02793884277344, "loss": 0.6242, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.006736955605447292, "rewards/margins": 0.14469148218631744, "rewards/margins_max": 0.2036510705947876, "rewards/margins_min": 0.0857318863272667, "rewards/margins_std": 0.08338145166635513, "rewards/rejected": -0.1514284312725067, "step": 3240 }, { "epoch": 0.82, "grad_norm": 0.408203125, "learning_rate": 4.888782452040885e-08, "logits/chosen": 0.21130314469337463, "logits/rejected": 0.4518766403198242, "logps/chosen": -195.22560119628906, "logps/rejected": -224.5800323486328, "loss": 0.6165, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.011300037615001202, "rewards/margins": 0.16262060403823853, "rewards/margins_max": 0.24175962805747986, "rewards/margins_min": 0.0834815502166748, "rewards/margins_std": 0.11191952228546143, "rewards/rejected": -0.1739206314086914, "step": 3250 }, { "epoch": 0.82, "grad_norm": 0.431640625, "learning_rate": 4.759094924371732e-08, "logits/chosen": 0.22043880820274353, "logits/rejected": 0.540926456451416, "logps/chosen": -192.521728515625, "logps/rejected": -216.2484130859375, "loss": 0.6227, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.006431704852730036, "rewards/margins": 0.16023923456668854, "rewards/margins_max": 0.2194247990846634, "rewards/margins_min": 0.10105365514755249, "rewards/margins_std": 0.08370102941989899, "rewards/rejected": -0.16667093336582184, "step": 3260 }, { "epoch": 0.82, "grad_norm": 0.45703125, "learning_rate": 4.6309695795322315e-08, "logits/chosen": 0.26144370436668396, "logits/rejected": 0.5330118536949158, "logps/chosen": -211.96853637695312, "logps/rejected": -207.638427734375, "loss": 0.6099, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.006718629505485296, "rewards/margins": 0.18218782544136047, "rewards/margins_max": 0.2676844000816345, "rewards/margins_min": 0.09669125080108643, "rewards/margins_std": 0.12091042846441269, "rewards/rejected": -0.17546920478343964, "step": 3270 }, { "epoch": 0.83, "grad_norm": 0.46484375, "learning_rate": 4.504416306171796e-08, "logits/chosen": 0.11534128338098526, "logits/rejected": 0.5749226212501526, "logps/chosen": -257.9087219238281, "logps/rejected": -258.05767822265625, "loss": 0.6251, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.025025730952620506, "rewards/margins": 0.1406787931919098, "rewards/margins_max": 0.18732953071594238, "rewards/margins_min": 0.094028040766716, "rewards/margins_std": 0.06597410887479782, "rewards/rejected": -0.16570451855659485, "step": 3280 }, { "epoch": 0.83, "grad_norm": 0.400390625, "learning_rate": 4.3794448716081236e-08, "logits/chosen": 0.14908891916275024, "logits/rejected": 0.4629700779914856, "logps/chosen": -216.52182006835938, "logps/rejected": -232.32540893554688, "loss": 0.62, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.009620030410587788, "rewards/margins": 0.16076137125492096, "rewards/margins_max": 0.2331482470035553, "rewards/margins_min": 0.08837450295686722, "rewards/margins_std": 0.1023704782128334, "rewards/rejected": -0.17038139700889587, "step": 3290 }, { "epoch": 0.83, "grad_norm": 0.400390625, "learning_rate": 4.2560649210734184e-08, "logits/chosen": 0.15376536548137665, "logits/rejected": 0.5315260887145996, "logps/chosen": -222.3070068359375, "logps/rejected": -236.8693389892578, "loss": 0.6308, "rewards/accuracies": 0.9375, "rewards/chosen": -0.014246064238250256, "rewards/margins": 0.13371500372886658, "rewards/margins_max": 0.19171631336212158, "rewards/margins_min": 0.07571368664503098, "rewards/margins_std": 0.08202625066041946, "rewards/rejected": -0.1479610651731491, "step": 3300 }, { "epoch": 0.83, "grad_norm": 0.494140625, "learning_rate": 4.134285976969948e-08, "logits/chosen": 0.012613398022949696, "logits/rejected": 0.3860972225666046, "logps/chosen": -215.45736694335938, "logps/rejected": -236.6368408203125, "loss": 0.6216, "rewards/accuracies": 0.9375, "rewards/chosen": -0.008860927075147629, "rewards/margins": 0.15610839426517487, "rewards/margins_max": 0.23083467781543732, "rewards/margins_min": 0.08138208836317062, "rewards/margins_std": 0.10567893832921982, "rewards/rejected": -0.1649693250656128, "step": 3310 }, { "epoch": 0.84, "grad_norm": 0.41015625, "learning_rate": 4.014117438135087e-08, "logits/chosen": 0.13868138194084167, "logits/rejected": 0.583828866481781, "logps/chosen": -220.776123046875, "logps/rejected": -216.2689208984375, "loss": 0.6218, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.01833544299006462, "rewards/margins": 0.15705737471580505, "rewards/margins_max": 0.22352662682533264, "rewards/margins_min": 0.09058809280395508, "rewards/margins_std": 0.09400173276662827, "rewards/rejected": -0.17539279162883759, "step": 3320 }, { "epoch": 0.84, "grad_norm": 0.421875, "learning_rate": 3.895568579115982e-08, "logits/chosen": 0.15903417766094208, "logits/rejected": 0.4794175624847412, "logps/chosen": -215.1283721923828, "logps/rejected": -233.633056640625, "loss": 0.6275, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.012763378210365772, "rewards/margins": 0.14582587778568268, "rewards/margins_max": 0.2024238407611847, "rewards/margins_min": 0.08922794461250305, "rewards/margins_std": 0.08004157990217209, "rewards/rejected": -0.15858925879001617, "step": 3330 }, { "epoch": 0.84, "grad_norm": 0.45703125, "learning_rate": 3.778648549453672e-08, "logits/chosen": 0.17751941084861755, "logits/rejected": 0.5122814774513245, "logps/chosen": -221.0343017578125, "logps/rejected": -225.0925750732422, "loss": 0.6237, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.028264153748750687, "rewards/margins": 0.14352814853191376, "rewards/margins_max": 0.19432470202445984, "rewards/margins_min": 0.09273160994052887, "rewards/margins_std": 0.0718371644616127, "rewards/rejected": -0.17179229855537415, "step": 3340 }, { "epoch": 0.84, "grad_norm": 0.423828125, "learning_rate": 3.663366372977e-08, "logits/chosen": 0.14561694860458374, "logits/rejected": 0.49741190671920776, "logps/chosen": -206.4697723388672, "logps/rejected": -220.1299285888672, "loss": 0.6274, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.015442503616213799, "rewards/margins": 0.14094342291355133, "rewards/margins_max": 0.2111312448978424, "rewards/margins_min": 0.07075560837984085, "rewards/margins_std": 0.09926055371761322, "rewards/rejected": -0.15638591349124908, "step": 3350 }, { "epoch": 0.85, "grad_norm": 0.46875, "learning_rate": 3.5497309471061193e-08, "logits/chosen": 0.2309088408946991, "logits/rejected": 0.5552518367767334, "logps/chosen": -222.244140625, "logps/rejected": -245.16860961914062, "loss": 0.6229, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.010492782108485699, "rewards/margins": 0.14994525909423828, "rewards/margins_max": 0.20558050274848938, "rewards/margins_min": 0.09431000798940659, "rewards/margins_std": 0.0786801129579544, "rewards/rejected": -0.16043803095817566, "step": 3360 }, { "epoch": 0.85, "grad_norm": 0.478515625, "learning_rate": 3.43775104216579e-08, "logits/chosen": 0.18579712510108948, "logits/rejected": 0.4238066077232361, "logps/chosen": -198.09408569335938, "logps/rejected": -227.15322875976562, "loss": 0.6225, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0045109400525689125, "rewards/margins": 0.14594948291778564, "rewards/margins_max": 0.21625879406929016, "rewards/margins_min": 0.07564017921686172, "rewards/margins_std": 0.09943237155675888, "rewards/rejected": -0.15046042203903198, "step": 3370 }, { "epoch": 0.85, "grad_norm": 0.4453125, "learning_rate": 3.3274353007085114e-08, "logits/chosen": 0.11678584665060043, "logits/rejected": 0.5011313557624817, "logps/chosen": -218.9866180419922, "logps/rejected": -246.6363983154297, "loss": 0.6259, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.015473976731300354, "rewards/margins": 0.14063812792301178, "rewards/margins_max": 0.19851435720920563, "rewards/margins_min": 0.08276190608739853, "rewards/margins_std": 0.0818493440747261, "rewards/rejected": -0.15611210465431213, "step": 3380 }, { "epoch": 0.85, "grad_norm": 0.41796875, "learning_rate": 3.2187922368474954e-08, "logits/chosen": 0.22834794223308563, "logits/rejected": 0.46402662992477417, "logps/chosen": -238.57455444335938, "logps/rejected": -232.40298461914062, "loss": 0.6256, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.02530672773718834, "rewards/margins": 0.12855622172355652, "rewards/margins_max": 0.19385860860347748, "rewards/margins_min": 0.06325386464595795, "rewards/margins_std": 0.09235149621963501, "rewards/rejected": -0.15386296808719635, "step": 3390 }, { "epoch": 0.86, "grad_norm": 0.3984375, "learning_rate": 3.111830235599519e-08, "logits/chosen": 0.12516704201698303, "logits/rejected": 0.5988355278968811, "logps/chosen": -232.6156005859375, "logps/rejected": -207.4446563720703, "loss": 0.629, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.01665051281452179, "rewards/margins": 0.1352086365222931, "rewards/margins_max": 0.19865334033966064, "rewards/margins_min": 0.07176395505666733, "rewards/margins_std": 0.0897243469953537, "rewards/rejected": -0.15185916423797607, "step": 3400 }, { "epoch": 0.86, "grad_norm": 0.4140625, "learning_rate": 3.006557552237826e-08, "logits/chosen": 0.27945616841316223, "logits/rejected": 0.7063152194023132, "logps/chosen": -234.36636352539062, "logps/rejected": -218.6753387451172, "loss": 0.6147, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.01583377830684185, "rewards/margins": 0.15065374970436096, "rewards/margins_max": 0.2287486046552658, "rewards/margins_min": 0.07255889475345612, "rewards/margins_std": 0.11044280230998993, "rewards/rejected": -0.16648752987384796, "step": 3410 }, { "epoch": 0.86, "grad_norm": 0.408203125, "learning_rate": 2.9029823116549307e-08, "logits/chosen": 0.1043904647231102, "logits/rejected": 0.36490142345428467, "logps/chosen": -202.95152282714844, "logps/rejected": -226.87008666992188, "loss": 0.623, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.01607043482363224, "rewards/margins": 0.1644206941127777, "rewards/margins_max": 0.22772078216075897, "rewards/margins_min": 0.10112065076828003, "rewards/margins_std": 0.08951981365680695, "rewards/rejected": -0.1804911196231842, "step": 3420 }, { "epoch": 0.86, "grad_norm": 0.443359375, "learning_rate": 2.8011125077355868e-08, "logits/chosen": 0.22078931331634521, "logits/rejected": 0.43829792737960815, "logps/chosen": -195.32461547851562, "logps/rejected": -235.56689453125, "loss": 0.6202, "rewards/accuracies": 0.9375, "rewards/chosen": -0.014973275363445282, "rewards/margins": 0.1381199061870575, "rewards/margins_max": 0.1917978674173355, "rewards/margins_min": 0.08444195240736008, "rewards/margins_std": 0.07591209560632706, "rewards/rejected": -0.15309318900108337, "step": 3430 }, { "epoch": 0.87, "grad_norm": 0.49609375, "learning_rate": 2.7009560027398125e-08, "logits/chosen": 0.30612367391586304, "logits/rejected": 0.7241549491882324, "logps/chosen": -244.1378173828125, "logps/rejected": -252.7650604248047, "loss": 0.6235, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.020922096446156502, "rewards/margins": 0.13570961356163025, "rewards/margins_max": 0.19178488850593567, "rewards/margins_min": 0.07963432371616364, "rewards/margins_std": 0.07930243760347366, "rewards/rejected": -0.1566317081451416, "step": 3440 }, { "epoch": 0.87, "grad_norm": 0.443359375, "learning_rate": 2.6025205266960587e-08, "logits/chosen": 0.1889585703611374, "logits/rejected": 0.7006251811981201, "logps/chosen": -259.59271240234375, "logps/rejected": -223.6729278564453, "loss": 0.6298, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0160837359726429, "rewards/margins": 0.13242119550704956, "rewards/margins_max": 0.1900266408920288, "rewards/margins_min": 0.07481571286916733, "rewards/margins_std": 0.0814664289355278, "rewards/rejected": -0.14850492775440216, "step": 3450 }, { "epoch": 0.87, "grad_norm": 0.474609375, "learning_rate": 2.505813676804652e-08, "logits/chosen": 0.15270619094371796, "logits/rejected": 0.4426157474517822, "logps/chosen": -234.78604125976562, "logps/rejected": -238.5996551513672, "loss": 0.6256, "rewards/accuracies": 0.9375, "rewards/chosen": -0.011501345783472061, "rewards/margins": 0.15211376547813416, "rewards/margins_max": 0.2341291606426239, "rewards/margins_min": 0.07009837031364441, "rewards/margins_std": 0.11598727852106094, "rewards/rejected": -0.16361510753631592, "step": 3460 }, { "epoch": 0.87, "grad_norm": 0.37109375, "learning_rate": 2.4108429168514245e-08, "logits/chosen": 0.20349092781543732, "logits/rejected": 0.6541525721549988, "logps/chosen": -226.94345092773438, "logps/rejected": -217.9384765625, "loss": 0.6307, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01658671535551548, "rewards/margins": 0.1279931366443634, "rewards/margins_max": 0.18294529616832733, "rewards/margins_min": 0.07304098457098007, "rewards/margins_std": 0.07771409302949905, "rewards/rejected": -0.14457984268665314, "step": 3470 }, { "epoch": 0.88, "grad_norm": 0.40625, "learning_rate": 2.3176155766316486e-08, "logits/chosen": 0.2457670271396637, "logits/rejected": 0.550891637802124, "logps/chosen": -211.3545684814453, "logps/rejected": -208.39163208007812, "loss": 0.6231, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.009505241177976131, "rewards/margins": 0.1423722207546234, "rewards/margins_max": 0.21473953127861023, "rewards/margins_min": 0.0700049176812172, "rewards/margins_std": 0.10234282910823822, "rewards/rejected": -0.15187746286392212, "step": 3480 }, { "epoch": 0.88, "grad_norm": 0.42578125, "learning_rate": 2.2261388513843516e-08, "logits/chosen": 0.0553862564265728, "logits/rejected": 0.4699821472167969, "logps/chosen": -221.19204711914062, "logps/rejected": -219.5061798095703, "loss": 0.6223, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.01753642037510872, "rewards/margins": 0.1534227579832077, "rewards/margins_max": 0.2139158695936203, "rewards/margins_min": 0.0929296463727951, "rewards/margins_std": 0.08555018156766891, "rewards/rejected": -0.17095918953418732, "step": 3490 }, { "epoch": 0.88, "grad_norm": 0.41796875, "learning_rate": 2.1364198012369883e-08, "logits/chosen": 0.10180866718292236, "logits/rejected": 0.3975290060043335, "logps/chosen": -192.74179077148438, "logps/rejected": -208.12094116210938, "loss": 0.624, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.012156123295426369, "rewards/margins": 0.14519140124320984, "rewards/margins_max": 0.21287605166435242, "rewards/margins_min": 0.07750679552555084, "rewards/margins_std": 0.09572052955627441, "rewards/rejected": -0.15734753012657166, "step": 3500 }, { "epoch": 0.88, "grad_norm": 0.4609375, "learning_rate": 2.048465350660522e-08, "logits/chosen": 0.08099778741598129, "logits/rejected": 0.5274533033370972, "logps/chosen": -256.3960266113281, "logps/rejected": -240.3711395263672, "loss": 0.6323, "rewards/accuracies": 0.9375, "rewards/chosen": -0.022899020463228226, "rewards/margins": 0.1262959986925125, "rewards/margins_max": 0.17493896186351776, "rewards/margins_min": 0.07765305042266846, "rewards/margins_std": 0.06879152357578278, "rewards/rejected": -0.14919503033161163, "step": 3510 }, { "epoch": 0.89, "grad_norm": 0.416015625, "learning_rate": 1.9622822879350297e-08, "logits/chosen": 0.18845385313034058, "logits/rejected": 0.5498403310775757, "logps/chosen": -218.7178192138672, "logps/rejected": -218.140869140625, "loss": 0.6289, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.013486932031810284, "rewards/margins": 0.14780057966709137, "rewards/margins_max": 0.2192755490541458, "rewards/margins_min": 0.07632561028003693, "rewards/margins_std": 0.10108087956905365, "rewards/rejected": -0.16128750145435333, "step": 3520 }, { "epoch": 0.89, "grad_norm": 0.40625, "learning_rate": 1.877877264625749e-08, "logits/chosen": 0.09181084483861923, "logits/rejected": 0.44030362367630005, "logps/chosen": -222.36746215820312, "logps/rejected": -238.787109375, "loss": 0.6291, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.011573871597647667, "rewards/margins": 0.1379801481962204, "rewards/margins_max": 0.19014567136764526, "rewards/margins_min": 0.08581461012363434, "rewards/margins_std": 0.07377320528030396, "rewards/rejected": -0.14955401420593262, "step": 3530 }, { "epoch": 0.89, "grad_norm": 0.466796875, "learning_rate": 1.7952567950697524e-08, "logits/chosen": 0.23057833313941956, "logits/rejected": 0.5786561369895935, "logps/chosen": -209.0341796875, "logps/rejected": -225.7135772705078, "loss": 0.6237, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.011088917963206768, "rewards/margins": 0.13525059819221497, "rewards/margins_max": 0.20516173541545868, "rewards/margins_min": 0.06533943861722946, "rewards/margins_std": 0.09886928647756577, "rewards/rejected": -0.1463395059108734, "step": 3540 }, { "epoch": 0.89, "grad_norm": 0.46875, "learning_rate": 1.7144272558731465e-08, "logits/chosen": 0.15487629175186157, "logits/rejected": 0.44518956542015076, "logps/chosen": -205.5785369873047, "logps/rejected": -261.6163635253906, "loss": 0.624, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.013380967080593109, "rewards/margins": 0.1502249538898468, "rewards/margins_max": 0.21569721400737762, "rewards/margins_min": 0.08475267887115479, "rewards/margins_std": 0.09259176254272461, "rewards/rejected": -0.16360591351985931, "step": 3550 }, { "epoch": 0.9, "grad_norm": 0.466796875, "learning_rate": 1.6353948854189335e-08, "logits/chosen": 0.1412431001663208, "logits/rejected": 0.48025646805763245, "logps/chosen": -214.26284790039062, "logps/rejected": -224.42929077148438, "loss": 0.6226, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.006520640105009079, "rewards/margins": 0.14878708124160767, "rewards/margins_max": 0.21710875630378723, "rewards/margins_min": 0.08046545088291168, "rewards/margins_std": 0.09662137925624847, "rewards/rejected": -0.15530773997306824, "step": 3560 }, { "epoch": 0.9, "grad_norm": 0.52734375, "learning_rate": 1.558165783385551e-08, "logits/chosen": -0.06958173215389252, "logits/rejected": 0.4530336856842041, "logps/chosen": -237.4164581298828, "logps/rejected": -232.8388214111328, "loss": 0.6259, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.012647953815758228, "rewards/margins": 0.1542251855134964, "rewards/margins_max": 0.21820297837257385, "rewards/margins_min": 0.09024737775325775, "rewards/margins_std": 0.09047825634479523, "rewards/rejected": -0.16687312722206116, "step": 3570 }, { "epoch": 0.9, "grad_norm": 0.48828125, "learning_rate": 1.4827459102760758e-08, "logits/chosen": 0.1923970878124237, "logits/rejected": 0.5918298363685608, "logps/chosen": -231.77603149414062, "logps/rejected": -214.3612823486328, "loss": 0.6275, "rewards/accuracies": 0.9375, "rewards/chosen": -0.007002467755228281, "rewards/margins": 0.13359227776527405, "rewards/margins_max": 0.18523195385932922, "rewards/margins_min": 0.08195260167121887, "rewards/margins_std": 0.073029525578022, "rewards/rejected": -0.1405947357416153, "step": 3580 }, { "epoch": 0.9, "grad_norm": 0.447265625, "learning_rate": 1.4091410869582265e-08, "logits/chosen": 0.33185145258903503, "logits/rejected": 0.6642862558364868, "logps/chosen": -198.0188446044922, "logps/rejected": -207.3836669921875, "loss": 0.6241, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.010471848770976067, "rewards/margins": 0.14225298166275024, "rewards/margins_max": 0.21028229594230652, "rewards/margins_min": 0.07422365993261337, "rewards/margins_std": 0.09620799869298935, "rewards/rejected": -0.15272483229637146, "step": 3590 }, { "epoch": 0.91, "grad_norm": 0.40234375, "learning_rate": 1.3373569942150803e-08, "logits/chosen": 0.05179264023900032, "logits/rejected": 0.49000033736228943, "logps/chosen": -228.24319458007812, "logps/rejected": -224.9072723388672, "loss": 0.6295, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.015899891033768654, "rewards/margins": 0.16258960962295532, "rewards/margins_max": 0.23546075820922852, "rewards/margins_min": 0.08971847593784332, "rewards/margins_std": 0.10305535793304443, "rewards/rejected": -0.17848949134349823, "step": 3600 }, { "epoch": 0.91, "grad_norm": 0.392578125, "learning_rate": 1.267399172306663e-08, "logits/chosen": 0.10965637117624283, "logits/rejected": 0.4788391590118408, "logps/chosen": -234.5061492919922, "logps/rejected": -231.590087890625, "loss": 0.6249, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.01606924459338188, "rewards/margins": 0.1393829733133316, "rewards/margins_max": 0.2047717571258545, "rewards/margins_min": 0.07399418205022812, "rewards/margins_std": 0.09247370064258575, "rewards/rejected": -0.1554522067308426, "step": 3610 }, { "epoch": 0.91, "grad_norm": 0.44140625, "learning_rate": 1.1992730205423268e-08, "logits/chosen": 0.29971417784690857, "logits/rejected": 0.7055215835571289, "logps/chosen": -239.8339080810547, "logps/rejected": -219.6548309326172, "loss": 0.6224, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.018544862046837807, "rewards/margins": 0.14643549919128418, "rewards/margins_max": 0.219753697514534, "rewards/margins_min": 0.07311731576919556, "rewards/margins_std": 0.1036875993013382, "rewards/rejected": -0.16498038172721863, "step": 3620 }, { "epoch": 0.91, "grad_norm": 0.9296875, "learning_rate": 1.1329837968640538e-08, "logits/chosen": 0.2418794333934784, "logits/rejected": 0.6269633173942566, "logps/chosen": -230.0504608154297, "logps/rejected": -235.80770874023438, "loss": 0.6236, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.007086062338203192, "rewards/margins": 0.15617170929908752, "rewards/margins_max": 0.21963337063789368, "rewards/margins_min": 0.09271007776260376, "rewards/margins_std": 0.0897483080625534, "rewards/rejected": -0.16325779259204865, "step": 3630 }, { "epoch": 0.92, "grad_norm": 0.4296875, "learning_rate": 1.0685366174406407e-08, "logits/chosen": 0.11754278838634491, "logits/rejected": 0.3370839059352875, "logps/chosen": -180.74795532226562, "logps/rejected": -222.08981323242188, "loss": 0.6133, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.007735075894743204, "rewards/margins": 0.17414657771587372, "rewards/margins_max": 0.25469130277633667, "rewards/margins_min": 0.09360190480947495, "rewards/margins_std": 0.11390741169452667, "rewards/rejected": -0.18188168108463287, "step": 3640 }, { "epoch": 0.92, "grad_norm": 0.41015625, "learning_rate": 1.0059364562728517e-08, "logits/chosen": 0.09660269320011139, "logits/rejected": 0.4383271634578705, "logps/chosen": -224.48855590820312, "logps/rejected": -246.1910858154297, "loss": 0.6186, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.0118337944149971, "rewards/margins": 0.16131319105625153, "rewards/margins_max": 0.21953459084033966, "rewards/margins_min": 0.10309179127216339, "rewards/margins_std": 0.08233748376369476, "rewards/rejected": -0.17314699292182922, "step": 3650 }, { "epoch": 0.92, "grad_norm": 0.44140625, "learning_rate": 9.451881448094945e-09, "logits/chosen": 0.12552067637443542, "logits/rejected": 0.4242735505104065, "logps/chosen": -226.05575561523438, "logps/rejected": -256.68621826171875, "loss": 0.63, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.004671765957027674, "rewards/margins": 0.1497805267572403, "rewards/margins_max": 0.22058932483196259, "rewards/margins_min": 0.07897171378135681, "rewards/margins_std": 0.10013879835605621, "rewards/rejected": -0.15445229411125183, "step": 3660 }, { "epoch": 0.92, "grad_norm": 0.42578125, "learning_rate": 8.862963715745685e-09, "logits/chosen": 0.3190884590148926, "logits/rejected": 0.4993259906768799, "logps/chosen": -204.91552734375, "logps/rejected": -236.74575805664062, "loss": 0.6261, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.012486386112868786, "rewards/margins": 0.16221530735492706, "rewards/margins_max": 0.2286953181028366, "rewards/margins_min": 0.09573531150817871, "rewards/margins_std": 0.0940169245004654, "rewards/rejected": -0.17470169067382812, "step": 3670 }, { "epoch": 0.93, "grad_norm": 0.51171875, "learning_rate": 8.292656818053855e-09, "logits/chosen": 0.2774728536605835, "logits/rejected": 0.5323530435562134, "logps/chosen": -236.22982788085938, "logps/rejected": -267.6929016113281, "loss": 0.6219, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.008514724671840668, "rewards/margins": 0.16985894739627838, "rewards/margins_max": 0.24397945404052734, "rewards/margins_min": 0.09573845565319061, "rewards/margins_std": 0.10482219606637955, "rewards/rejected": -0.17837366461753845, "step": 3680 }, { "epoch": 0.93, "grad_norm": 0.54296875, "learning_rate": 7.74100477101769e-09, "logits/chosen": 0.1537213921546936, "logits/rejected": 0.5959830284118652, "logps/chosen": -262.23541259765625, "logps/rejected": -248.9879608154297, "loss": 0.6212, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01718863844871521, "rewards/margins": 0.14395023882389069, "rewards/margins_max": 0.2045532464981079, "rewards/margins_min": 0.08334719389677048, "rewards/margins_std": 0.08570563048124313, "rewards/rejected": -0.1611388623714447, "step": 3690 }, { "epoch": 0.93, "grad_norm": 0.41796875, "learning_rate": 7.2080501508635925e-09, "logits/chosen": 0.23357871174812317, "logits/rejected": 0.575967013835907, "logps/chosen": -233.1752166748047, "logps/rejected": -204.51193237304688, "loss": 0.6301, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.00857073999941349, "rewards/margins": 0.12167960405349731, "rewards/margins_max": 0.17322678864002228, "rewards/margins_min": 0.07013241946697235, "rewards/margins_std": 0.072898730635643, "rewards/rejected": -0.13025033473968506, "step": 3700 }, { "epoch": 0.93, "grad_norm": 0.4921875, "learning_rate": 6.693834090759909e-09, "logits/chosen": 0.011134648695588112, "logits/rejected": 0.36169159412384033, "logps/chosen": -205.3801727294922, "logps/rejected": -224.67868041992188, "loss": 0.6298, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.017876390367746353, "rewards/margins": 0.12985476851463318, "rewards/margins_max": 0.18134212493896484, "rewards/margins_min": 0.07836741209030151, "rewards/margins_std": 0.07281412184238434, "rewards/rejected": -0.14773115515708923, "step": 3710 }, { "epoch": 0.94, "grad_norm": 0.478515625, "learning_rate": 6.1983962776424314e-09, "logits/chosen": 0.04568805173039436, "logits/rejected": 0.28506359457969666, "logps/chosen": -210.31790161132812, "logps/rejected": -231.1966094970703, "loss": 0.6228, "rewards/accuracies": 0.9375, "rewards/chosen": -0.010566742159426212, "rewards/margins": 0.15785974264144897, "rewards/margins_max": 0.230033278465271, "rewards/margins_min": 0.08568622916936874, "rewards/margins_std": 0.10206875950098038, "rewards/rejected": -0.1684264838695526, "step": 3720 }, { "epoch": 0.94, "grad_norm": 0.43359375, "learning_rate": 5.721774949151298e-09, "logits/chosen": 0.17579585313796997, "logits/rejected": 0.4936830997467041, "logps/chosen": -199.6393280029297, "logps/rejected": -241.1573486328125, "loss": 0.6174, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0065175071358680725, "rewards/margins": 0.16208353638648987, "rewards/margins_max": 0.24574527144432068, "rewards/margins_min": 0.07842177897691727, "rewards/margins_std": 0.11831557750701904, "rewards/rejected": -0.16860103607177734, "step": 3730 }, { "epoch": 0.94, "grad_norm": 0.466796875, "learning_rate": 5.264006890679879e-09, "logits/chosen": 0.18780869245529175, "logits/rejected": 0.5377491116523743, "logps/chosen": -220.1456756591797, "logps/rejected": -218.50497436523438, "loss": 0.6229, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.011242363601922989, "rewards/margins": 0.14424775540828705, "rewards/margins_max": 0.20541329681873322, "rewards/margins_min": 0.08308223634958267, "rewards/margins_std": 0.0865011066198349, "rewards/rejected": -0.15549013018608093, "step": 3740 }, { "epoch": 0.94, "grad_norm": 0.4375, "learning_rate": 4.825127432535714e-09, "logits/chosen": 0.2462950199842453, "logits/rejected": 0.5929055213928223, "logps/chosen": -210.54916381835938, "logps/rejected": -229.2125701904297, "loss": 0.6264, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.011940447613596916, "rewards/margins": 0.13500988483428955, "rewards/margins_max": 0.1944727897644043, "rewards/margins_min": 0.07554700970649719, "rewards/margins_std": 0.08409321308135986, "rewards/rejected": -0.14695033431053162, "step": 3750 }, { "epoch": 0.95, "grad_norm": 0.453125, "learning_rate": 4.405170447213752e-09, "logits/chosen": 0.06404795497655869, "logits/rejected": 0.3608579635620117, "logps/chosen": -224.13229370117188, "logps/rejected": -236.7704315185547, "loss": 0.6218, "rewards/accuracies": 0.9375, "rewards/chosen": -0.017158111557364464, "rewards/margins": 0.1566043496131897, "rewards/margins_max": 0.21671798825263977, "rewards/margins_min": 0.09649072587490082, "rewards/margins_std": 0.08501352369785309, "rewards/rejected": -0.1737624704837799, "step": 3760 }, { "epoch": 0.95, "grad_norm": 0.47265625, "learning_rate": 4.004168346781911e-09, "logits/chosen": 0.11094603687524796, "logits/rejected": 0.7000952959060669, "logps/chosen": -234.12606811523438, "logps/rejected": -204.2307891845703, "loss": 0.6236, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01193686481565237, "rewards/margins": 0.1489245593547821, "rewards/margins_max": 0.2147788554430008, "rewards/margins_min": 0.08307026326656342, "rewards/margins_std": 0.09313204139471054, "rewards/rejected": -0.16086141765117645, "step": 3770 }, { "epoch": 0.95, "grad_norm": 0.421875, "learning_rate": 3.6221520803798034e-09, "logits/chosen": 0.040562160313129425, "logits/rejected": 0.458746999502182, "logps/chosen": -210.5111541748047, "logps/rejected": -207.12551879882812, "loss": 0.6233, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.026498395949602127, "rewards/margins": 0.14982527494430542, "rewards/margins_max": 0.20188288390636444, "rewards/margins_min": 0.0977676585316658, "rewards/margins_std": 0.07362059503793716, "rewards/rejected": -0.17632366716861725, "step": 3780 }, { "epoch": 0.95, "grad_norm": 0.490234375, "learning_rate": 3.259151131829868e-09, "logits/chosen": 0.19650276005268097, "logits/rejected": 0.5268104672431946, "logps/chosen": -224.9433135986328, "logps/rejected": -240.1770782470703, "loss": 0.6205, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.01444349717348814, "rewards/margins": 0.1535378247499466, "rewards/margins_max": 0.22261205315589905, "rewards/margins_min": 0.08446361124515533, "rewards/margins_std": 0.09768567979335785, "rewards/rejected": -0.1679813116788864, "step": 3790 }, { "epoch": 0.96, "grad_norm": 0.4921875, "learning_rate": 2.915193517361969e-09, "logits/chosen": 0.273774117231369, "logits/rejected": 0.5841876268386841, "logps/chosen": -226.00149536132812, "logps/rejected": -239.44406127929688, "loss": 0.6155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.005546947009861469, "rewards/margins": 0.16865171492099762, "rewards/margins_max": 0.2197173833847046, "rewards/margins_min": 0.11758601665496826, "rewards/margins_std": 0.07221778482198715, "rewards/rejected": -0.16310475766658783, "step": 3800 }, { "epoch": 0.96, "grad_norm": 0.8828125, "learning_rate": 2.5903057834510965e-09, "logits/chosen": 0.1838090717792511, "logits/rejected": 0.45546311140060425, "logps/chosen": -227.0294189453125, "logps/rejected": -258.4100036621094, "loss": 0.622, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.019510911777615547, "rewards/margins": 0.16543366014957428, "rewards/margins_max": 0.2430049479007721, "rewards/margins_min": 0.08786235004663467, "rewards/margins_std": 0.10970237106084824, "rewards/rejected": -0.18494455516338348, "step": 3810 }, { "epoch": 0.96, "grad_norm": 0.41015625, "learning_rate": 2.284513004768368e-09, "logits/chosen": 0.17463012039661407, "logits/rejected": 0.42735734581947327, "logps/chosen": -244.6623992919922, "logps/rejected": -254.37728881835938, "loss": 0.6302, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.016274752095341682, "rewards/margins": 0.12291550636291504, "rewards/margins_max": 0.1892261803150177, "rewards/margins_min": 0.05660480260848999, "rewards/margins_std": 0.09377748519182205, "rewards/rejected": -0.13919024169445038, "step": 3820 }, { "epoch": 0.96, "grad_norm": 0.455078125, "learning_rate": 1.9978387822460197e-09, "logits/chosen": 0.15252685546875, "logits/rejected": 0.49902796745300293, "logps/chosen": -185.53038024902344, "logps/rejected": -216.5926513671875, "loss": 0.6247, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.0046777850948274136, "rewards/margins": 0.13263201713562012, "rewards/margins_max": 0.19261547923088074, "rewards/margins_min": 0.0726485624909401, "rewards/margins_std": 0.0848294124007225, "rewards/rejected": -0.13730980455875397, "step": 3830 }, { "epoch": 0.97, "grad_norm": 0.408203125, "learning_rate": 1.730305241255753e-09, "logits/chosen": 0.05551935359835625, "logits/rejected": 0.5064684152603149, "logps/chosen": -233.8549346923828, "logps/rejected": -219.7116241455078, "loss": 0.6271, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.02773299440741539, "rewards/margins": 0.14008685946464539, "rewards/margins_max": 0.2064247578382492, "rewards/margins_min": 0.07374893873929977, "rewards/margins_std": 0.09381597489118576, "rewards/rejected": -0.16781985759735107, "step": 3840 }, { "epoch": 0.97, "grad_norm": 0.4453125, "learning_rate": 1.4819330299011001e-09, "logits/chosen": 0.19423505663871765, "logits/rejected": 0.6530963778495789, "logps/chosen": -245.65902709960938, "logps/rejected": -233.35067749023438, "loss": 0.6313, "rewards/accuracies": 0.9375, "rewards/chosen": -0.019910115748643875, "rewards/margins": 0.12754671275615692, "rewards/margins_max": 0.17843300104141235, "rewards/margins_min": 0.07666041702032089, "rewards/margins_std": 0.0719640776515007, "rewards/rejected": -0.1474568396806717, "step": 3850 }, { "epoch": 0.97, "grad_norm": 0.44140625, "learning_rate": 1.2527413174238943e-09, "logits/chosen": 0.12026523053646088, "logits/rejected": 0.5703681111335754, "logps/chosen": -258.96392822265625, "logps/rejected": -249.3824920654297, "loss": 0.6207, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0033578346483409405, "rewards/margins": 0.1527034342288971, "rewards/margins_max": 0.2185053527355194, "rewards/margins_min": 0.08690150082111359, "rewards/margins_std": 0.09305797517299652, "rewards/rejected": -0.15606126189231873, "step": 3860 }, { "epoch": 0.97, "grad_norm": 0.427734375, "learning_rate": 1.042747792724702e-09, "logits/chosen": 0.305549681186676, "logits/rejected": 0.5823919177055359, "logps/chosen": -202.752197265625, "logps/rejected": -215.44558715820312, "loss": 0.6236, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.013430334627628326, "rewards/margins": 0.1376066654920578, "rewards/margins_max": 0.20174825191497803, "rewards/margins_min": 0.07346507161855698, "rewards/margins_std": 0.09070989489555359, "rewards/rejected": -0.15103699266910553, "step": 3870 }, { "epoch": 0.98, "grad_norm": 0.4296875, "learning_rate": 8.519686629977208e-10, "logits/chosen": 0.11620450019836426, "logits/rejected": 0.40117159485816956, "logps/chosen": -192.13516235351562, "logps/rejected": -213.638427734375, "loss": 0.6268, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.010598389431834221, "rewards/margins": 0.14347238838672638, "rewards/margins_max": 0.23001083731651306, "rewards/margins_min": 0.0569339320063591, "rewards/margins_std": 0.12238385528326035, "rewards/rejected": -0.15407077968120575, "step": 3880 }, { "epoch": 0.98, "grad_norm": 0.470703125, "learning_rate": 6.804186524798361e-10, "logits/chosen": 0.2581743597984314, "logits/rejected": 0.6087909936904907, "logps/chosen": -223.1142120361328, "logps/rejected": -243.00857543945312, "loss": 0.622, "rewards/accuracies": 0.9375, "rewards/chosen": -0.017251556739211082, "rewards/margins": 0.15255269408226013, "rewards/margins_max": 0.21804174780845642, "rewards/margins_min": 0.08706365525722504, "rewards/margins_std": 0.092615507543087, "rewards/rejected": -0.16980427503585815, "step": 3890 }, { "epoch": 0.98, "grad_norm": 0.4453125, "learning_rate": 5.28111001314141e-10, "logits/chosen": 0.12378685176372528, "logits/rejected": 0.28624629974365234, "logps/chosen": -205.4754180908203, "logps/rejected": -243.43252563476562, "loss": 0.6282, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.013372248038649559, "rewards/margins": 0.14947311580181122, "rewards/margins_max": 0.21651044487953186, "rewards/margins_min": 0.08243578672409058, "rewards/margins_std": 0.09480509907007217, "rewards/rejected": -0.16284537315368652, "step": 3900 }, { "epoch": 0.98, "grad_norm": 0.4140625, "learning_rate": 3.950574645283089e-10, "logits/chosen": 0.09277466684579849, "logits/rejected": 0.40722179412841797, "logps/chosen": -205.1505584716797, "logps/rejected": -220.9902801513672, "loss": 0.6247, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.002510580001398921, "rewards/margins": 0.15540029108524323, "rewards/margins_max": 0.22838278114795685, "rewards/margins_min": 0.0824178084731102, "rewards/margins_std": 0.10321281105279922, "rewards/rejected": -0.15791086852550507, "step": 3910 }, { "epoch": 0.99, "grad_norm": 0.443359375, "learning_rate": 2.812683111270142e-10, "logits/chosen": 0.09386277943849564, "logits/rejected": 0.41236963868141174, "logps/chosen": -214.34445190429688, "logps/rejected": -221.28152465820312, "loss": 0.6261, "rewards/accuracies": 0.9375, "rewards/chosen": -0.013635742478072643, "rewards/margins": 0.1383320391178131, "rewards/margins_max": 0.1967514008283615, "rewards/margins_min": 0.0799126923084259, "rewards/margins_std": 0.08261743932962418, "rewards/rejected": -0.15196779370307922, "step": 3920 }, { "epoch": 0.99, "grad_norm": 0.5, "learning_rate": 1.8675232329967705e-10, "logits/chosen": 0.03526927903294563, "logits/rejected": 0.24813762307167053, "logps/chosen": -205.2987060546875, "logps/rejected": -230.0944366455078, "loss": 0.6216, "rewards/accuracies": 0.9375, "rewards/chosen": -0.009834588505327702, "rewards/margins": 0.16574285924434662, "rewards/margins_max": 0.2461838722229004, "rewards/margins_min": 0.08530186116695404, "rewards/margins_std": 0.11376076936721802, "rewards/rejected": -0.17557744681835175, "step": 3930 }, { "epoch": 0.99, "grad_norm": 0.39453125, "learning_rate": 1.1151679574247786e-10, "logits/chosen": 0.08690213412046432, "logits/rejected": 0.47821909189224243, "logps/chosen": -194.68600463867188, "logps/rejected": -216.38900756835938, "loss": 0.6218, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0060258423909544945, "rewards/margins": 0.15075986087322235, "rewards/margins_max": 0.21543128788471222, "rewards/margins_min": 0.08608846366405487, "rewards/margins_std": 0.09145916998386383, "rewards/rejected": -0.1567857265472412, "step": 3940 }, { "epoch": 0.99, "grad_norm": 0.48828125, "learning_rate": 5.556753509547429e-11, "logits/chosen": 0.13599836826324463, "logits/rejected": 0.33884793519973755, "logps/chosen": -195.2382354736328, "logps/rejected": -236.0681610107422, "loss": 0.6268, "rewards/accuracies": 0.9375, "rewards/chosen": -0.008052920922636986, "rewards/margins": 0.1700795590877533, "rewards/margins_max": 0.24161715805530548, "rewards/margins_min": 0.0985419973731041, "rewards/margins_std": 0.10116942226886749, "rewards/rejected": -0.17813250422477722, "step": 3950 }, { "epoch": 1.0, "grad_norm": 0.45703125, "learning_rate": 1.8908859494293216e-11, "logits/chosen": 0.10905084758996964, "logits/rejected": 0.5333555936813354, "logps/chosen": -207.8270263671875, "logps/rejected": -215.47021484375, "loss": 0.6228, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.014156030490994453, "rewards/margins": 0.15326142311096191, "rewards/margins_max": 0.2231847494840622, "rewards/margins_min": 0.08333809673786163, "rewards/margins_std": 0.09888650476932526, "rewards/rejected": -0.16741743683815002, "step": 3960 }, { "epoch": 1.0, "grad_norm": 0.45703125, "learning_rate": 1.5435982370359812e-12, "logits/chosen": 0.2128898650407791, "logits/rejected": 0.6233851313591003, "logps/chosen": -227.25686645507812, "logps/rejected": -225.08322143554688, "loss": 0.6327, "rewards/accuracies": 0.9375, "rewards/chosen": -0.018604299053549767, "rewards/margins": 0.1318085491657257, "rewards/margins_max": 0.2025558054447174, "rewards/margins_min": 0.0610613115131855, "rewards/margins_std": 0.10005171597003937, "rewards/rejected": -0.15041287243366241, "step": 3970 }, { "epoch": 1.0, "eval_logits/chosen": 0.7480918765068054, "eval_logits/rejected": 0.8995296359062195, "eval_logps/chosen": -341.4869079589844, "eval_logps/rejected": -324.06951904296875, "eval_loss": 0.6878686547279358, "eval_rewards/accuracies": 0.5809999704360962, "eval_rewards/chosen": -0.04465363174676895, "eval_rewards/margins": 0.011995132081210613, "eval_rewards/margins_max": 0.10682077705860138, "eval_rewards/margins_min": -0.0803549736738205, "eval_rewards/margins_std": 0.06201187148690224, "eval_rewards/rejected": -0.05664876475930214, "eval_runtime": 2499.0849, "eval_samples_per_second": 4.802, "eval_steps_per_second": 0.3, "step": 3974 }, { "epoch": 1.0, "step": 3974, "total_flos": 0.0, "train_loss": 0.6361258800268533, "train_runtime": 32593.909, "train_samples_per_second": 1.951, "train_steps_per_second": 0.122 } ], "logging_steps": 10, "max_steps": 3974, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }