diff --git "a/checkpoint-3086/trainer_state.json" "b/checkpoint-3086/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3086/trainer_state.json" @@ -0,0 +1,6522 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3086, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "diff_generated": 0.0, + "epoch": 0.0003240440699935191, + "grad_norm": 3562.119462078596, + "learning_rate": 8.639308855291577e-10, + "logits/chosen": -2.6025924682617188, + "logits/rejected": -2.469420909881592, + "logps/chosen": -126.16082000732422, + "logps/rejected": -106.06934356689453, + "loss": 10.5588, + "losses_ref": -106.06934356689453, + "ref_logps/chosen": -126.16082000732422, + "ref_logps/rejected": -106.06934356689453, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "u": 3.725290298461914e-09, + "weight": 1.0 + }, + { + "diff_generated": 0.0093117356300354, + "epoch": 0.0032404406999351912, + "grad_norm": 3388.988957019305, + "learning_rate": 8.639308855291576e-09, + "logits/chosen": -2.483880043029785, + "logits/rejected": -2.558704137802124, + "logps/chosen": -101.72637176513672, + "logps/rejected": -97.26779174804688, + "loss": 6.9997, + "losses_ref": -97.88931274414062, + "ref_logps/chosen": -101.72274780273438, + "ref_logps/rejected": -97.277099609375, + "rewards/accuracies": 0.4114583432674408, + "rewards/chosen": -0.0036318302154541016, + "rewards/margins": -0.012943565845489502, + "rewards/rejected": 0.0093117356300354, + "step": 10, + "u": 0.024260468780994415, + "weight": 1.0008878707885742 + }, + { + "diff_generated": 0.00930875726044178, + "epoch": 0.0064808813998703824, + "grad_norm": 3256.601575596696, + "learning_rate": 1.727861771058315e-08, + "logits/chosen": -2.5140349864959717, + "logits/rejected": -2.5642495155334473, + "logps/chosen": -109.59700775146484, + "logps/rejected": -93.96660614013672, + "loss": 11.2379, + "losses_ref": -93.9469985961914, + "ref_logps/chosen": -109.68983459472656, + "ref_logps/rejected": -93.97591400146484, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.09283681213855743, + "rewards/margins": 0.0835280567407608, + "rewards/rejected": 0.00930875726044178, + "step": 20, + "u": 0.02573983743786812, + "weight": 1.0009663105010986 + }, + { + "diff_generated": -0.062067270278930664, + "epoch": 0.009721322099805573, + "grad_norm": 3313.946127745741, + "learning_rate": 2.591792656587473e-08, + "logits/chosen": -2.5175981521606445, + "logits/rejected": -2.5421903133392334, + "logps/chosen": -110.19058990478516, + "logps/rejected": -95.05355072021484, + "loss": 11.8529, + "losses_ref": -95.38294982910156, + "ref_logps/chosen": -110.7219009399414, + "ref_logps/rejected": -94.99147033691406, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5313212871551514, + "rewards/margins": 0.5933884978294373, + "rewards/rejected": -0.062067270278930664, + "step": 30, + "u": -0.03643938526511192, + "weight": 0.9975458383560181 + }, + { + "diff_generated": -0.25546377897262573, + "epoch": 0.012961762799740765, + "grad_norm": 3220.117747952819, + "learning_rate": 3.45572354211663e-08, + "logits/chosen": -2.5363521575927734, + "logits/rejected": -2.6350269317626953, + "logps/chosen": -106.64350891113281, + "logps/rejected": -100.74821472167969, + "loss": 10.3539, + "losses_ref": -97.54227447509766, + "ref_logps/chosen": -108.94718170166016, + "ref_logps/rejected": -100.49275207519531, + "rewards/accuracies": 0.957812488079071, + "rewards/chosen": 2.3036744594573975, + "rewards/margins": 2.559138059616089, + "rewards/rejected": -0.25546377897262573, + "step": 40, + "u": -0.17697972059249878, + "weight": 0.9873207211494446 + }, + { + "diff_generated": -0.5713750123977661, + "epoch": 0.016202203499675955, + "grad_norm": 3021.3116260681286, + "learning_rate": 4.319654427645788e-08, + "logits/chosen": -2.4939026832580566, + "logits/rejected": -2.5600945949554443, + "logps/chosen": -99.71741485595703, + "logps/rejected": -93.92168426513672, + "loss": 11.34, + "losses_ref": -87.43788146972656, + "ref_logps/chosen": -104.4645767211914, + "ref_logps/rejected": -93.35031127929688, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.747157573699951, + "rewards/margins": 5.318532943725586, + "rewards/rejected": -0.5713750123977661, + "step": 50, + "u": -0.3438769280910492, + "weight": 0.9693188667297363 + }, + { + "diff_generated": -2.0571205615997314, + "epoch": 0.019442644199611146, + "grad_norm": 3038.8863487829362, + "learning_rate": 5.183585313174946e-08, + "logits/chosen": -2.5006918907165527, + "logits/rejected": -2.5789248943328857, + "logps/chosen": -91.19766235351562, + "logps/rejected": -97.12265014648438, + "loss": 17.7878, + "losses_ref": -71.39484405517578, + "ref_logps/chosen": -104.03226470947266, + "ref_logps/rejected": -95.0655288696289, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 12.834589958190918, + "rewards/margins": 14.89171314239502, + "rewards/rejected": -2.0571205615997314, + "step": 60, + "u": -0.9664205312728882, + "weight": 0.827978789806366 + }, + { + "diff_generated": -2.8343868255615234, + "epoch": 0.02268308489954634, + "grad_norm": 2936.451811554955, + "learning_rate": 6.047516198704104e-08, + "logits/chosen": -2.4617538452148438, + "logits/rejected": -2.522183895111084, + "logps/chosen": -84.62702941894531, + "logps/rejected": -97.09732055664062, + "loss": 16.052, + "losses_ref": -67.24642944335938, + "ref_logps/chosen": -104.37381744384766, + "ref_logps/rejected": -94.262939453125, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 19.74679183959961, + "rewards/margins": 22.5811767578125, + "rewards/rejected": -2.8343868255615234, + "step": 70, + "u": -0.7202032804489136, + "weight": 0.8072471618652344 + }, + { + "diff_generated": -8.553693771362305, + "epoch": 0.02592352559948153, + "grad_norm": 2095.5830833131795, + "learning_rate": 6.91144708423326e-08, + "logits/chosen": -2.4385647773742676, + "logits/rejected": -2.5173935890197754, + "logps/chosen": -66.70127868652344, + "logps/rejected": -104.8959732055664, + "loss": 35.3544, + "losses_ref": -31.417377471923828, + "ref_logps/chosen": -103.7597427368164, + "ref_logps/rejected": -96.34228515625, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 37.058448791503906, + "rewards/margins": 45.612144470214844, + "rewards/rejected": -8.553693771362305, + "step": 80, + "u": -1.3548884391784668, + "weight": 0.4381393492221832 + }, + { + "diff_generated": -17.18720245361328, + "epoch": 0.02916396629941672, + "grad_norm": 1744.2467307468103, + "learning_rate": 7.775377969762419e-08, + "logits/chosen": -2.4863288402557373, + "logits/rejected": -2.498215913772583, + "logps/chosen": -56.4319953918457, + "logps/rejected": -107.64781188964844, + "loss": 37.797, + "losses_ref": -19.844131469726562, + "ref_logps/chosen": -105.72111511230469, + "ref_logps/rejected": -90.46061706542969, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 49.28911590576172, + "rewards/margins": 66.476318359375, + "rewards/rejected": -17.18720245361328, + "step": 90, + "u": -1.7762200832366943, + "weight": 0.33093181252479553 + }, + { + "diff_generated": -31.4757137298584, + "epoch": 0.03240440699935191, + "grad_norm": 1134.1280328280775, + "learning_rate": 8.639308855291576e-08, + "logits/chosen": -2.477350950241089, + "logits/rejected": -2.496324062347412, + "logps/chosen": -54.80144119262695, + "logps/rejected": -124.31352233886719, + "loss": 46.0326, + "losses_ref": -5.370136260986328, + "ref_logps/chosen": -109.20549011230469, + "ref_logps/rejected": -92.83781433105469, + "rewards/accuracies": 0.948437511920929, + "rewards/chosen": 54.40404510498047, + "rewards/margins": 85.8797607421875, + "rewards/rejected": -31.4757137298584, + "step": 100, + "u": -2.5925116539001465, + "weight": 0.13635995984077454 + }, + { + "diff_generated": -35.75048065185547, + "epoch": 0.0356448476992871, + "grad_norm": 965.4189948465325, + "learning_rate": 9.503239740820734e-08, + "logits/chosen": -2.4699106216430664, + "logits/rejected": -2.522479772567749, + "logps/chosen": -47.429195404052734, + "logps/rejected": -125.5269546508789, + "loss": 40.8724, + "losses_ref": -5.26834774017334, + "ref_logps/chosen": -106.1762466430664, + "ref_logps/rejected": -89.7764663696289, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 58.74705123901367, + "rewards/margins": 94.4975357055664, + "rewards/rejected": -35.75048065185547, + "step": 110, + "u": -2.7017464637756348, + "weight": 0.09796740114688873 + }, + { + "diff_generated": -40.2996711730957, + "epoch": 0.03888528839922229, + "grad_norm": 783.215648338237, + "learning_rate": 1.0367170626349892e-07, + "logits/chosen": -2.4583067893981934, + "logits/rejected": -2.580239772796631, + "logps/chosen": -40.9269905090332, + "logps/rejected": -135.03713989257812, + "loss": 40.3508, + "losses_ref": -1.4218990802764893, + "ref_logps/chosen": -103.93497467041016, + "ref_logps/rejected": -94.73746490478516, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 63.00797653198242, + "rewards/margins": 103.3076400756836, + "rewards/rejected": -40.2996711730957, + "step": 120, + "u": -2.861257314682007, + "weight": 0.04878731817007065 + }, + { + "diff_generated": -37.824851989746094, + "epoch": 0.04212572909915749, + "grad_norm": 836.0455308707783, + "learning_rate": 1.1231101511879049e-07, + "logits/chosen": -2.4154086112976074, + "logits/rejected": -2.508922576904297, + "logps/chosen": -39.98351287841797, + "logps/rejected": -129.5335235595703, + "loss": 38.6543, + "losses_ref": -1.8764454126358032, + "ref_logps/chosen": -99.47947692871094, + "ref_logps/rejected": -91.70866394042969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 59.49596405029297, + "rewards/margins": 97.32081604003906, + "rewards/rejected": -37.824851989746094, + "step": 130, + "u": -2.684591770172119, + "weight": 0.0949232280254364 + }, + { + "diff_generated": -39.24931716918945, + "epoch": 0.04536616979909268, + "grad_norm": 856.8360580503436, + "learning_rate": 1.2095032397408208e-07, + "logits/chosen": -2.443293333053589, + "logits/rejected": -2.5706210136413574, + "logps/chosen": -39.42460250854492, + "logps/rejected": -137.81185913085938, + "loss": 38.051, + "losses_ref": -3.7866032123565674, + "ref_logps/chosen": -101.5798110961914, + "ref_logps/rejected": -98.56255340576172, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 62.15519332885742, + "rewards/margins": 101.4045181274414, + "rewards/rejected": -39.24931716918945, + "step": 140, + "u": -2.774660348892212, + "weight": 0.07413013279438019 + }, + { + "diff_generated": -41.10640335083008, + "epoch": 0.04860661049902787, + "grad_norm": 849.0602988408947, + "learning_rate": 1.2958963282937366e-07, + "logits/chosen": -2.497579336166382, + "logits/rejected": -2.542670249938965, + "logps/chosen": -40.633460998535156, + "logps/rejected": -135.76116943359375, + "loss": 38.7108, + "losses_ref": -0.644196629524231, + "ref_logps/chosen": -107.93096923828125, + "ref_logps/rejected": -94.65476989746094, + "rewards/accuracies": 0.9375, + "rewards/chosen": 67.2975082397461, + "rewards/margins": 108.40391540527344, + "rewards/rejected": -41.10640335083008, + "step": 150, + "u": -2.7651712894439697, + "weight": 0.07480394840240479 + }, + { + "diff_generated": -42.27370834350586, + "epoch": 0.05184705119896306, + "grad_norm": 816.8808654794275, + "learning_rate": 1.382289416846652e-07, + "logits/chosen": -2.4625027179718018, + "logits/rejected": -2.5268664360046387, + "logps/chosen": -41.08820343017578, + "logps/rejected": -136.96205139160156, + "loss": 37.1926, + "losses_ref": -2.8047332763671875, + "ref_logps/chosen": -109.29515075683594, + "ref_logps/rejected": -94.68833923339844, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 68.20694732666016, + "rewards/margins": 110.48065185546875, + "rewards/rejected": -42.27370834350586, + "step": 160, + "u": -2.647674322128296, + "weight": 0.06888891756534576 + }, + { + "diff_generated": -42.294044494628906, + "epoch": 0.05508749189889825, + "grad_norm": 796.5865357257265, + "learning_rate": 1.468682505399568e-07, + "logits/chosen": -2.483508586883545, + "logits/rejected": -2.6074001789093018, + "logps/chosen": -34.566261291503906, + "logps/rejected": -138.64111328125, + "loss": 34.9207, + "losses_ref": -0.620043933391571, + "ref_logps/chosen": -103.62943267822266, + "ref_logps/rejected": -96.34706115722656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 69.06317138671875, + "rewards/margins": 111.35722351074219, + "rewards/rejected": -42.294044494628906, + "step": 170, + "u": -2.880436420440674, + "weight": 0.04548298567533493 + }, + { + "diff_generated": -42.531715393066406, + "epoch": 0.05832793259883344, + "grad_norm": 733.1226948265313, + "learning_rate": 1.5550755939524837e-07, + "logits/chosen": -2.464200973510742, + "logits/rejected": -2.5283455848693848, + "logps/chosen": -37.04677200317383, + "logps/rejected": -133.07716369628906, + "loss": 33.7744, + "losses_ref": -2.0727856159210205, + "ref_logps/chosen": -104.30806732177734, + "ref_logps/rejected": -90.54544067382812, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 67.26130676269531, + "rewards/margins": 109.79301452636719, + "rewards/rejected": -42.531715393066406, + "step": 180, + "u": -2.7712061405181885, + "weight": 0.07460576295852661 + }, + { + "diff_generated": -46.349693298339844, + "epoch": 0.06156837329876863, + "grad_norm": 809.6407509722846, + "learning_rate": 1.6414686825053995e-07, + "logits/chosen": -2.468390941619873, + "logits/rejected": -2.4976563453674316, + "logps/chosen": -38.513465881347656, + "logps/rejected": -144.1353302001953, + "loss": 34.3773, + "losses_ref": -0.6266312003135681, + "ref_logps/chosen": -111.34663391113281, + "ref_logps/rejected": -97.78563690185547, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 72.83317565917969, + "rewards/margins": 119.182861328125, + "rewards/rejected": -46.349693298339844, + "step": 190, + "u": -2.930321455001831, + "weight": 0.025513440370559692 + }, + { + "diff_generated": -46.30202865600586, + "epoch": 0.06480881399870382, + "grad_norm": 874.911954984581, + "learning_rate": 1.7278617710583153e-07, + "logits/chosen": -2.411224365234375, + "logits/rejected": -2.5258326530456543, + "logps/chosen": -33.44983673095703, + "logps/rejected": -142.40084838867188, + "loss": 33.3045, + "losses_ref": -0.844235897064209, + "ref_logps/chosen": -96.5532455444336, + "ref_logps/rejected": -96.09882354736328, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 63.103416442871094, + "rewards/margins": 109.40544128417969, + "rewards/rejected": -46.30202865600586, + "step": 200, + "u": -2.7650609016418457, + "weight": 0.08122508227825165 + }, + { + "diff_generated": -48.628482818603516, + "epoch": 0.06804925469863901, + "grad_norm": 1002.8888910125974, + "learning_rate": 1.814254859611231e-07, + "logits/chosen": -2.4576199054718018, + "logits/rejected": -2.5580880641937256, + "logps/chosen": -35.48185729980469, + "logps/rejected": -144.4121856689453, + "loss": 33.6586, + "losses_ref": -0.39017030596733093, + "ref_logps/chosen": -105.24406433105469, + "ref_logps/rejected": -95.78369140625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 69.76220703125, + "rewards/margins": 118.39068603515625, + "rewards/rejected": -48.628482818603516, + "step": 210, + "u": -2.898162841796875, + "weight": 0.03487515076994896 + }, + { + "diff_generated": -47.10941696166992, + "epoch": 0.0712896953985742, + "grad_norm": 843.0552464941519, + "learning_rate": 1.900647948164147e-07, + "logits/chosen": -2.421787738800049, + "logits/rejected": -2.5013813972473145, + "logps/chosen": -35.37281036376953, + "logps/rejected": -138.56710815429688, + "loss": 33.6089, + "losses_ref": -0.3818884491920471, + "ref_logps/chosen": -106.4619140625, + "ref_logps/rejected": -91.45767974853516, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.08909606933594, + "rewards/margins": 118.19852447509766, + "rewards/rejected": -47.10941696166992, + "step": 220, + "u": -2.823335647583008, + "weight": 0.0594978928565979 + }, + { + "diff_generated": -47.58238220214844, + "epoch": 0.07453013609850939, + "grad_norm": 806.277804701274, + "learning_rate": 1.9870410367170624e-07, + "logits/chosen": -2.454692840576172, + "logits/rejected": -2.4961037635803223, + "logps/chosen": -33.23175811767578, + "logps/rejected": -139.00082397460938, + "loss": 33.1946, + "losses_ref": -1.171278715133667, + "ref_logps/chosen": -102.03202056884766, + "ref_logps/rejected": -91.4184341430664, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 68.80026245117188, + "rewards/margins": 116.38265228271484, + "rewards/rejected": -47.58238220214844, + "step": 230, + "u": -2.7409818172454834, + "weight": 0.08283206820487976 + }, + { + "diff_generated": -49.1198616027832, + "epoch": 0.07777057679844458, + "grad_norm": 818.1731812742752, + "learning_rate": 2.0734341252699785e-07, + "logits/chosen": -2.4587016105651855, + "logits/rejected": -2.526214361190796, + "logps/chosen": -31.493793487548828, + "logps/rejected": -141.08822631835938, + "loss": 31.8918, + "losses_ref": -1.4514960050582886, + "ref_logps/chosen": -101.4176254272461, + "ref_logps/rejected": -91.9683609008789, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 69.92384338378906, + "rewards/margins": 119.043701171875, + "rewards/rejected": -49.1198616027832, + "step": 240, + "u": -2.796725273132324, + "weight": 0.06583192199468613 + }, + { + "diff_generated": -51.98710250854492, + "epoch": 0.08101101749837979, + "grad_norm": 781.7904442626757, + "learning_rate": 2.159827213822894e-07, + "logits/chosen": -2.450554370880127, + "logits/rejected": -2.5678532123565674, + "logps/chosen": -34.040855407714844, + "logps/rejected": -149.1754913330078, + "loss": 34.0048, + "losses_ref": -0.5171114206314087, + "ref_logps/chosen": -103.8505630493164, + "ref_logps/rejected": -97.18838500976562, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 69.80970764160156, + "rewards/margins": 121.79681396484375, + "rewards/rejected": -51.98710250854492, + "step": 250, + "u": -2.832289457321167, + "weight": 0.055971939116716385 + }, + { + "diff_generated": -48.61259841918945, + "epoch": 0.08425145819831498, + "grad_norm": 779.157695978812, + "learning_rate": 2.2462203023758098e-07, + "logits/chosen": -2.443753719329834, + "logits/rejected": -2.512899875640869, + "logps/chosen": -35.335567474365234, + "logps/rejected": -144.41781616210938, + "loss": 33.3595, + "losses_ref": -0.07441789656877518, + "ref_logps/chosen": -105.74320983886719, + "ref_logps/rejected": -95.80522155761719, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 70.40765380859375, + "rewards/margins": 119.020263671875, + "rewards/rejected": -48.61259841918945, + "step": 260, + "u": -2.84458589553833, + "weight": 0.051350824534893036 + }, + { + "diff_generated": -48.469383239746094, + "epoch": 0.08749189889825017, + "grad_norm": 822.8060313845896, + "learning_rate": 2.3326133909287256e-07, + "logits/chosen": -2.456958055496216, + "logits/rejected": -2.520021915435791, + "logps/chosen": -32.37866973876953, + "logps/rejected": -140.49179077148438, + "loss": 32.563, + "losses_ref": -0.005469826515763998, + "ref_logps/chosen": -104.21461486816406, + "ref_logps/rejected": -92.02240753173828, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 71.83594512939453, + "rewards/margins": 120.3053207397461, + "rewards/rejected": -48.469383239746094, + "step": 270, + "u": -2.7521414756774902, + "weight": 0.08144466578960419 + }, + { + "diff_generated": -48.4866828918457, + "epoch": 0.09073233959818536, + "grad_norm": 720.9436259764383, + "learning_rate": 2.4190064794816416e-07, + "logits/chosen": -2.4489905834198, + "logits/rejected": -2.540444850921631, + "logps/chosen": -32.101043701171875, + "logps/rejected": -138.24627685546875, + "loss": 32.2011, + "losses_ref": -1.1843945980072021, + "ref_logps/chosen": -103.7616195678711, + "ref_logps/rejected": -89.75959777832031, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.66057586669922, + "rewards/margins": 120.14725494384766, + "rewards/rejected": -48.4866828918457, + "step": 280, + "u": -2.8051774501800537, + "weight": 0.06826504319906235 + }, + { + "diff_generated": -50.010948181152344, + "epoch": 0.09397278029812055, + "grad_norm": 770.9141251681797, + "learning_rate": 2.505399568034557e-07, + "logits/chosen": -2.461470127105713, + "logits/rejected": -2.511556625366211, + "logps/chosen": -34.223976135253906, + "logps/rejected": -143.33128356933594, + "loss": 32.4193, + "losses_ref": -1.4999120235443115, + "ref_logps/chosen": -107.30704498291016, + "ref_logps/rejected": -93.3203353881836, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 73.08306884765625, + "rewards/margins": 123.0940170288086, + "rewards/rejected": -50.010948181152344, + "step": 290, + "u": -2.7366890907287598, + "weight": 0.08350025117397308 + }, + { + "diff_generated": -55.478302001953125, + "epoch": 0.09721322099805574, + "grad_norm": 803.6157808715096, + "learning_rate": 2.591792656587473e-07, + "logits/chosen": -2.47361421585083, + "logits/rejected": -2.572833299636841, + "logps/chosen": -29.719472885131836, + "logps/rejected": -151.2062225341797, + "loss": 31.1955, + "losses_ref": -0.051477380096912384, + "ref_logps/chosen": -104.66670227050781, + "ref_logps/rejected": -95.72794342041016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 74.94721984863281, + "rewards/margins": 130.42550659179688, + "rewards/rejected": -55.478302001953125, + "step": 300, + "u": -2.9014875888824463, + "weight": 0.03187917545437813 + }, + { + "diff_generated": -54.533409118652344, + "epoch": 0.10045366169799093, + "grad_norm": 777.3740691748907, + "learning_rate": 2.6781857451403887e-07, + "logits/chosen": -2.441887140274048, + "logits/rejected": -2.5617499351501465, + "logps/chosen": -29.868301391601562, + "logps/rejected": -148.29214477539062, + "loss": 29.5778, + "losses_ref": -0.13950416445732117, + "ref_logps/chosen": -100.90555572509766, + "ref_logps/rejected": -93.75871276855469, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.0372543334961, + "rewards/margins": 125.5706558227539, + "rewards/rejected": -54.533409118652344, + "step": 310, + "u": -2.85263729095459, + "weight": 0.049528006464242935 + }, + { + "diff_generated": -49.27448654174805, + "epoch": 0.10369410239792612, + "grad_norm": 771.096492447844, + "learning_rate": 2.764578833693304e-07, + "logits/chosen": -2.415465831756592, + "logits/rejected": -2.4889698028564453, + "logps/chosen": -30.508285522460938, + "logps/rejected": -140.9578399658203, + "loss": 30.7697, + "losses_ref": -0.5802863240242004, + "ref_logps/chosen": -99.48683166503906, + "ref_logps/rejected": -91.68336486816406, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 68.97855377197266, + "rewards/margins": 118.2530288696289, + "rewards/rejected": -49.27448654174805, + "step": 320, + "u": -2.7398669719696045, + "weight": 0.0872223749756813 + }, + { + "diff_generated": -50.7744255065918, + "epoch": 0.10693454309786131, + "grad_norm": 770.7719282970619, + "learning_rate": 2.8509719222462203e-07, + "logits/chosen": -2.4822871685028076, + "logits/rejected": -2.594581365585327, + "logps/chosen": -31.648122787475586, + "logps/rejected": -145.54190063476562, + "loss": 30.0626, + "losses_ref": -0.0007281276048161089, + "ref_logps/chosen": -103.71722412109375, + "ref_logps/rejected": -94.76746368408203, + "rewards/accuracies": 0.9375, + "rewards/chosen": 72.06910705566406, + "rewards/margins": 122.8435287475586, + "rewards/rejected": -50.7744255065918, + "step": 330, + "u": -2.8084800243377686, + "weight": 0.06252004951238632 + }, + { + "diff_generated": -53.30065155029297, + "epoch": 0.1101749837977965, + "grad_norm": 805.1987130491359, + "learning_rate": 2.937365010799136e-07, + "logits/chosen": -2.463784694671631, + "logits/rejected": -2.566464900970459, + "logps/chosen": -30.274654388427734, + "logps/rejected": -150.6307373046875, + "loss": 30.2862, + "losses_ref": -0.6015430688858032, + "ref_logps/chosen": -102.7557144165039, + "ref_logps/rejected": -97.33007049560547, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 72.48106384277344, + "rewards/margins": 125.7817153930664, + "rewards/rejected": -53.30065155029297, + "step": 340, + "u": -2.832361936569214, + "weight": 0.05085529014468193 + }, + { + "diff_generated": -53.60577392578125, + "epoch": 0.11341542449773169, + "grad_norm": 768.8027506136164, + "learning_rate": 3.023758099352052e-07, + "logits/chosen": -2.4243357181549072, + "logits/rejected": -2.515740394592285, + "logps/chosen": -32.064598083496094, + "logps/rejected": -145.3330078125, + "loss": 30.6677, + "losses_ref": -0.5792796015739441, + "ref_logps/chosen": -107.6982192993164, + "ref_logps/rejected": -91.72724914550781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 75.6336441040039, + "rewards/margins": 129.23941040039062, + "rewards/rejected": -53.60577392578125, + "step": 350, + "u": -2.893901824951172, + "weight": 0.03645588457584381 + }, + { + "diff_generated": -53.241851806640625, + "epoch": 0.11665586519766688, + "grad_norm": 733.8624597286104, + "learning_rate": 3.1101511879049674e-07, + "logits/chosen": -2.4519877433776855, + "logits/rejected": -2.5278899669647217, + "logps/chosen": -30.716121673583984, + "logps/rejected": -147.32876586914062, + "loss": 31.2907, + "losses_ref": -0.010169465094804764, + "ref_logps/chosen": -106.13411712646484, + "ref_logps/rejected": -94.08689880371094, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.41799926757812, + "rewards/margins": 128.6598358154297, + "rewards/rejected": -53.241851806640625, + "step": 360, + "u": -2.789494276046753, + "weight": 0.06904186308383942 + }, + { + "diff_generated": -54.6075439453125, + "epoch": 0.11989630589760207, + "grad_norm": 727.7054123140744, + "learning_rate": 3.1965442764578835e-07, + "logits/chosen": -2.5251193046569824, + "logits/rejected": -2.566704273223877, + "logps/chosen": -29.002330780029297, + "logps/rejected": -152.05810546875, + "loss": 29.587, + "losses_ref": -0.001936825574375689, + "ref_logps/chosen": -109.02250671386719, + "ref_logps/rejected": -97.45056915283203, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.0201644897461, + "rewards/margins": 134.62771606445312, + "rewards/rejected": -54.6075439453125, + "step": 370, + "u": -2.883349657058716, + "weight": 0.03754507750272751 + }, + { + "diff_generated": -54.41365432739258, + "epoch": 0.12313674659753726, + "grad_norm": 739.7248123270182, + "learning_rate": 3.282937365010799e-07, + "logits/chosen": -2.4925265312194824, + "logits/rejected": -2.5483505725860596, + "logps/chosen": -30.894588470458984, + "logps/rejected": -148.85020446777344, + "loss": 30.405, + "losses_ref": -0.0042073847725987434, + "ref_logps/chosen": -105.3858871459961, + "ref_logps/rejected": -94.43656158447266, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 74.49128723144531, + "rewards/margins": 128.90493774414062, + "rewards/rejected": -54.41365432739258, + "step": 380, + "u": -2.8271658420562744, + "weight": 0.0563095323741436 + }, + { + "diff_generated": -56.37748336791992, + "epoch": 0.12637718729747247, + "grad_norm": 784.2701333335425, + "learning_rate": 3.3693304535637145e-07, + "logits/chosen": -2.5053887367248535, + "logits/rejected": -2.6210803985595703, + "logps/chosen": -32.53559494018555, + "logps/rejected": -153.89474487304688, + "loss": 29.7721, + "losses_ref": -0.5663825273513794, + "ref_logps/chosen": -109.39384460449219, + "ref_logps/rejected": -97.51725769042969, + "rewards/accuracies": 0.96875, + "rewards/chosen": 76.85826110839844, + "rewards/margins": 133.23574829101562, + "rewards/rejected": -56.37748336791992, + "step": 390, + "u": -2.8883016109466553, + "weight": 0.03719887137413025 + }, + { + "diff_generated": -54.94092559814453, + "epoch": 0.12961762799740764, + "grad_norm": 716.9651418493512, + "learning_rate": 3.4557235421166306e-07, + "logits/chosen": -2.4730770587921143, + "logits/rejected": -2.561882495880127, + "logps/chosen": -30.261028289794922, + "logps/rejected": -150.30093383789062, + "loss": 28.3011, + "losses_ref": -0.0938170999288559, + "ref_logps/chosen": -104.626708984375, + "ref_logps/rejected": -95.36000061035156, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 74.36568450927734, + "rewards/margins": 129.30661010742188, + "rewards/rejected": -54.94092559814453, + "step": 400, + "u": -2.8626420497894287, + "weight": 0.0456349179148674 + }, + { + "diff_generated": -54.72991943359375, + "epoch": 0.13285806869734285, + "grad_norm": 707.6966705455432, + "learning_rate": 3.542116630669546e-07, + "logits/chosen": -2.4588990211486816, + "logits/rejected": -2.566882848739624, + "logps/chosen": -27.552404403686523, + "logps/rejected": -149.89450073242188, + "loss": 28.7623, + "losses_ref": -0.011032069101929665, + "ref_logps/chosen": -103.4614486694336, + "ref_logps/rejected": -95.16458892822266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 75.90904235839844, + "rewards/margins": 130.6389617919922, + "rewards/rejected": -54.72991943359375, + "step": 410, + "u": -2.901738166809082, + "weight": 0.031637679785490036 + }, + { + "diff_generated": -54.070556640625, + "epoch": 0.13609850939727802, + "grad_norm": 744.5333204785713, + "learning_rate": 3.628509719222462e-07, + "logits/chosen": -2.5188753604888916, + "logits/rejected": -2.5969929695129395, + "logps/chosen": -30.11448097229004, + "logps/rejected": -149.3397216796875, + "loss": 29.2246, + "losses_ref": -0.6378255486488342, + "ref_logps/chosen": -109.98274230957031, + "ref_logps/rejected": -95.2691650390625, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 79.86824798583984, + "rewards/margins": 133.93881225585938, + "rewards/rejected": -54.070556640625, + "step": 420, + "u": -2.895763397216797, + "weight": 0.02901501953601837 + }, + { + "diff_generated": -53.83262252807617, + "epoch": 0.13933895009721323, + "grad_norm": 709.4475193868764, + "learning_rate": 3.7149028077753777e-07, + "logits/chosen": -2.4371330738067627, + "logits/rejected": -2.5524258613586426, + "logps/chosen": -26.42848777770996, + "logps/rejected": -142.687744140625, + "loss": 29.5044, + "losses_ref": -0.31355220079421997, + "ref_logps/chosen": -98.81473541259766, + "ref_logps/rejected": -88.85511016845703, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 72.38623809814453, + "rewards/margins": 126.2188720703125, + "rewards/rejected": -53.83262252807617, + "step": 430, + "u": -2.6893277168273926, + "weight": 0.10447784513235092 + }, + { + "diff_generated": -55.30907440185547, + "epoch": 0.1425793907971484, + "grad_norm": 982.6147302266403, + "learning_rate": 3.801295896328294e-07, + "logits/chosen": -2.458221197128296, + "logits/rejected": -2.5157158374786377, + "logps/chosen": -28.952505111694336, + "logps/rejected": -146.90135192871094, + "loss": 29.5034, + "losses_ref": -0.16949811577796936, + "ref_logps/chosen": -104.5290756225586, + "ref_logps/rejected": -91.59226989746094, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.57655334472656, + "rewards/margins": 130.88563537597656, + "rewards/rejected": -55.30907440185547, + "step": 440, + "u": -2.7866439819335938, + "weight": 0.07143958657979965 + }, + { + "diff_generated": -53.21424102783203, + "epoch": 0.1458198314970836, + "grad_norm": 768.9096868799276, + "learning_rate": 3.887688984881209e-07, + "logits/chosen": -2.478175163269043, + "logits/rejected": -2.526900053024292, + "logps/chosen": -27.487741470336914, + "logps/rejected": -141.2913360595703, + "loss": 28.8683, + "losses_ref": -0.03192094713449478, + "ref_logps/chosen": -101.43740844726562, + "ref_logps/rejected": -88.07707977294922, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 73.94967651367188, + "rewards/margins": 127.1639175415039, + "rewards/rejected": -53.21424102783203, + "step": 450, + "u": -2.751070737838745, + "weight": 0.08246159553527832 + }, + { + "diff_generated": -54.764625549316406, + "epoch": 0.14906027219701878, + "grad_norm": 771.7033349112145, + "learning_rate": 3.974082073434125e-07, + "logits/chosen": -2.4950356483459473, + "logits/rejected": -2.5568201541900635, + "logps/chosen": -30.200414657592773, + "logps/rejected": -146.8262176513672, + "loss": 28.9664, + "losses_ref": -0.19666805863380432, + "ref_logps/chosen": -106.53369140625, + "ref_logps/rejected": -92.06159973144531, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.33326721191406, + "rewards/margins": 131.097900390625, + "rewards/rejected": -54.764625549316406, + "step": 460, + "u": -2.8212099075317383, + "weight": 0.06148713827133179 + }, + { + "diff_generated": -54.21906280517578, + "epoch": 0.152300712896954, + "grad_norm": 741.1342427297986, + "learning_rate": 4.060475161987041e-07, + "logits/chosen": -2.505781650543213, + "logits/rejected": -2.5400655269622803, + "logps/chosen": -30.926746368408203, + "logps/rejected": -149.12911987304688, + "loss": 29.0095, + "losses_ref": -0.06011121720075607, + "ref_logps/chosen": -110.43312072753906, + "ref_logps/rejected": -94.91004943847656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 79.5063705444336, + "rewards/margins": 133.72543334960938, + "rewards/rejected": -54.21906280517578, + "step": 470, + "u": -2.900672435760498, + "weight": 0.0327039510011673 + }, + { + "diff_generated": -55.218055725097656, + "epoch": 0.15554115359688916, + "grad_norm": 672.7567401797695, + "learning_rate": 4.146868250539957e-07, + "logits/chosen": -2.5062358379364014, + "logits/rejected": -2.5572612285614014, + "logps/chosen": -26.948486328125, + "logps/rejected": -147.74221801757812, + "loss": 28.2762, + "losses_ref": -0.9450100064277649, + "ref_logps/chosen": -104.43302917480469, + "ref_logps/rejected": -92.52415466308594, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.48454284667969, + "rewards/margins": 132.70260620117188, + "rewards/rejected": -55.218055725097656, + "step": 480, + "u": -2.8369040489196777, + "weight": 0.0559476800262928 + }, + { + "diff_generated": -55.180442810058594, + "epoch": 0.15878159429682437, + "grad_norm": 703.7043637818722, + "learning_rate": 4.2332613390928724e-07, + "logits/chosen": -2.4895575046539307, + "logits/rejected": -2.5281057357788086, + "logps/chosen": -29.348852157592773, + "logps/rejected": -146.33741760253906, + "loss": 29.3673, + "losses_ref": -0.12545503675937653, + "ref_logps/chosen": -108.4737548828125, + "ref_logps/rejected": -91.15696716308594, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.1249008178711, + "rewards/margins": 134.3053436279297, + "rewards/rejected": -55.180442810058594, + "step": 490, + "u": -2.824366569519043, + "weight": 0.059032510966062546 + }, + { + "diff_generated": -55.36357498168945, + "epoch": 0.16202203499675957, + "grad_norm": 647.376785896275, + "learning_rate": 4.319654427645788e-07, + "logits/chosen": -2.466341018676758, + "logits/rejected": -2.586158037185669, + "logps/chosen": -26.323333740234375, + "logps/rejected": -149.58590698242188, + "loss": 28.0963, + "losses_ref": -0.030207056552171707, + "ref_logps/chosen": -102.38270568847656, + "ref_logps/rejected": -94.22232818603516, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.05937194824219, + "rewards/margins": 131.42294311523438, + "rewards/rejected": -55.36357498168945, + "step": 500, + "u": -2.7893226146698, + "weight": 0.0692179948091507 + }, + { + "diff_generated": -54.81679153442383, + "epoch": 0.16526247569669475, + "grad_norm": 645.4650287289849, + "learning_rate": 4.406047516198704e-07, + "logits/chosen": -2.4166862964630127, + "logits/rejected": -2.5659446716308594, + "logps/chosen": -25.198596954345703, + "logps/rejected": -145.24667358398438, + "loss": 27.123, + "losses_ref": -0.2254674881696701, + "ref_logps/chosen": -93.64522552490234, + "ref_logps/rejected": -90.42987060546875, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 68.4466323852539, + "rewards/margins": 123.263427734375, + "rewards/rejected": -54.81679153442383, + "step": 510, + "u": -2.7496695518493652, + "weight": 0.08379890024662018 + }, + { + "diff_generated": -55.3721923828125, + "epoch": 0.16850291639662995, + "grad_norm": 754.8764848546118, + "learning_rate": 4.4924406047516195e-07, + "logits/chosen": -2.5180530548095703, + "logits/rejected": -2.5424976348876953, + "logps/chosen": -30.242000579833984, + "logps/rejected": -150.60255432128906, + "loss": 27.5853, + "losses_ref": -0.0379607193171978, + "ref_logps/chosen": -112.25959777832031, + "ref_logps/rejected": -95.2303466796875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.0176010131836, + "rewards/margins": 137.38980102539062, + "rewards/rejected": -55.3721923828125, + "step": 520, + "u": -2.863673686981201, + "weight": 0.04474290460348129 + }, + { + "diff_generated": -53.96966552734375, + "epoch": 0.17174335709656513, + "grad_norm": 687.2531690329358, + "learning_rate": 4.5788336933045356e-07, + "logits/chosen": -2.468636989593506, + "logits/rejected": -2.575831890106201, + "logps/chosen": -26.400867462158203, + "logps/rejected": -146.9187469482422, + "loss": 26.9127, + "losses_ref": -0.4472638964653015, + "ref_logps/chosen": -101.43266296386719, + "ref_logps/rejected": -92.9490966796875, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 75.03179931640625, + "rewards/margins": 129.00144958496094, + "rewards/rejected": -53.96966552734375, + "step": 530, + "u": -2.719735622406006, + "weight": 0.08883358538150787 + }, + { + "diff_generated": -55.33368682861328, + "epoch": 0.17498379779650033, + "grad_norm": 664.5048108611123, + "learning_rate": 4.665226781857451e-07, + "logits/chosen": -2.467604160308838, + "logits/rejected": -2.590390682220459, + "logps/chosen": -27.821060180664062, + "logps/rejected": -148.22567749023438, + "loss": 27.1637, + "losses_ref": -0.3465833365917206, + "ref_logps/chosen": -102.39645385742188, + "ref_logps/rejected": -92.89200592041016, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 74.57539367675781, + "rewards/margins": 129.90908813476562, + "rewards/rejected": -55.33368682861328, + "step": 540, + "u": -2.818753957748413, + "weight": 0.061188213527202606 + }, + { + "diff_generated": -53.33552169799805, + "epoch": 0.1782242384964355, + "grad_norm": 730.3351291004991, + "learning_rate": 4.751619870410367e-07, + "logits/chosen": -2.428861618041992, + "logits/rejected": -2.4772071838378906, + "logps/chosen": -27.735401153564453, + "logps/rejected": -141.1258544921875, + "loss": 27.7936, + "losses_ref": -0.001424860442057252, + "ref_logps/chosen": -101.93447875976562, + "ref_logps/rejected": -87.79032897949219, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 74.19908142089844, + "rewards/margins": 127.53459167480469, + "rewards/rejected": -53.33552169799805, + "step": 550, + "u": -2.7897605895996094, + "weight": 0.06876589357852936 + }, + { + "diff_generated": -58.458091735839844, + "epoch": 0.18146467919637072, + "grad_norm": 663.8908243083347, + "learning_rate": 4.838012958963283e-07, + "logits/chosen": -2.4482421875, + "logits/rejected": -2.5633554458618164, + "logps/chosen": -24.60186195373535, + "logps/rejected": -156.0877685546875, + "loss": 27.8651, + "losses_ref": -0.003950254060328007, + "ref_logps/chosen": -101.93538665771484, + "ref_logps/rejected": -97.62966918945312, + "rewards/accuracies": 0.96875, + "rewards/chosen": 77.33352661132812, + "rewards/margins": 135.79164123535156, + "rewards/rejected": -58.458091735839844, + "step": 560, + "u": -2.902024507522583, + "weight": 0.03134573623538017 + }, + { + "diff_generated": -60.79945755004883, + "epoch": 0.1847051198963059, + "grad_norm": 685.7692335401006, + "learning_rate": 4.924406047516198e-07, + "logits/chosen": -2.458861827850342, + "logits/rejected": -2.571686267852783, + "logps/chosen": -26.797931671142578, + "logps/rejected": -157.7087860107422, + "loss": 27.0617, + "losses_ref": -0.008472513407468796, + "ref_logps/chosen": -109.6872329711914, + "ref_logps/rejected": -96.90933990478516, + "rewards/accuracies": 0.96875, + "rewards/chosen": 82.88929748535156, + "rewards/margins": 143.6887664794922, + "rewards/rejected": -60.79945755004883, + "step": 570, + "u": -2.9020092487335205, + "weight": 0.03136172145605087 + }, + { + "diff_generated": -54.8674201965332, + "epoch": 0.1879455605962411, + "grad_norm": 688.6877497060314, + "learning_rate": 5.010799136069114e-07, + "logits/chosen": -2.521958827972412, + "logits/rejected": -2.5581581592559814, + "logps/chosen": -28.5933837890625, + "logps/rejected": -147.9677734375, + "loss": 27.474, + "losses_ref": -5.428441727417521e-05, + "ref_logps/chosen": -107.4176025390625, + "ref_logps/rejected": -93.10035705566406, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.82423400878906, + "rewards/margins": 133.691650390625, + "rewards/rejected": -54.8674201965332, + "step": 580, + "u": -2.827221632003784, + "weight": 0.0562509186565876 + }, + { + "diff_generated": -58.10841751098633, + "epoch": 0.19118600129617627, + "grad_norm": 635.8841172916929, + "learning_rate": 5.097192224622029e-07, + "logits/chosen": -2.4910545349121094, + "logits/rejected": -2.5638060569763184, + "logps/chosen": -26.427270889282227, + "logps/rejected": -150.41275024414062, + "loss": 27.2598, + "losses_ref": -0.7870520353317261, + "ref_logps/chosen": -106.36592102050781, + "ref_logps/rejected": -92.30433654785156, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 79.93864440917969, + "rewards/margins": 138.04705810546875, + "rewards/rejected": -58.10841751098633, + "step": 590, + "u": -2.8938279151916504, + "weight": 0.0318412259221077 + }, + { + "diff_generated": -56.88201904296875, + "epoch": 0.19442644199611148, + "grad_norm": 670.7825753160076, + "learning_rate": 5.183585313174946e-07, + "logits/chosen": -2.503103733062744, + "logits/rejected": -2.5682194232940674, + "logps/chosen": -32.12078094482422, + "logps/rejected": -150.4476776123047, + "loss": 27.4504, + "losses_ref": -0.08075644075870514, + "ref_logps/chosen": -108.149169921875, + "ref_logps/rejected": -93.56566619873047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 76.02838134765625, + "rewards/margins": 132.910400390625, + "rewards/rejected": -56.88201904296875, + "step": 600, + "u": -2.807526111602783, + "weight": 0.06345218420028687 + }, + { + "diff_generated": -56.32802200317383, + "epoch": 0.19766688269604665, + "grad_norm": 695.5443199709215, + "learning_rate": 5.269978401727861e-07, + "logits/chosen": -2.481253147125244, + "logits/rejected": -2.558227777481079, + "logps/chosen": -29.32651710510254, + "logps/rejected": -151.45556640625, + "loss": 27.456, + "losses_ref": -0.007582040969282389, + "ref_logps/chosen": -110.96148681640625, + "ref_logps/rejected": -95.12753295898438, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.63497161865234, + "rewards/margins": 137.96299743652344, + "rewards/rejected": -56.32802200317383, + "step": 610, + "u": -2.8644137382507324, + "weight": 0.04401334002614021 + }, + { + "diff_generated": -54.635101318359375, + "epoch": 0.20090732339598186, + "grad_norm": 702.066659568588, + "learning_rate": 5.356371490280777e-07, + "logits/chosen": -2.5014188289642334, + "logits/rejected": -2.604109525680542, + "logps/chosen": -28.138397216796875, + "logps/rejected": -153.0144805908203, + "loss": 26.3974, + "losses_ref": -0.0026658426504582167, + "ref_logps/chosen": -102.48536682128906, + "ref_logps/rejected": -98.37937927246094, + "rewards/accuracies": 0.9375, + "rewards/chosen": 74.34696197509766, + "rewards/margins": 128.9820556640625, + "rewards/rejected": -54.635101318359375, + "step": 620, + "u": -2.808465003967285, + "weight": 0.06253598630428314 + }, + { + "diff_generated": -56.20122146606445, + "epoch": 0.20414776409591703, + "grad_norm": 713.1651670351907, + "learning_rate": 5.442764578833693e-07, + "logits/chosen": -2.519637107849121, + "logits/rejected": -2.5696628093719482, + "logps/chosen": -30.5113582611084, + "logps/rejected": -153.999755859375, + "loss": 27.4408, + "losses_ref": -0.016383513808250427, + "ref_logps/chosen": -109.87528991699219, + "ref_logps/rejected": -97.79855346679688, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.36394500732422, + "rewards/margins": 135.56515502929688, + "rewards/rejected": -56.20122146606445, + "step": 630, + "u": -2.864102363586426, + "weight": 0.044332828372716904 + }, + { + "diff_generated": -55.92133331298828, + "epoch": 0.20738820479585224, + "grad_norm": 708.2284789366054, + "learning_rate": 5.529157667386608e-07, + "logits/chosen": -2.5184578895568848, + "logits/rejected": -2.5733492374420166, + "logps/chosen": -25.582706451416016, + "logps/rejected": -146.95883178710938, + "loss": 26.0938, + "losses_ref": -0.11125482618808746, + "ref_logps/chosen": -100.10874938964844, + "ref_logps/rejected": -91.03749084472656, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 74.52604675292969, + "rewards/margins": 130.4473876953125, + "rewards/rejected": -55.92133331298828, + "step": 640, + "u": -2.7516160011291504, + "weight": 0.08196442574262619 + }, + { + "diff_generated": -60.4934196472168, + "epoch": 0.21062864549578741, + "grad_norm": 674.7775917338871, + "learning_rate": 5.615550755939525e-07, + "logits/chosen": -2.4662938117980957, + "logits/rejected": -2.5818092823028564, + "logps/chosen": -26.435894012451172, + "logps/rejected": -155.3976287841797, + "loss": 27.6505, + "losses_ref": -0.16979587078094482, + "ref_logps/chosen": -102.91618347167969, + "ref_logps/rejected": -94.90421295166016, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.48027801513672, + "rewards/margins": 136.97369384765625, + "rewards/rejected": -60.4934196472168, + "step": 650, + "u": -2.843845844268799, + "weight": 0.051973629742860794 + }, + { + "diff_generated": -55.22865676879883, + "epoch": 0.21386908619572262, + "grad_norm": 707.2132309203021, + "learning_rate": 5.701943844492441e-07, + "logits/chosen": -2.5144200325012207, + "logits/rejected": -2.5284502506256104, + "logps/chosen": -29.344928741455078, + "logps/rejected": -146.8759765625, + "loss": 27.6178, + "losses_ref": -0.46125665307044983, + "ref_logps/chosen": -104.47818756103516, + "ref_logps/rejected": -91.64730834960938, + "rewards/accuracies": 0.9375, + "rewards/chosen": 75.13326263427734, + "rewards/margins": 130.36190795898438, + "rewards/rejected": -55.22865676879883, + "step": 660, + "u": -2.800485372543335, + "weight": 0.06856163591146469 + }, + { + "diff_generated": -63.19648361206055, + "epoch": 0.21710952689565782, + "grad_norm": 649.4757602968687, + "learning_rate": 5.788336933045357e-07, + "logits/chosen": -2.5100691318511963, + "logits/rejected": -2.5467848777770996, + "logps/chosen": -29.621448516845703, + "logps/rejected": -160.77467346191406, + "loss": 26.2028, + "losses_ref": -0.35821059346199036, + "ref_logps/chosen": -110.4535903930664, + "ref_logps/rejected": -97.57820129394531, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 80.83213806152344, + "rewards/margins": 144.02862548828125, + "rewards/rejected": -63.19648361206055, + "step": 670, + "u": -2.9170174598693848, + "weight": 0.028028449043631554 + }, + { + "diff_generated": -63.916168212890625, + "epoch": 0.220349967595593, + "grad_norm": 628.3849258041977, + "learning_rate": 5.874730021598272e-07, + "logits/chosen": -2.4870247840881348, + "logits/rejected": -2.573270320892334, + "logps/chosen": -25.00966453552246, + "logps/rejected": -163.73403930664062, + "loss": 26.5574, + "losses_ref": -0.03508678823709488, + "ref_logps/chosen": -106.8170394897461, + "ref_logps/rejected": -99.81785583496094, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 81.807373046875, + "rewards/margins": 145.72354125976562, + "rewards/rejected": -63.916168212890625, + "step": 680, + "u": -2.920351505279541, + "weight": 0.02549419179558754 + }, + { + "diff_generated": -59.71055221557617, + "epoch": 0.2235904082955282, + "grad_norm": 673.8497950367135, + "learning_rate": 5.961123110151188e-07, + "logits/chosen": -2.484591484069824, + "logits/rejected": -2.500356435775757, + "logps/chosen": -27.194311141967773, + "logps/rejected": -152.62860107421875, + "loss": 26.7425, + "losses_ref": -0.007726139388978481, + "ref_logps/chosen": -107.43864440917969, + "ref_logps/rejected": -92.91804504394531, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.24433898925781, + "rewards/margins": 139.9548797607422, + "rewards/rejected": -59.71055221557617, + "step": 690, + "u": -2.8645870685577393, + "weight": 0.04383586719632149 + }, + { + "diff_generated": -59.14935302734375, + "epoch": 0.22683084899546338, + "grad_norm": 686.4413294789177, + "learning_rate": 6.047516198704104e-07, + "logits/chosen": -2.4694008827209473, + "logits/rejected": -2.540020704269409, + "logps/chosen": -24.311485290527344, + "logps/rejected": -150.16282653808594, + "loss": 26.6456, + "losses_ref": -0.00461218599230051, + "ref_logps/chosen": -102.06470489501953, + "ref_logps/rejected": -91.01347351074219, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 77.75321960449219, + "rewards/margins": 136.90257263183594, + "rewards/rejected": -59.14935302734375, + "step": 700, + "u": -2.7896358966827393, + "weight": 0.06889645755290985 + }, + { + "diff_generated": -58.787933349609375, + "epoch": 0.23007128969539858, + "grad_norm": 593.5593182299435, + "learning_rate": 6.133909287257019e-07, + "logits/chosen": -2.410492420196533, + "logits/rejected": -2.4904074668884277, + "logps/chosen": -25.31143569946289, + "logps/rejected": -148.18141174316406, + "loss": 26.4049, + "losses_ref": -2.1401230696938e-05, + "ref_logps/chosen": -102.7784423828125, + "ref_logps/rejected": -89.39347076416016, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 77.4670181274414, + "rewards/margins": 136.25494384765625, + "rewards/rejected": -58.787933349609375, + "step": 710, + "u": -2.7897753715515137, + "weight": 0.06875047087669373 + }, + { + "diff_generated": -63.03206253051758, + "epoch": 0.23331173039533376, + "grad_norm": 699.9971887915182, + "learning_rate": 6.220302375809935e-07, + "logits/chosen": -2.484179973602295, + "logits/rejected": -2.5839457511901855, + "logps/chosen": -25.623340606689453, + "logps/rejected": -159.0945281982422, + "loss": 26.1432, + "losses_ref": -0.0748247355222702, + "ref_logps/chosen": -103.72444152832031, + "ref_logps/rejected": -96.06248474121094, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.1010971069336, + "rewards/margins": 141.13316345214844, + "rewards/rejected": -63.03206253051758, + "step": 720, + "u": -2.862562656402588, + "weight": 0.04578563943505287 + }, + { + "diff_generated": -61.96745681762695, + "epoch": 0.23655217109526896, + "grad_norm": 730.2474500721089, + "learning_rate": 6.306695464362851e-07, + "logits/chosen": -2.46178936958313, + "logits/rejected": -2.5654196739196777, + "logps/chosen": -25.737224578857422, + "logps/rejected": -157.86012268066406, + "loss": 26.1898, + "losses_ref": -0.0014304433716461062, + "ref_logps/chosen": -103.9644775390625, + "ref_logps/rejected": -95.8926773071289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.22725677490234, + "rewards/margins": 140.19471740722656, + "rewards/rejected": -61.96745681762695, + "step": 730, + "u": -2.8084633350372314, + "weight": 0.06253740191459656 + }, + { + "diff_generated": -60.012786865234375, + "epoch": 0.23979261179520414, + "grad_norm": 638.1430327783397, + "learning_rate": 6.393088552915767e-07, + "logits/chosen": -2.394742488861084, + "logits/rejected": -2.5220019817352295, + "logps/chosen": -24.21940803527832, + "logps/rejected": -151.1535186767578, + "loss": 26.3456, + "losses_ref": -0.0172689538449049, + "ref_logps/chosen": -101.16768646240234, + "ref_logps/rejected": -91.14073181152344, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 76.94828796386719, + "rewards/margins": 136.96107482910156, + "rewards/rejected": -60.012786865234375, + "step": 740, + "u": -2.733316421508789, + "weight": 0.0877995416522026 + }, + { + "diff_generated": -59.764854431152344, + "epoch": 0.24303305249513935, + "grad_norm": 655.3642996586939, + "learning_rate": 6.479481641468682e-07, + "logits/chosen": -2.44429349899292, + "logits/rejected": -2.4809410572052, + "logps/chosen": -25.51277732849121, + "logps/rejected": -149.52215576171875, + "loss": 26.7816, + "losses_ref": -0.0009433348895981908, + "ref_logps/chosen": -104.70662689208984, + "ref_logps/rejected": -89.7573013305664, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.19384765625, + "rewards/margins": 138.95870971679688, + "rewards/rejected": -59.764854431152344, + "step": 750, + "u": -2.789747714996338, + "weight": 0.06877966225147247 + }, + { + "diff_generated": -62.37859344482422, + "epoch": 0.24627349319507452, + "grad_norm": 621.506258767661, + "learning_rate": 6.565874730021598e-07, + "logits/chosen": -2.4868717193603516, + "logits/rejected": -2.614471197128296, + "logps/chosen": -26.513357162475586, + "logps/rejected": -162.24029541015625, + "loss": 26.3821, + "losses_ref": -0.028088023886084557, + "ref_logps/chosen": -104.30328369140625, + "ref_logps/rejected": -99.86168670654297, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 77.78993225097656, + "rewards/margins": 140.16851806640625, + "rewards/rejected": -62.37859344482422, + "step": 760, + "u": -2.9393086433410645, + "weight": 0.0190122053027153 + }, + { + "diff_generated": -60.59674835205078, + "epoch": 0.24951393389500973, + "grad_norm": 701.7304767128724, + "learning_rate": 6.652267818574514e-07, + "logits/chosen": -2.493251085281372, + "logits/rejected": -2.5534424781799316, + "logps/chosen": -26.941265106201172, + "logps/rejected": -151.6280059814453, + "loss": 26.3064, + "losses_ref": -0.013267618604004383, + "ref_logps/chosen": -104.0679702758789, + "ref_logps/rejected": -91.03125762939453, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 77.12669372558594, + "rewards/margins": 137.7234344482422, + "rewards/rejected": -60.59674835205078, + "step": 770, + "u": -2.752232789993286, + "weight": 0.08135087788105011 + }, + { + "diff_generated": -64.66901397705078, + "epoch": 0.25275437459494493, + "grad_norm": 590.6429607969785, + "learning_rate": 6.738660907127429e-07, + "logits/chosen": -2.481443166732788, + "logits/rejected": -2.536986827850342, + "logps/chosen": -29.76792335510254, + "logps/rejected": -162.86886596679688, + "loss": 26.9886, + "losses_ref": -0.0018984278431162238, + "ref_logps/chosen": -108.67779541015625, + "ref_logps/rejected": -98.19983673095703, + "rewards/accuracies": 0.96875, + "rewards/chosen": 78.90986633300781, + "rewards/margins": 143.57888793945312, + "rewards/rejected": -64.66901397705078, + "step": 780, + "u": -2.902082681655884, + "weight": 0.0312848761677742 + }, + { + "diff_generated": -56.31652069091797, + "epoch": 0.2559948152948801, + "grad_norm": 675.3022211708428, + "learning_rate": 6.825053995680345e-07, + "logits/chosen": -2.4406189918518066, + "logits/rejected": -2.5565524101257324, + "logps/chosen": -25.877826690673828, + "logps/rejected": -146.1416778564453, + "loss": 26.2029, + "losses_ref": -0.4356566369533539, + "ref_logps/chosen": -100.30241394042969, + "ref_logps/rejected": -89.82514953613281, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 74.42459106445312, + "rewards/margins": 130.74111938476562, + "rewards/rejected": -56.31652069091797, + "step": 790, + "u": -2.7635066509246826, + "weight": 0.07979506254196167 + }, + { + "diff_generated": -63.26179885864258, + "epoch": 0.2592352559948153, + "grad_norm": 637.6682986126765, + "learning_rate": 6.911447084233261e-07, + "logits/chosen": -2.469820976257324, + "logits/rejected": -2.544013261795044, + "logps/chosen": -25.00909996032715, + "logps/rejected": -158.84591674804688, + "loss": 25.4765, + "losses_ref": -0.1315481811761856, + "ref_logps/chosen": -105.43486022949219, + "ref_logps/rejected": -95.5841064453125, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.42577362060547, + "rewards/margins": 143.68756103515625, + "rewards/rejected": -63.26179885864258, + "step": 800, + "u": -2.8629374504089355, + "weight": 0.04540270194411278 + }, + { + "diff_generated": -58.66782760620117, + "epoch": 0.26247569669475046, + "grad_norm": 615.7590883273643, + "learning_rate": 6.997840172786177e-07, + "logits/chosen": -2.4986214637756348, + "logits/rejected": -2.537416934967041, + "logps/chosen": -27.8216609954834, + "logps/rejected": -152.74380493164062, + "loss": 25.1296, + "losses_ref": -0.35845038294792175, + "ref_logps/chosen": -108.5501937866211, + "ref_logps/rejected": -94.07597351074219, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.72853088378906, + "rewards/margins": 139.3963623046875, + "rewards/rejected": -58.66782760620117, + "step": 810, + "u": -2.85968017578125, + "weight": 0.04805522412061691 + }, + { + "diff_generated": -60.81633377075195, + "epoch": 0.2657161373946857, + "grad_norm": 576.8016502943253, + "learning_rate": 7.084233261339092e-07, + "logits/chosen": -2.472770929336548, + "logits/rejected": -2.607300043106079, + "logps/chosen": -25.254009246826172, + "logps/rejected": -155.61083984375, + "loss": 25.5254, + "losses_ref": -1.321690320968628, + "ref_logps/chosen": -100.71749877929688, + "ref_logps/rejected": -94.79450988769531, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.4634780883789, + "rewards/margins": 136.27981567382812, + "rewards/rejected": -60.81633377075195, + "step": 820, + "u": -2.745861530303955, + "weight": 0.07535470277070999 + }, + { + "diff_generated": -56.964515686035156, + "epoch": 0.26895657809462087, + "grad_norm": 652.4782452384312, + "learning_rate": 7.170626349892008e-07, + "logits/chosen": -2.4821269512176514, + "logits/rejected": -2.540287494659424, + "logps/chosen": -25.41446876525879, + "logps/rejected": -147.53280639648438, + "loss": 26.1567, + "losses_ref": -0.001235806499607861, + "ref_logps/chosen": -99.45085144042969, + "ref_logps/rejected": -90.56830596923828, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 74.03639221191406, + "rewards/margins": 131.0009002685547, + "rewards/rejected": -56.964515686035156, + "step": 830, + "u": -2.7710132598876953, + "weight": 0.07504115253686905 + }, + { + "diff_generated": -58.602081298828125, + "epoch": 0.27219701879455604, + "grad_norm": 799.2290974007984, + "learning_rate": 7.257019438444924e-07, + "logits/chosen": -2.523066520690918, + "logits/rejected": -2.5335185527801514, + "logps/chosen": -27.519084930419922, + "logps/rejected": -152.70413208007812, + "loss": 25.717, + "losses_ref": -0.001618326990865171, + "ref_logps/chosen": -108.7953872680664, + "ref_logps/rejected": -94.10205841064453, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.27629089355469, + "rewards/margins": 139.8783721923828, + "rewards/rejected": -58.602081298828125, + "step": 840, + "u": -2.845895767211914, + "weight": 0.0500524528324604 + }, + { + "diff_generated": -57.95105743408203, + "epoch": 0.2754374594944913, + "grad_norm": 641.9635385764836, + "learning_rate": 7.343412526997839e-07, + "logits/chosen": -2.4668707847595215, + "logits/rejected": -2.4877943992614746, + "logps/chosen": -26.313549041748047, + "logps/rejected": -147.4332733154297, + "loss": 26.1563, + "losses_ref": -0.0014186182525008917, + "ref_logps/chosen": -103.6099624633789, + "ref_logps/rejected": -89.48223114013672, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 77.29640197753906, + "rewards/margins": 135.24746704101562, + "rewards/rejected": -57.95105743408203, + "step": 850, + "u": -2.7335689067840576, + "weight": 0.08753886818885803 + }, + { + "diff_generated": -61.852027893066406, + "epoch": 0.27867790019442645, + "grad_norm": 653.5457101814015, + "learning_rate": 7.429805615550755e-07, + "logits/chosen": -2.4995462894439697, + "logits/rejected": -2.5535285472869873, + "logps/chosen": -27.849584579467773, + "logps/rejected": -155.34083557128906, + "loss": 25.8256, + "losses_ref": -0.0006288802251219749, + "ref_logps/chosen": -106.05692291259766, + "ref_logps/rejected": -93.48880004882812, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 78.20733642578125, + "rewards/margins": 140.05935668945312, + "rewards/rejected": -61.852027893066406, + "step": 860, + "u": -2.789764881134033, + "weight": 0.06876128911972046 + }, + { + "diff_generated": -63.80859375, + "epoch": 0.28191834089436163, + "grad_norm": 645.4800110914664, + "learning_rate": 7.516198704103671e-07, + "logits/chosen": -2.4218697547912598, + "logits/rejected": -2.58543062210083, + "logps/chosen": -20.747900009155273, + "logps/rejected": -156.9674835205078, + "loss": 24.421, + "losses_ref": -0.0049931942485272884, + "ref_logps/chosen": -96.66189575195312, + "ref_logps/rejected": -93.15888977050781, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 75.91400909423828, + "rewards/margins": 139.72259521484375, + "rewards/rejected": -63.80859375, + "step": 870, + "u": -2.827167510986328, + "weight": 0.056307483464479446 + }, + { + "diff_generated": -60.32050704956055, + "epoch": 0.2851587815942968, + "grad_norm": 634.2610763856094, + "learning_rate": 7.602591792656587e-07, + "logits/chosen": -2.503911256790161, + "logits/rejected": -2.628973960876465, + "logps/chosen": -28.216968536376953, + "logps/rejected": -154.07302856445312, + "loss": 25.4542, + "losses_ref": -0.001056908629834652, + "ref_logps/chosen": -105.15325927734375, + "ref_logps/rejected": -93.75252532958984, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 76.93629455566406, + "rewards/margins": 137.25680541992188, + "rewards/rejected": -60.32050704956055, + "step": 880, + "u": -2.8833649158477783, + "weight": 0.03752894327044487 + }, + { + "diff_generated": -59.31754684448242, + "epoch": 0.28839922229423204, + "grad_norm": 627.5743396303932, + "learning_rate": 7.688984881209502e-07, + "logits/chosen": -2.4742841720581055, + "logits/rejected": -2.5719363689422607, + "logps/chosen": -25.873092651367188, + "logps/rejected": -151.89659118652344, + "loss": 26.5475, + "losses_ref": -0.0009587672539055347, + "ref_logps/chosen": -102.14347839355469, + "ref_logps/rejected": -92.57904052734375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 76.2703857421875, + "rewards/margins": 135.5879364013672, + "rewards/rejected": -59.31754684448242, + "step": 890, + "u": -2.8084681034088135, + "weight": 0.0625327080488205 + }, + { + "diff_generated": -62.66937255859375, + "epoch": 0.2916396629941672, + "grad_norm": 672.2344138742552, + "learning_rate": 7.775377969762419e-07, + "logits/chosen": -2.500886917114258, + "logits/rejected": -2.6067519187927246, + "logps/chosen": -22.781291961669922, + "logps/rejected": -156.03436279296875, + "loss": 24.6767, + "losses_ref": -0.004392404109239578, + "ref_logps/chosen": -101.62608337402344, + "ref_logps/rejected": -93.364990234375, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.84479522705078, + "rewards/margins": 141.51416015625, + "rewards/rejected": -62.66937255859375, + "step": 900, + "u": -2.8645243644714355, + "weight": 0.04390067234635353 + }, + { + "diff_generated": -59.74578857421875, + "epoch": 0.2948801036941024, + "grad_norm": 647.8095867351392, + "learning_rate": 7.861771058315335e-07, + "logits/chosen": -2.5178160667419434, + "logits/rejected": -2.611015558242798, + "logps/chosen": -24.30489730834961, + "logps/rejected": -156.92311096191406, + "loss": 26.078, + "losses_ref": -0.09906181693077087, + "ref_logps/chosen": -105.82955169677734, + "ref_logps/rejected": -97.17731475830078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.52466583251953, + "rewards/margins": 141.27044677734375, + "rewards/rejected": -59.74578857421875, + "step": 910, + "u": -2.900771379470825, + "weight": 0.03258995711803436 + }, + { + "diff_generated": -59.63584518432617, + "epoch": 0.29812054439403757, + "grad_norm": 634.226322030038, + "learning_rate": 7.94816414686825e-07, + "logits/chosen": -2.45780873298645, + "logits/rejected": -2.5526909828186035, + "logps/chosen": -23.072711944580078, + "logps/rejected": -152.64865112304688, + "loss": 25.0608, + "losses_ref": -0.33719271421432495, + "ref_logps/chosen": -100.89134216308594, + "ref_logps/rejected": -93.01278686523438, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.81863403320312, + "rewards/margins": 137.45448303222656, + "rewards/rejected": -59.63584518432617, + "step": 920, + "u": -2.857234477996826, + "weight": 0.04835718125104904 + }, + { + "diff_generated": -63.3922233581543, + "epoch": 0.3013609850939728, + "grad_norm": 581.5773018274067, + "learning_rate": 7.999995450631473e-07, + "logits/chosen": -2.497527599334717, + "logits/rejected": -2.609323024749756, + "logps/chosen": -23.8373966217041, + "logps/rejected": -159.68966674804688, + "loss": 25.7439, + "losses_ref": -0.0006137991440482438, + "ref_logps/chosen": -101.13523864746094, + "ref_logps/rejected": -96.29743957519531, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.29784393310547, + "rewards/margins": 140.6900634765625, + "rewards/rejected": -63.3922233581543, + "step": 930, + "u": -2.8272130489349365, + "weight": 0.05625985190272331 + }, + { + "diff_generated": -60.2132568359375, + "epoch": 0.304601425793908, + "grad_norm": 574.8607406709366, + "learning_rate": 7.999944270354383e-07, + "logits/chosen": -2.4377999305725098, + "logits/rejected": -2.5765769481658936, + "logps/chosen": -26.66571617126465, + "logps/rejected": -152.7779083251953, + "loss": 25.2812, + "losses_ref": -6.957701407372952e-05, + "ref_logps/chosen": -100.65777587890625, + "ref_logps/rejected": -92.56465148925781, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 73.99205780029297, + "rewards/margins": 134.20530700683594, + "rewards/rejected": -60.2132568359375, + "step": 940, + "u": -2.864666700363159, + "weight": 0.04375224933028221 + }, + { + "diff_generated": -61.20196533203125, + "epoch": 0.30784186649384315, + "grad_norm": 602.290184679235, + "learning_rate": 7.99983622381959e-07, + "logits/chosen": -2.476480007171631, + "logits/rejected": -2.521477460861206, + "logps/chosen": -26.971847534179688, + "logps/rejected": -154.27720642089844, + "loss": 25.601, + "losses_ref": -0.21073582768440247, + "ref_logps/chosen": -108.45231628417969, + "ref_logps/rejected": -93.07524108886719, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.48046112060547, + "rewards/margins": 142.6824493408203, + "rewards/rejected": -61.20196533203125, + "step": 950, + "u": -2.860682487487793, + "weight": 0.046927742660045624 + }, + { + "diff_generated": -62.45361328125, + "epoch": 0.31108230719377833, + "grad_norm": 601.6689079670924, + "learning_rate": 7.999671312563164e-07, + "logits/chosen": -2.4776501655578613, + "logits/rejected": -2.4989523887634277, + "logps/chosen": -25.706836700439453, + "logps/rejected": -153.31521606445312, + "loss": 24.6883, + "losses_ref": -9.761693945620209e-05, + "ref_logps/chosen": -105.71771240234375, + "ref_logps/rejected": -90.86161041259766, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.01087188720703, + "rewards/margins": 142.4644775390625, + "rewards/rejected": -62.45361328125, + "step": 960, + "u": -2.827221632003784, + "weight": 0.056250859051942825 + }, + { + "diff_generated": -64.14892578125, + "epoch": 0.31432274789371356, + "grad_norm": 587.4905783939034, + "learning_rate": 7.999449538929611e-07, + "logits/chosen": -2.438697099685669, + "logits/rejected": -2.5441195964813232, + "logps/chosen": -24.696014404296875, + "logps/rejected": -153.0398406982422, + "loss": 25.4498, + "losses_ref": -0.008285412564873695, + "ref_logps/chosen": -101.59489440917969, + "ref_logps/rejected": -88.89091491699219, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.89888000488281, + "rewards/margins": 141.04782104492188, + "rewards/rejected": -64.14892578125, + "step": 970, + "u": -2.8645687103271484, + "weight": 0.04385475814342499 + }, + { + "diff_generated": -62.532623291015625, + "epoch": 0.31756318859364874, + "grad_norm": 592.8020857634815, + "learning_rate": 7.99917090607183e-07, + "logits/chosen": -2.4569623470306396, + "logits/rejected": -2.573836088180542, + "logps/chosen": -22.58658218383789, + "logps/rejected": -157.61378479003906, + "loss": 24.9722, + "losses_ref": -0.0004892147844657302, + "ref_logps/chosen": -101.17801666259766, + "ref_logps/rejected": -95.0811538696289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.5914306640625, + "rewards/margins": 141.1240692138672, + "rewards/rejected": -62.532623291015625, + "step": 980, + "u": -2.8459298610687256, + "weight": 0.05001651123166084 + }, + { + "diff_generated": -62.2236213684082, + "epoch": 0.3208036292935839, + "grad_norm": 670.87379786136, + "learning_rate": 7.998835417951081e-07, + "logits/chosen": -2.492297649383545, + "logits/rejected": -2.5866012573242188, + "logps/chosen": -25.326244354248047, + "logps/rejected": -156.25930786132812, + "loss": 25.4704, + "losses_ref": -0.05882773920893669, + "ref_logps/chosen": -104.6904296875, + "ref_logps/rejected": -94.03569030761719, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.36417388916016, + "rewards/margins": 141.5878143310547, + "rewards/rejected": -62.2236213684082, + "step": 990, + "u": -2.882542371749878, + "weight": 0.03833792358636856 + }, + { + "diff_generated": -64.97667694091797, + "epoch": 0.32404406999351915, + "grad_norm": 662.4839627282153, + "learning_rate": 7.998443079336919e-07, + "logits/chosen": -2.4461517333984375, + "logits/rejected": -2.5580058097839355, + "logps/chosen": -25.72981834411621, + "logps/rejected": -164.4043426513672, + "loss": 25.0388, + "losses_ref": -5.769023118773475e-05, + "ref_logps/chosen": -107.21466064453125, + "ref_logps/rejected": -99.42767333984375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.48484802246094, + "rewards/margins": 146.46151733398438, + "rewards/rejected": -64.97667694091797, + "step": 1000, + "u": -2.902114152908325, + "weight": 0.031251829117536545 + }, + { + "diff_generated": -61.70808029174805, + "epoch": 0.3272845106934543, + "grad_norm": 605.9969099129746, + "learning_rate": 7.997993895807128e-07, + "logits/chosen": -2.517409086227417, + "logits/rejected": -2.5834014415740967, + "logps/chosen": -25.087865829467773, + "logps/rejected": -156.69766235351562, + "loss": 25.552, + "losses_ref": -1.016873193293577e-05, + "ref_logps/chosen": -105.58438873291016, + "ref_logps/rejected": -94.9896011352539, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.49651336669922, + "rewards/margins": 142.20458984375, + "rewards/rejected": -61.70808029174805, + "step": 1010, + "u": -2.864668846130371, + "weight": 0.04375026375055313 + }, + { + "diff_generated": -60.59503936767578, + "epoch": 0.3305249513933895, + "grad_norm": 563.1180792818357, + "learning_rate": 7.997487873747646e-07, + "logits/chosen": -2.4820923805236816, + "logits/rejected": -2.553140163421631, + "logps/chosen": -23.072246551513672, + "logps/rejected": -154.8348846435547, + "loss": 23.6096, + "losses_ref": -0.0016669088508933783, + "ref_logps/chosen": -102.42779541015625, + "ref_logps/rejected": -94.2398681640625, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.35554504394531, + "rewards/margins": 139.95057678222656, + "rewards/rejected": -60.59503936767578, + "step": 1020, + "u": -2.789760112762451, + "weight": 0.06876659393310547 + }, + { + "diff_generated": -61.9303092956543, + "epoch": 0.3337653920933247, + "grad_norm": 650.6903479860271, + "learning_rate": 7.996925020352465e-07, + "logits/chosen": -2.4720969200134277, + "logits/rejected": -2.5271055698394775, + "logps/chosen": -27.77056312561035, + "logps/rejected": -156.66505432128906, + "loss": 26.3679, + "losses_ref": -0.02138182893395424, + "ref_logps/chosen": -108.66754150390625, + "ref_logps/rejected": -94.73475646972656, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.89697265625, + "rewards/margins": 142.82730102539062, + "rewards/rejected": -61.9303092956543, + "step": 1030, + "u": -2.864337921142578, + "weight": 0.04409436136484146 + }, + { + "diff_generated": -60.864776611328125, + "epoch": 0.3370058327932599, + "grad_norm": 549.2909964864288, + "learning_rate": 7.99630534362354e-07, + "logits/chosen": -2.4408340454101562, + "logits/rejected": -2.5210554599761963, + "logps/chosen": -22.748125076293945, + "logps/rejected": -153.09182739257812, + "loss": 25.3284, + "losses_ref": -4.615772013494279e-06, + "ref_logps/chosen": -98.7553482055664, + "ref_logps/rejected": -92.22706604003906, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.00723266601562, + "rewards/margins": 136.87200927734375, + "rewards/rejected": -60.864776611328125, + "step": 1040, + "u": -2.789775848388672, + "weight": 0.06875006854534149 + }, + { + "diff_generated": -60.370582580566406, + "epoch": 0.3402462734931951, + "grad_norm": 555.5935481482165, + "learning_rate": 7.995628852370667e-07, + "logits/chosen": -2.4244742393493652, + "logits/rejected": -2.5339009761810303, + "logps/chosen": -24.179418563842773, + "logps/rejected": -154.94049072265625, + "loss": 25.4596, + "losses_ref": -4.167252882325556e-06, + "ref_logps/chosen": -100.73070526123047, + "ref_logps/rejected": -94.56989288330078, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.55128479003906, + "rewards/margins": 136.921875, + "rewards/rejected": -60.370582580566406, + "step": 1050, + "u": -2.8272223472595215, + "weight": 0.05625007301568985 + }, + { + "diff_generated": -62.10785675048828, + "epoch": 0.34348671419313026, + "grad_norm": 548.8503909872417, + "learning_rate": 7.994895556211363e-07, + "logits/chosen": -2.4397008419036865, + "logits/rejected": -2.5903897285461426, + "logps/chosen": -24.546621322631836, + "logps/rejected": -156.2115478515625, + "loss": 24.2978, + "losses_ref": -3.29654649249278e-05, + "ref_logps/chosen": -101.30360412597656, + "ref_logps/rejected": -94.10367584228516, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.75697326660156, + "rewards/margins": 138.86483764648438, + "rewards/rejected": -62.10785675048828, + "step": 1060, + "u": -2.864668130874634, + "weight": 0.043751008808612823 + }, + { + "diff_generated": -62.08075714111328, + "epoch": 0.34672715489306544, + "grad_norm": 584.8071568694654, + "learning_rate": 7.994105465570722e-07, + "logits/chosen": -2.4493916034698486, + "logits/rejected": -2.507539749145508, + "logps/chosen": -27.621841430664062, + "logps/rejected": -154.74923706054688, + "loss": 24.9576, + "losses_ref": -0.0002413702168269083, + "ref_logps/chosen": -107.9101791381836, + "ref_logps/rejected": -92.66847229003906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.28834533691406, + "rewards/margins": 142.36911010742188, + "rewards/rejected": -62.08075714111328, + "step": 1070, + "u": -2.8084914684295654, + "weight": 0.0625080019235611 + }, + { + "diff_generated": -61.06934356689453, + "epoch": 0.34996759559300067, + "grad_norm": 609.031497017433, + "learning_rate": 7.993258591681279e-07, + "logits/chosen": -2.425107955932617, + "logits/rejected": -2.503246784210205, + "logps/chosen": -25.235126495361328, + "logps/rejected": -150.1800537109375, + "loss": 25.5887, + "losses_ref": -0.08424490690231323, + "ref_logps/chosen": -100.81827545166016, + "ref_logps/rejected": -89.11073303222656, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 75.58314514160156, + "rewards/margins": 136.65249633789062, + "rewards/rejected": -61.06934356689453, + "step": 1080, + "u": -2.7504265308380127, + "weight": 0.08297665417194366 + }, + { + "diff_generated": -62.70374298095703, + "epoch": 0.35320803629293585, + "grad_norm": 560.1491886481731, + "learning_rate": 7.992354946582836e-07, + "logits/chosen": -2.471681594848633, + "logits/rejected": -2.573151111602783, + "logps/chosen": -23.826358795166016, + "logps/rejected": -158.4449005126953, + "loss": 24.6076, + "losses_ref": -0.009558220393955708, + "ref_logps/chosen": -103.78206634521484, + "ref_logps/rejected": -95.74115753173828, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.95570373535156, + "rewards/margins": 142.65943908691406, + "rewards/rejected": -62.70374298095703, + "step": 1090, + "u": -2.8645691871643066, + "weight": 0.043854910880327225 + }, + { + "diff_generated": -58.56966018676758, + "epoch": 0.356448476992871, + "grad_norm": 620.9834538385791, + "learning_rate": 7.991394543122304e-07, + "logits/chosen": -2.4512717723846436, + "logits/rejected": -2.535726547241211, + "logps/chosen": -25.30792999267578, + "logps/rejected": -149.34432983398438, + "loss": 24.9501, + "losses_ref": -0.04097381606698036, + "ref_logps/chosen": -102.82730865478516, + "ref_logps/rejected": -90.77467346191406, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.51936340332031, + "rewards/margins": 136.08901977539062, + "rewards/rejected": -58.56966018676758, + "step": 1100, + "u": -2.844625949859619, + "weight": 0.051252931356430054 + }, + { + "diff_generated": -59.12998580932617, + "epoch": 0.3596889176928062, + "grad_norm": 564.9184901933917, + "learning_rate": 7.990377394953507e-07, + "logits/chosen": -2.425802230834961, + "logits/rejected": -2.565328598022461, + "logps/chosen": -24.115642547607422, + "logps/rejected": -155.70703125, + "loss": 24.4545, + "losses_ref": -6.220198702067137e-05, + "ref_logps/chosen": -103.17607116699219, + "ref_logps/rejected": -96.57704162597656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.0604248046875, + "rewards/margins": 138.19041442871094, + "rewards/rejected": -59.12998580932617, + "step": 1110, + "u": -2.8459436893463135, + "weight": 0.050002049654722214 + }, + { + "diff_generated": -60.0655517578125, + "epoch": 0.36292935839274143, + "grad_norm": 532.0960030236337, + "learning_rate": 7.989303516537001e-07, + "logits/chosen": -2.4729647636413574, + "logits/rejected": -2.59649920463562, + "logps/chosen": -20.899898529052734, + "logps/rejected": -154.85513305664062, + "loss": 24.8186, + "losses_ref": -0.010020644403994083, + "ref_logps/chosen": -99.94255065917969, + "ref_logps/rejected": -94.7895736694336, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.04264831542969, + "rewards/margins": 139.10821533203125, + "rewards/rejected": -60.0655517578125, + "step": 1120, + "u": -2.8455822467803955, + "weight": 0.050371576100587845 + }, + { + "diff_generated": -61.76154327392578, + "epoch": 0.3661697990926766, + "grad_norm": 611.0123967345178, + "learning_rate": 7.98817292313986e-07, + "logits/chosen": -2.5177502632141113, + "logits/rejected": -2.6054959297180176, + "logps/chosen": -28.331207275390625, + "logps/rejected": -160.3716583251953, + "loss": 24.3842, + "losses_ref": -0.01926909014582634, + "ref_logps/chosen": -111.9024658203125, + "ref_logps/rejected": -98.61011505126953, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 83.57127380371094, + "rewards/margins": 145.3328094482422, + "rewards/rejected": -61.76154327392578, + "step": 1130, + "u": -2.883004903793335, + "weight": 0.03790263459086418 + }, + { + "diff_generated": -57.56560134887695, + "epoch": 0.3694102397926118, + "grad_norm": 565.7437114420287, + "learning_rate": 7.986985630835463e-07, + "logits/chosen": -2.449852466583252, + "logits/rejected": -2.524470090866089, + "logps/chosen": -24.552753448486328, + "logps/rejected": -152.75930786132812, + "loss": 24.5576, + "losses_ref": -0.0029926716815680265, + "ref_logps/chosen": -106.67436218261719, + "ref_logps/rejected": -95.19371032714844, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 82.12162017822266, + "rewards/margins": 139.68722534179688, + "rewards/rejected": -57.56560134887695, + "step": 1140, + "u": -2.9207377433776855, + "weight": 0.02510598860681057 + }, + { + "diff_generated": -56.214500427246094, + "epoch": 0.37265068049254696, + "grad_norm": 585.7143637032268, + "learning_rate": 7.985741656503261e-07, + "logits/chosen": -2.490262508392334, + "logits/rejected": -2.5420994758605957, + "logps/chosen": -28.755840301513672, + "logps/rejected": -147.6808319091797, + "loss": 25.1618, + "losses_ref": -0.20432765781879425, + "ref_logps/chosen": -107.40836334228516, + "ref_logps/rejected": -91.4663314819336, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.65251922607422, + "rewards/margins": 134.8670196533203, + "rewards/rejected": -56.214500427246094, + "step": 1150, + "u": -2.8224270343780518, + "weight": 0.059785760939121246 + }, + { + "diff_generated": -59.897605895996094, + "epoch": 0.3758911211924822, + "grad_norm": 570.2360137354781, + "learning_rate": 7.984441017828543e-07, + "logits/chosen": -2.4570813179016113, + "logits/rejected": -2.553859233856201, + "logps/chosen": -25.598440170288086, + "logps/rejected": -154.27281188964844, + "loss": 24.3611, + "losses_ref": -0.20918317139148712, + "ref_logps/chosen": -106.09423828125, + "ref_logps/rejected": -94.37522888183594, + "rewards/accuracies": 0.96875, + "rewards/chosen": 80.49581146240234, + "rewards/margins": 140.39340209960938, + "rewards/rejected": -59.897605895996094, + "step": 1160, + "u": -2.899475336074829, + "weight": 0.0337781198322773 + }, + { + "diff_generated": -58.429527282714844, + "epoch": 0.37913156189241737, + "grad_norm": 554.8138369309839, + "learning_rate": 7.983083733302178e-07, + "logits/chosen": -2.5052692890167236, + "logits/rejected": -2.5308260917663574, + "logps/chosen": -25.392253875732422, + "logps/rejected": -152.5596466064453, + "loss": 24.5401, + "losses_ref": -0.0005921843112446368, + "ref_logps/chosen": -103.8319091796875, + "ref_logps/rejected": -94.13011932373047, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.43965148925781, + "rewards/margins": 136.86917114257812, + "rewards/rejected": -58.429527282714844, + "step": 1170, + "u": -2.845940589904785, + "weight": 0.0500054657459259 + }, + { + "diff_generated": -57.435081481933594, + "epoch": 0.38237200259235254, + "grad_norm": 558.5165169858202, + "learning_rate": 7.98166982222036e-07, + "logits/chosen": -2.4755935668945312, + "logits/rejected": -2.5091288089752197, + "logps/chosen": -25.46775245666504, + "logps/rejected": -147.2650146484375, + "loss": 24.6859, + "losses_ref": -0.08600326627492905, + "ref_logps/chosen": -106.25887298583984, + "ref_logps/rejected": -89.82994079589844, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.79112243652344, + "rewards/margins": 138.2261962890625, + "rewards/rejected": -57.435081481933594, + "step": 1180, + "u": -2.8637096881866455, + "weight": 0.044689107686281204 + }, + { + "diff_generated": -60.880401611328125, + "epoch": 0.3856124432922878, + "grad_norm": 553.3360257259751, + "learning_rate": 7.980199304684328e-07, + "logits/chosen": -2.449470043182373, + "logits/rejected": -2.5108659267425537, + "logps/chosen": -25.85076332092285, + "logps/rejected": -159.79776000976562, + "loss": 24.5164, + "losses_ref": -0.00033252747380174696, + "ref_logps/chosen": -108.08426666259766, + "ref_logps/rejected": -98.91737365722656, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.23350524902344, + "rewards/margins": 143.11392211914062, + "rewards/rejected": -60.880401611328125, + "step": 1190, + "u": -2.8833820819854736, + "weight": 0.037510983645915985 + }, + { + "diff_generated": -58.96329879760742, + "epoch": 0.38885288399222295, + "grad_norm": 564.139791300194, + "learning_rate": 7.978672201600077e-07, + "logits/chosen": -2.426279306411743, + "logits/rejected": -2.546734571456909, + "logps/chosen": -23.6950740814209, + "logps/rejected": -151.8271484375, + "loss": 23.8821, + "losses_ref": -8.829331636661664e-05, + "ref_logps/chosen": -101.88237762451172, + "ref_logps/rejected": -92.86384582519531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.18730926513672, + "rewards/margins": 137.15060424804688, + "rewards/rejected": -58.96329879760742, + "step": 1200, + "u": -2.8084969520568848, + "weight": 0.06250225752592087 + }, + { + "diff_generated": -61.696739196777344, + "epoch": 0.39209332469215813, + "grad_norm": 542.9205356291774, + "learning_rate": 7.97708853467807e-07, + "logits/chosen": -2.4830985069274902, + "logits/rejected": -2.617919445037842, + "logps/chosen": -22.31697654724121, + "logps/rejected": -156.75045776367188, + "loss": 23.8798, + "losses_ref": -0.00018334609922021627, + "ref_logps/chosen": -104.34793853759766, + "ref_logps/rejected": -95.05369567871094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 82.03095245361328, + "rewards/margins": 143.72769165039062, + "rewards/rejected": -61.696739196777344, + "step": 1210, + "u": -2.902114152908325, + "weight": 0.03125162795186043 + }, + { + "diff_generated": -58.81583786010742, + "epoch": 0.3953337653920933, + "grad_norm": 578.038256412928, + "learning_rate": 7.975448326432927e-07, + "logits/chosen": -2.465670347213745, + "logits/rejected": -2.5721380710601807, + "logps/chosen": -24.39229965209961, + "logps/rejected": -154.65382385253906, + "loss": 24.1119, + "losses_ref": -0.010452238842844963, + "ref_logps/chosen": -102.02569580078125, + "ref_logps/rejected": -95.83799743652344, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.63338470458984, + "rewards/margins": 136.44923400878906, + "rewards/rejected": -58.81583786010742, + "step": 1220, + "u": -2.864520311355591, + "weight": 0.04390513524413109 + }, + { + "diff_generated": -60.96055221557617, + "epoch": 0.39857420609202854, + "grad_norm": 557.8533704994326, + "learning_rate": 7.973751600183094e-07, + "logits/chosen": -2.4817824363708496, + "logits/rejected": -2.5495259761810303, + "logps/chosen": -25.767566680908203, + "logps/rejected": -155.77487182617188, + "loss": 25.223, + "losses_ref": -0.004706860054284334, + "ref_logps/chosen": -107.4765853881836, + "ref_logps/rejected": -94.81434631347656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.70903015136719, + "rewards/margins": 142.66958618164062, + "rewards/rejected": -60.96055221557617, + "step": 1230, + "u": -2.902073383331299, + "weight": 0.03129463642835617 + }, + { + "diff_generated": -59.66600799560547, + "epoch": 0.4018146467919637, + "grad_norm": 557.6145452725019, + "learning_rate": 7.971998380050529e-07, + "logits/chosen": -2.4542365074157715, + "logits/rejected": -2.5326638221740723, + "logps/chosen": -25.42231559753418, + "logps/rejected": -148.4939422607422, + "loss": 24.7245, + "losses_ref": -0.324226438999176, + "ref_logps/chosen": -107.45170593261719, + "ref_logps/rejected": -88.82794952392578, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 82.02940368652344, + "rewards/margins": 141.69540405273438, + "rewards/rejected": -59.66600799560547, + "step": 1240, + "u": -2.9268898963928223, + "weight": 0.024464670568704605 + }, + { + "diff_generated": -63.06318283081055, + "epoch": 0.4050550874918989, + "grad_norm": 505.62122107242897, + "learning_rate": 7.970188690960343e-07, + "logits/chosen": -2.3925909996032715, + "logits/rejected": -2.5492215156555176, + "logps/chosen": -20.986413955688477, + "logps/rejected": -155.66217041015625, + "loss": 23.3582, + "losses_ref": -2.983683043566998e-05, + "ref_logps/chosen": -98.38398742675781, + "ref_logps/rejected": -92.59899139404297, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.39756774902344, + "rewards/margins": 140.4607696533203, + "rewards/rejected": -63.06318283081055, + "step": 1250, + "u": -2.827221393585205, + "weight": 0.056250907480716705 + }, + { + "diff_generated": -62.156578063964844, + "epoch": 0.40829552819183407, + "grad_norm": 561.2620162838996, + "learning_rate": 7.968322558640458e-07, + "logits/chosen": -2.413391590118408, + "logits/rejected": -2.539297103881836, + "logps/chosen": -24.1887149810791, + "logps/rejected": -156.310791015625, + "loss": 24.7436, + "losses_ref": -0.2747315764427185, + "ref_logps/chosen": -103.8145523071289, + "ref_logps/rejected": -94.15421295166016, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.6258316040039, + "rewards/margins": 141.78240966796875, + "rewards/rejected": -62.156578063964844, + "step": 1260, + "u": -2.8222765922546387, + "weight": 0.05984902381896973 + }, + { + "diff_generated": -58.499603271484375, + "epoch": 0.4115359688917693, + "grad_norm": 603.3977602834112, + "learning_rate": 7.966400009621233e-07, + "logits/chosen": -2.4518990516662598, + "logits/rejected": -2.536153793334961, + "logps/chosen": -24.702558517456055, + "logps/rejected": -147.30894470214844, + "loss": 25.1141, + "losses_ref": -0.006268263794481754, + "ref_logps/chosen": -101.77778625488281, + "ref_logps/rejected": -88.80934143066406, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 77.07522583007812, + "rewards/margins": 135.5748291015625, + "rewards/rejected": -58.499603271484375, + "step": 1270, + "u": -2.752103567123413, + "weight": 0.08148328959941864 + }, + { + "diff_generated": -57.64625930786133, + "epoch": 0.4147764095917045, + "grad_norm": 528.115963112855, + "learning_rate": 7.964421071235092e-07, + "logits/chosen": -2.411189079284668, + "logits/rejected": -2.5435092449188232, + "logps/chosen": -23.354013442993164, + "logps/rejected": -145.05821228027344, + "loss": 24.0461, + "losses_ref": -0.00014029743033461273, + "ref_logps/chosen": -96.66365051269531, + "ref_logps/rejected": -87.4119644165039, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 73.30963897705078, + "rewards/margins": 130.95590209960938, + "rewards/rejected": -57.64625930786133, + "step": 1280, + "u": -2.7523257732391357, + "weight": 0.08125344663858414 + }, + { + "diff_generated": -57.99530029296875, + "epoch": 0.41801685029163965, + "grad_norm": 551.1264166317477, + "learning_rate": 7.962385771616133e-07, + "logits/chosen": -2.436392307281494, + "logits/rejected": -2.4846584796905518, + "logps/chosen": -24.565990447998047, + "logps/rejected": -145.689208984375, + "loss": 24.0086, + "losses_ref": -0.0034762464929372072, + "ref_logps/chosen": -102.29422760009766, + "ref_logps/rejected": -87.69392395019531, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 77.72823333740234, + "rewards/margins": 135.72354125976562, + "rewards/rejected": -57.99530029296875, + "step": 1290, + "u": -2.771005392074585, + "weight": 0.07504956424236298 + }, + { + "diff_generated": -60.86104202270508, + "epoch": 0.42125729099157483, + "grad_norm": 509.1172301700267, + "learning_rate": 7.960294139699724e-07, + "logits/chosen": -2.453728675842285, + "logits/rejected": -2.5464932918548584, + "logps/chosen": -22.965999603271484, + "logps/rejected": -156.35317993164062, + "loss": 23.5007, + "losses_ref": -5.912642882321961e-05, + "ref_logps/chosen": -104.04813385009766, + "ref_logps/rejected": -95.4921646118164, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.08213806152344, + "rewards/margins": 141.94317626953125, + "rewards/rejected": -60.86104202270508, + "step": 1300, + "u": -2.827220916748047, + "weight": 0.05625147372484207 + }, + { + "diff_generated": -64.40738677978516, + "epoch": 0.42449773169151006, + "grad_norm": 565.5747858592057, + "learning_rate": 7.958146205222102e-07, + "logits/chosen": -2.42254900932312, + "logits/rejected": -2.5176830291748047, + "logps/chosen": -22.295743942260742, + "logps/rejected": -158.38809204101562, + "loss": 23.8388, + "losses_ref": -4.426790383149637e-06, + "ref_logps/chosen": -107.21588134765625, + "ref_logps/rejected": -93.98070526123047, + "rewards/accuracies": 0.96875, + "rewards/chosen": 84.92013549804688, + "rewards/margins": 149.3275146484375, + "rewards/rejected": -64.40738677978516, + "step": 1310, + "u": -2.9021153450012207, + "weight": 0.031250111758708954 + }, + { + "diff_generated": -59.734657287597656, + "epoch": 0.42773817239144524, + "grad_norm": 550.6142392765456, + "learning_rate": 7.955941998719939e-07, + "logits/chosen": -2.4167730808258057, + "logits/rejected": -2.500880479812622, + "logps/chosen": -24.408432006835938, + "logps/rejected": -151.30154418945312, + "loss": 23.573, + "losses_ref": -2.2531708054884803e-06, + "ref_logps/chosen": -100.78544616699219, + "ref_logps/rejected": -91.56687927246094, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 76.37702941894531, + "rewards/margins": 136.11166381835938, + "rewards/rejected": -59.734657287597656, + "step": 1320, + "u": -2.7523293495178223, + "weight": 0.08125004917383194 + }, + { + "diff_generated": -59.58417892456055, + "epoch": 0.4309786130913804, + "grad_norm": 538.206305925096, + "learning_rate": 7.953681551529918e-07, + "logits/chosen": -2.3940765857696533, + "logits/rejected": -2.492981433868408, + "logps/chosen": -21.914344787597656, + "logps/rejected": -151.33755493164062, + "loss": 23.4911, + "losses_ref": -0.0006687212735414505, + "ref_logps/chosen": -102.36250305175781, + "ref_logps/rejected": -91.75337219238281, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.44816589355469, + "rewards/margins": 140.03233337402344, + "rewards/rejected": -59.58417892456055, + "step": 1330, + "u": -2.827209949493408, + "weight": 0.05626332759857178 + }, + { + "diff_generated": -62.592247009277344, + "epoch": 0.43421905379131565, + "grad_norm": 622.2994752353687, + "learning_rate": 7.951364895788277e-07, + "logits/chosen": -2.4446358680725098, + "logits/rejected": -2.5049664974212646, + "logps/chosen": -22.867691040039062, + "logps/rejected": -156.77749633789062, + "loss": 23.5277, + "losses_ref": -0.007556392345577478, + "ref_logps/chosen": -105.43354797363281, + "ref_logps/rejected": -94.18524169921875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.56585693359375, + "rewards/margins": 145.15811157226562, + "rewards/rejected": -62.592247009277344, + "step": 1340, + "u": -2.8645882606506348, + "weight": 0.04383472353219986 + }, + { + "diff_generated": -60.754119873046875, + "epoch": 0.4374594944912508, + "grad_norm": 836.0275761692405, + "learning_rate": 7.948992064430363e-07, + "logits/chosen": -2.456305742263794, + "logits/rejected": -2.576406478881836, + "logps/chosen": -24.8270206451416, + "logps/rejected": -156.05142211914062, + "loss": 24.4547, + "losses_ref": -0.00010340138396713883, + "ref_logps/chosen": -105.81303405761719, + "ref_logps/rejected": -95.29730224609375, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.98602294921875, + "rewards/margins": 141.74014282226562, + "rewards/rejected": -60.754119873046875, + "step": 1350, + "u": -2.8646674156188965, + "weight": 0.04375159740447998 + }, + { + "diff_generated": -60.79752731323242, + "epoch": 0.440699935191186, + "grad_norm": 577.1583425207615, + "learning_rate": 7.946563091190154e-07, + "logits/chosen": -2.4474077224731445, + "logits/rejected": -2.5316262245178223, + "logps/chosen": -24.314289093017578, + "logps/rejected": -152.74404907226562, + "loss": 24.7954, + "losses_ref": -0.00924721360206604, + "ref_logps/chosen": -103.41661071777344, + "ref_logps/rejected": -91.94651794433594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.1023178100586, + "rewards/margins": 139.89984130859375, + "rewards/rejected": -60.79752731323242, + "step": 1360, + "u": -2.845734119415283, + "weight": 0.05022118240594864 + }, + { + "diff_generated": -58.5832633972168, + "epoch": 0.4439403758911212, + "grad_norm": 568.7873604751329, + "learning_rate": 7.944078010599788e-07, + "logits/chosen": -2.4737284183502197, + "logits/rejected": -2.477033853530884, + "logps/chosen": -25.391706466674805, + "logps/rejected": -149.84629821777344, + "loss": 23.8622, + "losses_ref": -1.4776187526877038e-05, + "ref_logps/chosen": -106.20867919921875, + "ref_logps/rejected": -91.2630386352539, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.81697845458984, + "rewards/margins": 139.40023803710938, + "rewards/rejected": -58.5832633972168, + "step": 1370, + "u": -2.771052122116089, + "weight": 0.07500027120113373 + }, + { + "diff_generated": -61.097816467285156, + "epoch": 0.4471808165910564, + "grad_norm": 543.5220460792306, + "learning_rate": 7.941536857989063e-07, + "logits/chosen": -2.3851571083068848, + "logits/rejected": -2.4775819778442383, + "logps/chosen": -24.525358200073242, + "logps/rejected": -156.26597595214844, + "loss": 23.8074, + "losses_ref": -0.012715299613773823, + "ref_logps/chosen": -103.4111099243164, + "ref_logps/rejected": -95.16815185546875, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 78.88575744628906, + "rewards/margins": 139.9835662841797, + "rewards/rejected": -61.097816467285156, + "step": 1380, + "u": -2.789517879486084, + "weight": 0.06901643425226212 + }, + { + "diff_generated": -60.606903076171875, + "epoch": 0.4504212572909916, + "grad_norm": 531.0683398534825, + "learning_rate": 7.938939669484943e-07, + "logits/chosen": -2.42455792427063, + "logits/rejected": -2.5245718955993652, + "logps/chosen": -21.008312225341797, + "logps/rejected": -153.11978149414062, + "loss": 23.3139, + "losses_ref": -0.0068437992595136166, + "ref_logps/chosen": -102.11225891113281, + "ref_logps/rejected": -92.51286315917969, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.10395050048828, + "rewards/margins": 141.71084594726562, + "rewards/rejected": -60.606903076171875, + "step": 1390, + "u": -2.86457896232605, + "weight": 0.043844155967235565 + }, + { + "diff_generated": -59.722434997558594, + "epoch": 0.45366169799092676, + "grad_norm": 524.0223907923139, + "learning_rate": 7.936286482011041e-07, + "logits/chosen": -2.39502215385437, + "logits/rejected": -2.4825236797332764, + "logps/chosen": -24.45802879333496, + "logps/rejected": -154.26536560058594, + "loss": 24.7615, + "losses_ref": -5.191981836105697e-05, + "ref_logps/chosen": -104.090576171875, + "ref_logps/rejected": -94.54292297363281, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.6325454711914, + "rewards/margins": 139.35499572753906, + "rewards/rejected": -59.722434997558594, + "step": 1400, + "u": -2.845944404602051, + "weight": 0.050001360476017 + }, + { + "diff_generated": -67.22505187988281, + "epoch": 0.45690213869086194, + "grad_norm": 530.0518343533638, + "learning_rate": 7.933577333287091e-07, + "logits/chosen": -2.3679230213165283, + "logits/rejected": -2.526124954223633, + "logps/chosen": -23.136608123779297, + "logps/rejected": -163.09498596191406, + "loss": 22.6513, + "losses_ref": -0.00027964648324996233, + "ref_logps/chosen": -101.16484832763672, + "ref_logps/rejected": -95.86993408203125, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.02824401855469, + "rewards/margins": 145.2532958984375, + "rewards/rejected": -67.22505187988281, + "step": 1410, + "u": -2.8646605014801025, + "weight": 0.043758973479270935 + }, + { + "diff_generated": -61.26875686645508, + "epoch": 0.46014257939079717, + "grad_norm": 518.2332001958645, + "learning_rate": 7.930812261828421e-07, + "logits/chosen": -2.4148430824279785, + "logits/rejected": -2.514356851577759, + "logps/chosen": -27.328014373779297, + "logps/rejected": -155.63111877441406, + "loss": 24.8621, + "losses_ref": -0.0005711294361390173, + "ref_logps/chosen": -104.68330383300781, + "ref_logps/rejected": -94.36234283447266, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.35530090332031, + "rewards/margins": 138.62403869628906, + "rewards/rejected": -61.26875686645508, + "step": 1420, + "u": -2.864650249481201, + "weight": 0.04376951605081558 + }, + { + "diff_generated": -60.6711540222168, + "epoch": 0.46338302009073234, + "grad_norm": 517.7182030021377, + "learning_rate": 7.92799130694539e-07, + "logits/chosen": -2.453164577484131, + "logits/rejected": -2.522785186767578, + "logps/chosen": -23.800975799560547, + "logps/rejected": -153.56021118164062, + "loss": 22.8541, + "losses_ref": -6.637422302446794e-06, + "ref_logps/chosen": -104.95344543457031, + "ref_logps/rejected": -92.88905334472656, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 81.15245056152344, + "rewards/margins": 141.82363891601562, + "rewards/rejected": -60.6711540222168, + "step": 1430, + "u": -2.7897756099700928, + "weight": 0.06875012069940567 + }, + { + "diff_generated": -60.21378707885742, + "epoch": 0.4666234607906675, + "grad_norm": 540.4048125029803, + "learning_rate": 7.925114508742848e-07, + "logits/chosen": -2.4429352283477783, + "logits/rejected": -2.55712890625, + "logps/chosen": -22.36754035949707, + "logps/rejected": -153.32528686523438, + "loss": 23.7542, + "losses_ref": -2.3834094463381916e-05, + "ref_logps/chosen": -99.67716979980469, + "ref_logps/rejected": -93.11149597167969, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.30963897705078, + "rewards/margins": 137.52340698242188, + "rewards/rejected": -60.21378707885742, + "step": 1440, + "u": -2.845945119857788, + "weight": 0.05000070855021477 + }, + { + "diff_generated": -68.31291198730469, + "epoch": 0.4698639014906027, + "grad_norm": 527.0371137800358, + "learning_rate": 7.92218190811955e-07, + "logits/chosen": -2.4218368530273438, + "logits/rejected": -2.556283712387085, + "logps/chosen": -23.499523162841797, + "logps/rejected": -165.60202026367188, + "loss": 23.0451, + "losses_ref": -0.00014205127081368119, + "ref_logps/chosen": -103.48432922363281, + "ref_logps/rejected": -97.28910827636719, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.98480224609375, + "rewards/margins": 148.2977294921875, + "rewards/rejected": -68.31291198730469, + "step": 1450, + "u": -2.8646674156188965, + "weight": 0.043751902878284454 + }, + { + "diff_generated": -63.3177375793457, + "epoch": 0.47310434219053793, + "grad_norm": 521.6127747066482, + "learning_rate": 7.919193546767581e-07, + "logits/chosen": -2.414783239364624, + "logits/rejected": -2.4879963397979736, + "logps/chosen": -23.851436614990234, + "logps/rejected": -156.83462524414062, + "loss": 23.6939, + "losses_ref": -0.0012180242920294404, + "ref_logps/chosen": -102.31185913085938, + "ref_logps/rejected": -93.51688385009766, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.4604263305664, + "rewards/margins": 141.77816772460938, + "rewards/rejected": -63.3177375793457, + "step": 1460, + "u": -2.864628791809082, + "weight": 0.04379221796989441 + }, + { + "diff_generated": -63.06159591674805, + "epoch": 0.4763447828904731, + "grad_norm": 493.6876430773276, + "learning_rate": 7.916149467171768e-07, + "logits/chosen": -2.43074631690979, + "logits/rejected": -2.5403378009796143, + "logps/chosen": -20.250608444213867, + "logps/rejected": -152.62335205078125, + "loss": 23.1336, + "losses_ref": -0.0006198140908963978, + "ref_logps/chosen": -99.69451904296875, + "ref_logps/rejected": -89.56175231933594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 79.44390869140625, + "rewards/margins": 142.50550842285156, + "rewards/rejected": -63.06159591674805, + "step": 1470, + "u": -2.808478355407715, + "weight": 0.06252166628837585 + }, + { + "diff_generated": -60.9228515625, + "epoch": 0.4795852235904083, + "grad_norm": 491.6094579870981, + "learning_rate": 7.913049712609066e-07, + "logits/chosen": -2.4075615406036377, + "logits/rejected": -2.505866765975952, + "logps/chosen": -22.791784286499023, + "logps/rejected": -151.3705596923828, + "loss": 22.7425, + "losses_ref": -8.824100950732827e-05, + "ref_logps/chosen": -101.34063720703125, + "ref_logps/rejected": -90.44770050048828, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 78.5488510131836, + "rewards/margins": 139.47171020507812, + "rewards/rejected": -60.9228515625, + "step": 1480, + "u": -2.771050214767456, + "weight": 0.07500235736370087 + }, + { + "diff_generated": -61.94826126098633, + "epoch": 0.48282566429034346, + "grad_norm": 522.0449909951569, + "learning_rate": 7.909894327147949e-07, + "logits/chosen": -2.4437947273254395, + "logits/rejected": -2.546353340148926, + "logps/chosen": -23.505924224853516, + "logps/rejected": -156.75686645507812, + "loss": 23.2703, + "losses_ref": -0.00494359340518713, + "ref_logps/chosen": -104.78910827636719, + "ref_logps/rejected": -94.80860900878906, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.28319549560547, + "rewards/margins": 143.23146057128906, + "rewards/rejected": -61.94826126098633, + "step": 1490, + "u": -2.8833258152008057, + "weight": 0.03756953403353691 + }, + { + "diff_generated": -63.96868896484375, + "epoch": 0.4860661049902787, + "grad_norm": 550.7892278050978, + "learning_rate": 7.906683355647783e-07, + "logits/chosen": -2.4391255378723145, + "logits/rejected": -2.5381453037261963, + "logps/chosen": -22.410463333129883, + "logps/rejected": -160.52137756347656, + "loss": 23.0265, + "losses_ref": -0.0005110603524371982, + "ref_logps/chosen": -103.95661926269531, + "ref_logps/rejected": -96.55268859863281, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.54615020751953, + "rewards/margins": 145.5148468017578, + "rewards/rejected": -63.96868896484375, + "step": 1500, + "u": -2.8833823204040527, + "weight": 0.03751056641340256 + }, + { + "diff_generated": -65.94358825683594, + "epoch": 0.48930654569021387, + "grad_norm": 563.2783409210464, + "learning_rate": 7.903416843758187e-07, + "logits/chosen": -2.4705750942230225, + "logits/rejected": -2.583146333694458, + "logps/chosen": -22.41499900817871, + "logps/rejected": -159.68580627441406, + "loss": 23.5309, + "losses_ref": -0.0003054616681765765, + "ref_logps/chosen": -102.96517181396484, + "ref_logps/rejected": -93.74221801757812, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.55018615722656, + "rewards/margins": 146.49375915527344, + "rewards/rejected": -65.94358825683594, + "step": 1510, + "u": -2.8084893226623535, + "weight": 0.06251008808612823 + }, + { + "diff_generated": -63.47590255737305, + "epoch": 0.49254698639014904, + "grad_norm": 520.3450542357509, + "learning_rate": 7.900094837918385e-07, + "logits/chosen": -2.455735921859741, + "logits/rejected": -2.539025068283081, + "logps/chosen": -26.121158599853516, + "logps/rejected": -160.17105102539062, + "loss": 24.0349, + "losses_ref": -2.6779718609759584e-05, + "ref_logps/chosen": -106.8904800415039, + "ref_logps/rejected": -96.69514465332031, + "rewards/accuracies": 0.96875, + "rewards/chosen": 80.76933288574219, + "rewards/margins": 144.24522399902344, + "rewards/rejected": -63.47590255737305, + "step": 1520, + "u": -2.9021151065826416, + "weight": 0.03125074505805969 + }, + { + "diff_generated": -62.85212326049805, + "epoch": 0.4957874270900843, + "grad_norm": 533.0122872675029, + "learning_rate": 7.896717385356545e-07, + "logits/chosen": -2.458217144012451, + "logits/rejected": -2.5963573455810547, + "logps/chosen": -22.722042083740234, + "logps/rejected": -160.3801727294922, + "loss": 22.55, + "losses_ref": -0.0010542498202994466, + "ref_logps/chosen": -104.97696685791016, + "ref_logps/rejected": -97.5280532836914, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 82.25492095947266, + "rewards/margins": 145.10702514648438, + "rewards/rejected": -62.85212326049805, + "step": 1530, + "u": -2.9208128452301025, + "weight": 0.025027502328157425 + }, + { + "diff_generated": -64.49129486083984, + "epoch": 0.49902786779001945, + "grad_norm": 585.5842895837126, + "learning_rate": 7.893284534089109e-07, + "logits/chosen": -2.4219799041748047, + "logits/rejected": -2.530082941055298, + "logps/chosen": -22.585084915161133, + "logps/rejected": -157.50753784179688, + "loss": 23.7577, + "losses_ref": -0.004659146536141634, + "ref_logps/chosen": -104.06495666503906, + "ref_logps/rejected": -93.01624298095703, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.47987365722656, + "rewards/margins": 145.97116088867188, + "rewards/rejected": -64.49129486083984, + "step": 1540, + "u": -2.883218288421631, + "weight": 0.037680789828300476 + }, + { + "diff_generated": -60.09766387939453, + "epoch": 0.5022683084899546, + "grad_norm": 511.6404508252784, + "learning_rate": 7.889796332920106e-07, + "logits/chosen": -2.388531446456909, + "logits/rejected": -2.518735408782959, + "logps/chosen": -21.837175369262695, + "logps/rejected": -149.7837677001953, + "loss": 22.649, + "losses_ref": -0.0007943492382764816, + "ref_logps/chosen": -98.83009338378906, + "ref_logps/rejected": -89.68608856201172, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.99291229248047, + "rewards/margins": 137.090576171875, + "rewards/rejected": -60.09766387939453, + "step": 1550, + "u": -2.7897555828094482, + "weight": 0.06877147406339645 + }, + { + "diff_generated": -62.22406768798828, + "epoch": 0.5055087491898899, + "grad_norm": 523.527989366479, + "learning_rate": 7.886252831440465e-07, + "logits/chosen": -2.445432186126709, + "logits/rejected": -2.570587158203125, + "logps/chosen": -24.73207664489746, + "logps/rejected": -159.7403564453125, + "loss": 23.151, + "losses_ref": -0.008048786781728268, + "ref_logps/chosen": -105.26564025878906, + "ref_logps/rejected": -97.51630401611328, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 80.53353881835938, + "rewards/margins": 142.75759887695312, + "rewards/rejected": -62.22406768798828, + "step": 1560, + "u": -2.9207725524902344, + "weight": 0.025069724768400192 + }, + { + "diff_generated": -64.640380859375, + "epoch": 0.508749189889825, + "grad_norm": 505.30002973356125, + "learning_rate": 7.882654080027304e-07, + "logits/chosen": -2.438961982727051, + "logits/rejected": -2.5489273071289062, + "logps/chosen": -22.685306549072266, + "logps/rejected": -161.23162841796875, + "loss": 23.5632, + "losses_ref": -0.00023544761643279344, + "ref_logps/chosen": -105.44688415527344, + "ref_logps/rejected": -96.59124755859375, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.76158142089844, + "rewards/margins": 147.40194702148438, + "rewards/rejected": -64.640380859375, + "step": 1570, + "u": -2.883385181427002, + "weight": 0.0375075526535511 + }, + { + "diff_generated": -59.41249465942383, + "epoch": 0.5119896305897602, + "grad_norm": 459.2871134228077, + "learning_rate": 7.879000129843218e-07, + "logits/chosen": -2.4876296520233154, + "logits/rejected": -2.530703544616699, + "logps/chosen": -26.23647689819336, + "logps/rejected": -152.01101684570312, + "loss": 23.1863, + "losses_ref": -0.11281219869852066, + "ref_logps/chosen": -105.07035827636719, + "ref_logps/rejected": -92.59850311279297, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 78.8338851928711, + "rewards/margins": 138.24636840820312, + "rewards/rejected": -59.41249465942383, + "step": 1580, + "u": -2.7689361572265625, + "weight": 0.07689359784126282 + }, + { + "diff_generated": -61.61199188232422, + "epoch": 0.5152300712896954, + "grad_norm": 524.6603640460535, + "learning_rate": 7.87529103283555e-07, + "logits/chosen": -2.496485471725464, + "logits/rejected": -2.5591673851013184, + "logps/chosen": -24.393762588500977, + "logps/rejected": -159.10330200195312, + "loss": 23.5204, + "losses_ref": -0.000706658698618412, + "ref_logps/chosen": -105.49637603759766, + "ref_logps/rejected": -97.49131774902344, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.10260772705078, + "rewards/margins": 142.71463012695312, + "rewards/rejected": -61.61199188232422, + "step": 1590, + "u": -2.827200412750244, + "weight": 0.056273262947797775 + }, + { + "diff_generated": -62.04583740234375, + "epoch": 0.5184705119896306, + "grad_norm": 486.00112587998933, + "learning_rate": 7.871526841735649e-07, + "logits/chosen": -2.460685968399048, + "logits/rejected": -2.5224931240081787, + "logps/chosen": -22.099401473999023, + "logps/rejected": -156.4468536376953, + "loss": 23.1256, + "losses_ref": -0.001140061765909195, + "ref_logps/chosen": -102.5761947631836, + "ref_logps/rejected": -94.4010009765625, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.47679901123047, + "rewards/margins": 142.5226287841797, + "rewards/rejected": -62.04583740234375, + "step": 1600, + "u": -2.827183723449707, + "weight": 0.05629079416394234 + }, + { + "diff_generated": -65.4512710571289, + "epoch": 0.5217109526895658, + "grad_norm": 466.42772903384247, + "learning_rate": 7.867707610058127e-07, + "logits/chosen": -2.4642579555511475, + "logits/rejected": -2.5653014183044434, + "logps/chosen": -22.1795654296875, + "logps/rejected": -161.69505310058594, + "loss": 24.3911, + "losses_ref": -0.0003676009946502745, + "ref_logps/chosen": -107.274658203125, + "ref_logps/rejected": -96.24378967285156, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.09510803222656, + "rewards/margins": 150.54637145996094, + "rewards/rejected": -65.4512710571289, + "step": 1610, + "u": -2.8646576404571533, + "weight": 0.04376210644841194 + }, + { + "diff_generated": -66.88667297363281, + "epoch": 0.5249513933895009, + "grad_norm": 524.7309648696627, + "learning_rate": 7.863833392100093e-07, + "logits/chosen": -2.3845362663269043, + "logits/rejected": -2.546915292739868, + "logps/chosen": -20.653705596923828, + "logps/rejected": -162.47006225585938, + "loss": 23.1285, + "losses_ref": -0.00017949177708942443, + "ref_logps/chosen": -100.44062805175781, + "ref_logps/rejected": -95.5833969116211, + "rewards/accuracies": 0.96875, + "rewards/chosen": 79.78692626953125, + "rewards/margins": 146.67359924316406, + "rewards/rejected": -66.88667297363281, + "step": 1620, + "u": -2.9021100997924805, + "weight": 0.03125579282641411 + }, + { + "diff_generated": -64.42332458496094, + "epoch": 0.5281918340894362, + "grad_norm": 521.3325633377792, + "learning_rate": 7.859904242940385e-07, + "logits/chosen": -2.4505696296691895, + "logits/rejected": -2.5312981605529785, + "logps/chosen": -22.638744354248047, + "logps/rejected": -158.0549774169922, + "loss": 23.2638, + "losses_ref": -0.0024958422873169184, + "ref_logps/chosen": -105.48319244384766, + "ref_logps/rejected": -93.63166809082031, + "rewards/accuracies": 0.96875, + "rewards/chosen": 82.84444427490234, + "rewards/margins": 147.26779174804688, + "rewards/rejected": -64.42332458496094, + "step": 1630, + "u": -2.902041435241699, + "weight": 0.031327806413173676 + }, + { + "diff_generated": -63.451873779296875, + "epoch": 0.5314322747893714, + "grad_norm": 622.6509381105307, + "learning_rate": 7.855920218438783e-07, + "logits/chosen": -2.4348044395446777, + "logits/rejected": -2.5007541179656982, + "logps/chosen": -22.81783676147461, + "logps/rejected": -154.0760955810547, + "loss": 24.1201, + "losses_ref": -9.59602912189439e-05, + "ref_logps/chosen": -103.3770523071289, + "ref_logps/rejected": -90.62422180175781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.5592041015625, + "rewards/margins": 144.01107788085938, + "rewards/rejected": -63.451873779296875, + "step": 1640, + "u": -2.845944881439209, + "weight": 0.05000091716647148 + }, + { + "diff_generated": -63.432151794433594, + "epoch": 0.5346727154893065, + "grad_norm": 546.6727877995182, + "learning_rate": 7.851881375235216e-07, + "logits/chosen": -2.4938337802886963, + "logits/rejected": -2.552638530731201, + "logps/chosen": -21.497270584106445, + "logps/rejected": -155.7118377685547, + "loss": 22.9475, + "losses_ref": -1.2555911780509632e-05, + "ref_logps/chosen": -104.17326354980469, + "ref_logps/rejected": -92.2796859741211, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 82.67601013183594, + "rewards/margins": 146.10818481445312, + "rewards/rejected": -63.432151794433594, + "step": 1650, + "u": -2.752329111099243, + "weight": 0.0812501534819603 + }, + { + "diff_generated": -63.13446044921875, + "epoch": 0.5379131561892417, + "grad_norm": 580.421752704657, + "learning_rate": 7.847787770748959e-07, + "logits/chosen": -2.4939372539520264, + "logits/rejected": -2.530005693435669, + "logps/chosen": -24.140911102294922, + "logps/rejected": -160.89926147460938, + "loss": 24.2631, + "losses_ref": -0.05048029497265816, + "ref_logps/chosen": -106.90547943115234, + "ref_logps/rejected": -97.76480865478516, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.76457214355469, + "rewards/margins": 145.89903259277344, + "rewards/rejected": -63.13446044921875, + "step": 1660, + "u": -2.8825602531433105, + "weight": 0.038321636617183685 + }, + { + "diff_generated": -62.831153869628906, + "epoch": 0.541153596889177, + "grad_norm": 489.8524115970577, + "learning_rate": 7.843639463177815e-07, + "logits/chosen": -2.463425636291504, + "logits/rejected": -2.5472099781036377, + "logps/chosen": -23.383384704589844, + "logps/rejected": -160.83154296875, + "loss": 22.1327, + "losses_ref": -0.000257982435869053, + "ref_logps/chosen": -104.5407485961914, + "ref_logps/rejected": -98.00038146972656, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.15735626220703, + "rewards/margins": 143.98849487304688, + "rewards/rejected": -62.831153869628906, + "step": 1670, + "u": -2.883389949798584, + "weight": 0.037502724677324295 + }, + { + "diff_generated": -62.441497802734375, + "epoch": 0.5443940375891121, + "grad_norm": 509.1945555911371, + "learning_rate": 7.839436511497288e-07, + "logits/chosen": -2.4598562717437744, + "logits/rejected": -2.5456697940826416, + "logps/chosen": -23.473520278930664, + "logps/rejected": -160.74217224121094, + "loss": 22.2912, + "losses_ref": -0.011212515644729137, + "ref_logps/chosen": -105.08975982666016, + "ref_logps/rejected": -98.3006591796875, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.61624908447266, + "rewards/margins": 144.0577392578125, + "rewards/rejected": -62.441497802734375, + "step": 1680, + "u": -2.8271243572235107, + "weight": 0.05635226517915726 + }, + { + "diff_generated": -64.6219711303711, + "epoch": 0.5476344782890473, + "grad_norm": 523.6951385500053, + "learning_rate": 7.835178975459744e-07, + "logits/chosen": -2.418170928955078, + "logits/rejected": -2.54695463180542, + "logps/chosen": -22.01700782775879, + "logps/rejected": -156.66183471679688, + "loss": 22.9691, + "losses_ref": -3.7558904296020046e-05, + "ref_logps/chosen": -100.3355941772461, + "ref_logps/rejected": -92.03987121582031, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.31858825683594, + "rewards/margins": 142.94058227539062, + "rewards/rejected": -64.6219711303711, + "step": 1690, + "u": -2.864668369293213, + "weight": 0.04375080019235611 + }, + { + "diff_generated": -66.69950103759766, + "epoch": 0.5508749189889826, + "grad_norm": 519.6350502512817, + "learning_rate": 7.83086691559356e-07, + "logits/chosen": -2.4784340858459473, + "logits/rejected": -2.543238639831543, + "logps/chosen": -21.36613655090332, + "logps/rejected": -163.73825073242188, + "loss": 23.1716, + "losses_ref": -6.978793680900708e-05, + "ref_logps/chosen": -106.7024154663086, + "ref_logps/rejected": -97.03874206542969, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 85.33628845214844, + "rewards/margins": 152.03579711914062, + "rewards/rejected": -66.69950103759766, + "step": 1700, + "u": -2.8459436893463135, + "weight": 0.050002217292785645 + }, + { + "diff_generated": -62.40593338012695, + "epoch": 0.5541153596889177, + "grad_norm": 485.5030879907328, + "learning_rate": 7.826500393202268e-07, + "logits/chosen": -2.4303226470947266, + "logits/rejected": -2.4927942752838135, + "logps/chosen": -24.63642692565918, + "logps/rejected": -156.96841430664062, + "loss": 23.4314, + "losses_ref": -3.987304921793111e-07, + "ref_logps/chosen": -107.1015625, + "ref_logps/rejected": -94.56246948242188, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.46513366699219, + "rewards/margins": 144.87107849121094, + "rewards/rejected": -62.40593338012695, + "step": 1710, + "u": -2.86466908454895, + "weight": 0.04375000670552254 + }, + { + "diff_generated": -63.31097412109375, + "epoch": 0.5573558003888529, + "grad_norm": 545.1671033122269, + "learning_rate": 7.82207947036368e-07, + "logits/chosen": -2.396975040435791, + "logits/rejected": -2.517516613006592, + "logps/chosen": -21.996206283569336, + "logps/rejected": -156.7769012451172, + "loss": 22.7569, + "losses_ref": -3.080959868384525e-05, + "ref_logps/chosen": -101.49336242675781, + "ref_logps/rejected": -93.46592712402344, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.49714660644531, + "rewards/margins": 142.80812072753906, + "rewards/rejected": -63.31097412109375, + "step": 1720, + "u": -2.827221632003784, + "weight": 0.056250907480716705 + }, + { + "diff_generated": -61.9562873840332, + "epoch": 0.560596241088788, + "grad_norm": 501.8892398648424, + "learning_rate": 7.817604209929007e-07, + "logits/chosen": -2.4421796798706055, + "logits/rejected": -2.4990382194519043, + "logps/chosen": -25.083301544189453, + "logps/rejected": -155.49093627929688, + "loss": 23.5091, + "losses_ref": -0.0001709732023300603, + "ref_logps/chosen": -107.28742980957031, + "ref_logps/rejected": -93.53466796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 82.20413208007812, + "rewards/margins": 144.16041564941406, + "rewards/rejected": -61.9562873840332, + "step": 1730, + "u": -2.8084943294525146, + "weight": 0.06250493228435516 + }, + { + "diff_generated": -66.76475524902344, + "epoch": 0.5638366817887233, + "grad_norm": 478.4304565461847, + "learning_rate": 7.813074675521962e-07, + "logits/chosen": -2.5093724727630615, + "logits/rejected": -2.563171625137329, + "logps/chosen": -24.625473022460938, + "logps/rejected": -160.79348754882812, + "loss": 23.7473, + "losses_ref": -0.00032444013049826026, + "ref_logps/chosen": -109.4178466796875, + "ref_logps/rejected": -94.02871704101562, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 84.7923812866211, + "rewards/margins": 151.55715942382812, + "rewards/rejected": -66.76475524902344, + "step": 1740, + "u": -2.920833110809326, + "weight": 0.02500629983842373 + }, + { + "diff_generated": -63.10648727416992, + "epoch": 0.5670771224886585, + "grad_norm": 490.42313347010605, + "learning_rate": 7.80849093153786e-07, + "logits/chosen": -2.4644055366516113, + "logits/rejected": -2.574089765548706, + "logps/chosen": -21.094036102294922, + "logps/rejected": -158.0782928466797, + "loss": 22.1476, + "losses_ref": -0.0006667963461950421, + "ref_logps/chosen": -102.20223236083984, + "ref_logps/rejected": -94.97178649902344, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.10820007324219, + "rewards/margins": 144.21469116210938, + "rewards/rejected": -63.10648727416992, + "step": 1750, + "u": -2.8084781169891357, + "weight": 0.0625220313668251 + }, + { + "diff_generated": -64.43269348144531, + "epoch": 0.5703175631885936, + "grad_norm": 469.7503258339084, + "learning_rate": 7.803853043142702e-07, + "logits/chosen": -2.4399752616882324, + "logits/rejected": -2.5511956214904785, + "logps/chosen": -24.27750015258789, + "logps/rejected": -158.4519500732422, + "loss": 22.0519, + "losses_ref": -5.32972126165987e-06, + "ref_logps/chosen": -104.4724349975586, + "ref_logps/rejected": -94.0192642211914, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.19493103027344, + "rewards/margins": 144.62762451171875, + "rewards/rejected": -64.43269348144531, + "step": 1760, + "u": -2.8272221088409424, + "weight": 0.05625014379620552 + }, + { + "diff_generated": -62.670074462890625, + "epoch": 0.5735580038885288, + "grad_norm": 489.2762977810382, + "learning_rate": 7.799161076272245e-07, + "logits/chosen": -2.4400739669799805, + "logits/rejected": -2.551038980484009, + "logps/chosen": -22.8772029876709, + "logps/rejected": -153.8516082763672, + "loss": 22.5905, + "losses_ref": -0.001177564263343811, + "ref_logps/chosen": -102.48980712890625, + "ref_logps/rejected": -91.18153381347656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.61260223388672, + "rewards/margins": 142.28268432617188, + "rewards/rejected": -62.670074462890625, + "step": 1770, + "u": -2.84592604637146, + "weight": 0.0500204935669899 + }, + { + "diff_generated": -63.65525436401367, + "epoch": 0.5767984445884641, + "grad_norm": 567.6852836934356, + "learning_rate": 7.794415097631066e-07, + "logits/chosen": -2.465444803237915, + "logits/rejected": -2.5374231338500977, + "logps/chosen": -22.26080894470215, + "logps/rejected": -155.75405883789062, + "loss": 23.2949, + "losses_ref": -0.0010055515449494123, + "ref_logps/chosen": -103.94819641113281, + "ref_logps/rejected": -92.09880065917969, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 81.68737030029297, + "rewards/margins": 145.34262084960938, + "rewards/rejected": -63.65525436401367, + "step": 1780, + "u": -2.920820951461792, + "weight": 0.02501886710524559 + }, + { + "diff_generated": -65.3624267578125, + "epoch": 0.5800388852883992, + "grad_norm": 513.5847187253291, + "learning_rate": 7.789615174691619e-07, + "logits/chosen": -2.4127378463745117, + "logits/rejected": -2.553704023361206, + "logps/chosen": -24.556291580200195, + "logps/rejected": -162.8341827392578, + "loss": 23.1237, + "losses_ref": -6.812562787672505e-06, + "ref_logps/chosen": -104.76051330566406, + "ref_logps/rejected": -97.47174072265625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.20420837402344, + "rewards/margins": 145.56663513183594, + "rewards/rejected": -65.3624267578125, + "step": 1790, + "u": -2.8459458351135254, + "weight": 0.05000011995434761 + }, + { + "diff_generated": -62.64613723754883, + "epoch": 0.5832793259883344, + "grad_norm": 522.2699412816897, + "learning_rate": 7.784761375693268e-07, + "logits/chosen": -2.3942253589630127, + "logits/rejected": -2.475468158721924, + "logps/chosen": -23.327556610107422, + "logps/rejected": -157.86233520507812, + "loss": 23.1586, + "losses_ref": -0.0005751922726631165, + "ref_logps/chosen": -101.79295349121094, + "ref_logps/rejected": -95.2161865234375, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.46539306640625, + "rewards/margins": 141.1115264892578, + "rewards/rejected": -62.64613723754883, + "step": 1800, + "u": -2.8272063732147217, + "weight": 0.056266941130161285 + }, + { + "diff_generated": -62.276100158691406, + "epoch": 0.5865197666882696, + "grad_norm": 511.5652585880466, + "learning_rate": 7.779853769641319e-07, + "logits/chosen": -2.4133849143981934, + "logits/rejected": -2.516460418701172, + "logps/chosen": -24.87993621826172, + "logps/rejected": -154.90475463867188, + "loss": 22.386, + "losses_ref": -0.0681997612118721, + "ref_logps/chosen": -105.0778579711914, + "ref_logps/rejected": -92.6286392211914, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.19792175292969, + "rewards/margins": 142.47402954101562, + "rewards/rejected": -62.276100158691406, + "step": 1810, + "u": -2.8637335300445557, + "weight": 0.044672973453998566 + }, + { + "diff_generated": -62.598541259765625, + "epoch": 0.5897602073882048, + "grad_norm": 514.7686797519576, + "learning_rate": 7.774892426306042e-07, + "logits/chosen": -2.442140579223633, + "logits/rejected": -2.5441012382507324, + "logps/chosen": -21.814807891845703, + "logps/rejected": -156.7989501953125, + "loss": 22.4342, + "losses_ref": -0.028686290606856346, + "ref_logps/chosen": -100.86701965332031, + "ref_logps/rejected": -94.20040130615234, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 79.05220794677734, + "rewards/margins": 141.6507568359375, + "rewards/rejected": -62.598541259765625, + "step": 1820, + "u": -2.770651340484619, + "weight": 0.07541613280773163 + }, + { + "diff_generated": -62.50495147705078, + "epoch": 0.59300064808814, + "grad_norm": 462.48253575696555, + "learning_rate": 7.769877416221678e-07, + "logits/chosen": -2.435598850250244, + "logits/rejected": -2.4805240631103516, + "logps/chosen": -25.63004493713379, + "logps/rejected": -154.00819396972656, + "loss": 24.0052, + "losses_ref": -6.09873459325172e-05, + "ref_logps/chosen": -106.66398620605469, + "ref_logps/rejected": -91.50323486328125, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.033935546875, + "rewards/margins": 143.5388946533203, + "rewards/rejected": -62.50495147705078, + "step": 1830, + "u": -2.8459439277648926, + "weight": 0.05000190809369087 + }, + { + "diff_generated": -61.7323112487793, + "epoch": 0.5962410887880751, + "grad_norm": 482.46333258166396, + "learning_rate": 7.764808810685433e-07, + "logits/chosen": -2.4022223949432373, + "logits/rejected": -2.508943557739258, + "logps/chosen": -19.269611358642578, + "logps/rejected": -148.99969482421875, + "loss": 22.679, + "losses_ref": -0.12175735086202621, + "ref_logps/chosen": -97.94807434082031, + "ref_logps/rejected": -87.26737976074219, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 78.67845153808594, + "rewards/margins": 140.4107666015625, + "rewards/rejected": -61.7323112487793, + "step": 1840, + "u": -2.76997709274292, + "weight": 0.07604043185710907 + }, + { + "diff_generated": -65.25732421875, + "epoch": 0.5994815294880104, + "grad_norm": 493.6422581588208, + "learning_rate": 7.759686681756468e-07, + "logits/chosen": -2.4459421634674072, + "logits/rejected": -2.5107102394104004, + "logps/chosen": -22.35720443725586, + "logps/rejected": -157.98370361328125, + "loss": 22.4855, + "losses_ref": -0.07618378102779388, + "ref_logps/chosen": -103.86003112792969, + "ref_logps/rejected": -92.72637939453125, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.50282287597656, + "rewards/margins": 146.76016235351562, + "rewards/rejected": -65.25732421875, + "step": 1850, + "u": -2.8438050746917725, + "weight": 0.052067242562770844 + }, + { + "diff_generated": -67.14825439453125, + "epoch": 0.6027219701879456, + "grad_norm": 454.52641698007017, + "learning_rate": 7.754511102254876e-07, + "logits/chosen": -2.4075095653533936, + "logits/rejected": -2.548067569732666, + "logps/chosen": -21.86354637145996, + "logps/rejected": -158.21429443359375, + "loss": 23.3475, + "losses_ref": -9.742849215399474e-06, + "ref_logps/chosen": -99.64906311035156, + "ref_logps/rejected": -91.06602478027344, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.78551483154297, + "rewards/margins": 144.9337615966797, + "rewards/rejected": -67.14825439453125, + "step": 1860, + "u": -2.864668846130371, + "weight": 0.04375017434358597 + }, + { + "diff_generated": -62.16118621826172, + "epoch": 0.6059624108878807, + "grad_norm": 487.2540164995362, + "learning_rate": 7.74928214576064e-07, + "logits/chosen": -2.455625057220459, + "logits/rejected": -2.4872801303863525, + "logps/chosen": -23.484718322753906, + "logps/rejected": -158.16915893554688, + "loss": 23.0171, + "losses_ref": -0.0007770330994389951, + "ref_logps/chosen": -107.91748046875, + "ref_logps/rejected": -96.00798034667969, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.4327621459961, + "rewards/margins": 146.59396362304688, + "rewards/rejected": -62.16118621826172, + "step": 1870, + "u": -2.8833842277526855, + "weight": 0.037508707493543625 + }, + { + "diff_generated": -66.49128723144531, + "epoch": 0.609202851587816, + "grad_norm": 481.5167142557763, + "learning_rate": 7.743999886612591e-07, + "logits/chosen": -2.427978992462158, + "logits/rejected": -2.5397520065307617, + "logps/chosen": -22.91992950439453, + "logps/rejected": -166.8085174560547, + "loss": 21.9439, + "losses_ref": -6.627455877605826e-05, + "ref_logps/chosen": -106.3371353149414, + "ref_logps/rejected": -100.3172378540039, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 83.41720581054688, + "rewards/margins": 149.90847778320312, + "rewards/rejected": -66.49128723144531, + "step": 1880, + "u": -2.958284854888916, + "weight": 0.012500783428549767 + }, + { + "diff_generated": -67.05741882324219, + "epoch": 0.6124432922877512, + "grad_norm": 496.53902730741385, + "learning_rate": 7.738664399907355e-07, + "logits/chosen": -2.4462687969207764, + "logits/rejected": -2.5562310218811035, + "logps/chosen": -22.51824951171875, + "logps/rejected": -161.39178466796875, + "loss": 21.531, + "losses_ref": -0.02634531632065773, + "ref_logps/chosen": -104.7353286743164, + "ref_logps/rejected": -94.33436584472656, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 82.21707916259766, + "rewards/margins": 149.2744903564453, + "rewards/rejected": -67.05741882324219, + "step": 1890, + "u": -2.938530445098877, + "weight": 0.01975167542695999 + }, + { + "diff_generated": -63.624107360839844, + "epoch": 0.6156837329876863, + "grad_norm": 484.850293764759, + "learning_rate": 7.733275761498278e-07, + "logits/chosen": -2.455961227416992, + "logits/rejected": -2.5032289028167725, + "logps/chosen": -24.390783309936523, + "logps/rejected": -157.72637939453125, + "loss": 22.9816, + "losses_ref": -8.3882812759839e-05, + "ref_logps/chosen": -104.32967376708984, + "ref_logps/rejected": -94.10226440429688, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.93888854980469, + "rewards/margins": 143.56300354003906, + "rewards/rejected": -63.624107360839844, + "step": 1900, + "u": -2.8272206783294678, + "weight": 0.056251801550388336 + }, + { + "diff_generated": -63.5076904296875, + "epoch": 0.6189241736876215, + "grad_norm": 517.0610809402186, + "learning_rate": 7.727834047994353e-07, + "logits/chosen": -2.435137987136841, + "logits/rejected": -2.527508497238159, + "logps/chosen": -26.37552833557129, + "logps/rejected": -158.0044708251953, + "loss": 22.5725, + "losses_ref": -0.0003489324008114636, + "ref_logps/chosen": -107.90934753417969, + "ref_logps/rejected": -94.49675750732422, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.53382110595703, + "rewards/margins": 145.04153442382812, + "rewards/rejected": -63.5076904296875, + "step": 1910, + "u": -2.8459413051605225, + "weight": 0.050004612654447556 + }, + { + "diff_generated": -63.08461380004883, + "epoch": 0.6221646143875567, + "grad_norm": 500.0113269557425, + "learning_rate": 7.722339336759129e-07, + "logits/chosen": -2.374927520751953, + "logits/rejected": -2.5143895149230957, + "logps/chosen": -23.31403923034668, + "logps/rejected": -157.79208374023438, + "loss": 23.0743, + "losses_ref": -0.000542301801033318, + "ref_logps/chosen": -100.7695541381836, + "ref_logps/rejected": -94.70748138427734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 77.45551300048828, + "rewards/margins": 140.54013061523438, + "rewards/rejected": -63.08461380004883, + "step": 1920, + "u": -2.808495283126831, + "weight": 0.06250405311584473 + }, + { + "diff_generated": -60.091407775878906, + "epoch": 0.6254050550874919, + "grad_norm": 545.7189624860678, + "learning_rate": 7.71679170590961e-07, + "logits/chosen": -2.480167865753174, + "logits/rejected": -2.5257725715637207, + "logps/chosen": -24.214298248291016, + "logps/rejected": -151.6998291015625, + "loss": 22.0246, + "losses_ref": -0.0001407624949933961, + "ref_logps/chosen": -105.802734375, + "ref_logps/rejected": -91.60841369628906, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.58843994140625, + "rewards/margins": 141.67984008789062, + "rewards/rejected": -60.091407775878906, + "step": 1930, + "u": -2.8459417819976807, + "weight": 0.05000431090593338 + }, + { + "diff_generated": -66.3674545288086, + "epoch": 0.6286454957874271, + "grad_norm": 435.99074017457156, + "learning_rate": 7.711191234315146e-07, + "logits/chosen": -2.439845561981201, + "logits/rejected": -2.5313148498535156, + "logps/chosen": -23.811113357543945, + "logps/rejected": -166.7593536376953, + "loss": 22.3956, + "losses_ref": -2.1102810933371074e-05, + "ref_logps/chosen": -107.8023910522461, + "ref_logps/rejected": -100.39189147949219, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 83.99127960205078, + "rewards/margins": 150.35873413085938, + "rewards/rejected": -66.3674545288086, + "step": 1940, + "u": -2.9208381175994873, + "weight": 0.025000810623168945 + }, + { + "diff_generated": -64.84213256835938, + "epoch": 0.6318859364873622, + "grad_norm": 475.5077212160026, + "learning_rate": 7.705538001596312e-07, + "logits/chosen": -2.442000389099121, + "logits/rejected": -2.5633978843688965, + "logps/chosen": -20.576784133911133, + "logps/rejected": -161.9834442138672, + "loss": 22.8372, + "losses_ref": -2.7589265982896904e-07, + "ref_logps/chosen": -101.49675750732422, + "ref_logps/rejected": -97.14131164550781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.91997528076172, + "rewards/margins": 145.76210021972656, + "rewards/rejected": -64.84213256835938, + "step": 1950, + "u": -2.8459455966949463, + "weight": 0.05000000447034836 + }, + { + "diff_generated": -63.08556365966797, + "epoch": 0.6351263771872975, + "grad_norm": 496.28924401889697, + "learning_rate": 7.699832088123774e-07, + "logits/chosen": -2.468907117843628, + "logits/rejected": -2.493875503540039, + "logps/chosen": -24.173206329345703, + "logps/rejected": -156.0330810546875, + "loss": 23.0837, + "losses_ref": -0.00012453203089535236, + "ref_logps/chosen": -108.07179260253906, + "ref_logps/rejected": -92.94752502441406, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 83.8985824584961, + "rewards/margins": 146.98414611816406, + "rewards/rejected": -63.08556365966797, + "step": 1960, + "u": -2.8646676540374756, + "weight": 0.04375167936086655 + }, + { + "diff_generated": -63.73643112182617, + "epoch": 0.6383668178872327, + "grad_norm": 517.9038733979648, + "learning_rate": 7.694073575017151e-07, + "logits/chosen": -2.3692421913146973, + "logits/rejected": -2.484090566635132, + "logps/chosen": -20.927684783935547, + "logps/rejected": -153.87741088867188, + "loss": 22.1402, + "losses_ref": -0.00039308052510023117, + "ref_logps/chosen": -99.08850860595703, + "ref_logps/rejected": -90.14097595214844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.16082000732422, + "rewards/margins": 141.89724731445312, + "rewards/rejected": -63.73643112182617, + "step": 1970, + "u": -2.808486223220825, + "weight": 0.06251360476016998 + }, + { + "diff_generated": -61.4334716796875, + "epoch": 0.6416072585871678, + "grad_norm": 517.1665763284212, + "learning_rate": 7.688262544143854e-07, + "logits/chosen": -2.450671434402466, + "logits/rejected": -2.5171408653259277, + "logps/chosen": -22.22519302368164, + "logps/rejected": -153.62709045410156, + "loss": 22.4668, + "losses_ref": -0.0037123418878763914, + "ref_logps/chosen": -102.99430084228516, + "ref_logps/rejected": -92.19358825683594, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.76910400390625, + "rewards/margins": 142.2025909423828, + "rewards/rejected": -61.4334716796875, + "step": 1980, + "u": -2.7709667682647705, + "weight": 0.07509000599384308 + }, + { + "diff_generated": -61.0735969543457, + "epoch": 0.6448476992871031, + "grad_norm": 467.9000491482675, + "learning_rate": 7.682399078117928e-07, + "logits/chosen": -2.4567606449127197, + "logits/rejected": -2.4964141845703125, + "logps/chosen": -21.273324966430664, + "logps/rejected": -155.58253479003906, + "loss": 22.9956, + "losses_ref": -0.0011844413820654154, + "ref_logps/chosen": -106.13948059082031, + "ref_logps/rejected": -94.5089340209961, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 84.86614990234375, + "rewards/margins": 145.93975830078125, + "rewards/rejected": -61.0735969543457, + "step": 1990, + "u": -2.789741039276123, + "weight": 0.06878674030303955 + }, + { + "diff_generated": -60.8897705078125, + "epoch": 0.6480881399870383, + "grad_norm": 490.0431658581082, + "learning_rate": 7.67648326029888e-07, + "logits/chosen": -2.458367109298706, + "logits/rejected": -2.5214147567749023, + "logps/chosen": -24.48371124267578, + "logps/rejected": -155.35421752929688, + "loss": 23.3033, + "losses_ref": -0.13151639699935913, + "ref_logps/chosen": -106.46592712402344, + "ref_logps/rejected": -94.46443939208984, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.98223114013672, + "rewards/margins": 142.87197875976562, + "rewards/rejected": -60.8897705078125, + "step": 2000, + "u": -2.806457042694092, + "weight": 0.06433537602424622 + }, + { + "diff_generated": -61.09978103637695, + "epoch": 0.6513285806869734, + "grad_norm": 502.60124902624244, + "learning_rate": 7.670515174790485e-07, + "logits/chosen": -2.4394757747650146, + "logits/rejected": -2.4780218601226807, + "logps/chosen": -23.52560806274414, + "logps/rejected": -153.04684448242188, + "loss": 23.0476, + "losses_ref": -4.396005533635616e-05, + "ref_logps/chosen": -104.49568176269531, + "ref_logps/rejected": -91.94706726074219, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.97007751464844, + "rewards/margins": 142.06985473632812, + "rewards/rejected": -61.09978103637695, + "step": 2010, + "u": -2.8646678924560547, + "weight": 0.04375128075480461 + }, + { + "diff_generated": -65.47056579589844, + "epoch": 0.6545690213869086, + "grad_norm": 495.671101844857, + "learning_rate": 7.664494906439598e-07, + "logits/chosen": -2.4299168586730957, + "logits/rejected": -2.4831480979919434, + "logps/chosen": -21.032333374023438, + "logps/rejected": -160.3956298828125, + "loss": 21.4842, + "losses_ref": -4.356700628704857e-06, + "ref_logps/chosen": -105.74322509765625, + "ref_logps/rejected": -94.92507934570312, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 84.71089172363281, + "rewards/margins": 150.1814422607422, + "rewards/rejected": -65.47056579589844, + "step": 2020, + "u": -2.864668846130371, + "weight": 0.04375012591481209 + }, + { + "diff_generated": -65.01509094238281, + "epoch": 0.6578094620868438, + "grad_norm": 445.63138523279576, + "learning_rate": 7.658422540834943e-07, + "logits/chosen": -2.456162929534912, + "logits/rejected": -2.5066542625427246, + "logps/chosen": -25.305461883544922, + "logps/rejected": -163.2794189453125, + "loss": 23.6974, + "losses_ref": -9.959562703443225e-06, + "ref_logps/chosen": -109.1541519165039, + "ref_logps/rejected": -98.26432800292969, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 83.84867858886719, + "rewards/margins": 148.86378479003906, + "rewards/rejected": -65.01509094238281, + "step": 2030, + "u": -2.9208388328552246, + "weight": 0.025000208988785744 + }, + { + "diff_generated": -63.924659729003906, + "epoch": 0.661049902786779, + "grad_norm": 468.64499107985927, + "learning_rate": 7.6522981643059e-07, + "logits/chosen": -2.41487455368042, + "logits/rejected": -2.517202377319336, + "logps/chosen": -24.286367416381836, + "logps/rejected": -157.21624755859375, + "loss": 21.7411, + "losses_ref": -0.002179847564548254, + "ref_logps/chosen": -106.35169982910156, + "ref_logps/rejected": -93.29158782958984, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.06531524658203, + "rewards/margins": 145.98995971679688, + "rewards/rejected": -63.924659729003906, + "step": 2040, + "u": -2.883330821990967, + "weight": 0.03756442666053772 + }, + { + "diff_generated": -66.58832550048828, + "epoch": 0.6642903434867142, + "grad_norm": 505.77673209327895, + "learning_rate": 7.646121863921278e-07, + "logits/chosen": -2.4321842193603516, + "logits/rejected": -2.4582810401916504, + "logps/chosen": -24.985980987548828, + "logps/rejected": -161.1887664794922, + "loss": 22.7123, + "losses_ref": -6.723775186401326e-06, + "ref_logps/chosen": -112.06063079833984, + "ref_logps/rejected": -94.60044860839844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 87.07464599609375, + "rewards/margins": 153.66297912597656, + "rewards/rejected": -66.58832550048828, + "step": 2050, + "u": -2.9021155834198, + "weight": 0.03125009313225746 + }, + { + "diff_generated": -66.82035827636719, + "epoch": 0.6675307841866494, + "grad_norm": 449.26925201751106, + "learning_rate": 7.639893727488069e-07, + "logits/chosen": -2.4013147354125977, + "logits/rejected": -2.5611088275909424, + "logps/chosen": -21.46106719970703, + "logps/rejected": -164.8437957763672, + "loss": 21.7194, + "losses_ref": -5.366952973417938e-05, + "ref_logps/chosen": -101.9166488647461, + "ref_logps/rejected": -98.0234375, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.45558166503906, + "rewards/margins": 147.27593994140625, + "rewards/rejected": -66.82035827636719, + "step": 2060, + "u": -2.8833909034729004, + "weight": 0.03750170022249222 + }, + { + "diff_generated": -61.822166442871094, + "epoch": 0.6707712248865846, + "grad_norm": 515.5706810364948, + "learning_rate": 7.633613843550212e-07, + "logits/chosen": -2.4471516609191895, + "logits/rejected": -2.4825403690338135, + "logps/chosen": -23.67911720275879, + "logps/rejected": -153.21046447753906, + "loss": 23.3547, + "losses_ref": -0.0002042811829596758, + "ref_logps/chosen": -107.107421875, + "ref_logps/rejected": -91.38829040527344, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 83.42830657958984, + "rewards/margins": 145.25047302246094, + "rewards/rejected": -61.822166442871094, + "step": 2070, + "u": -2.8272159099578857, + "weight": 0.056256841868162155 + }, + { + "diff_generated": -63.19563674926758, + "epoch": 0.6740116655865198, + "grad_norm": 518.2182827065114, + "learning_rate": 7.627282301387325e-07, + "logits/chosen": -2.343768358230591, + "logits/rejected": -2.4819016456604004, + "logps/chosen": -21.6712703704834, + "logps/rejected": -157.62033081054688, + "loss": 22.1577, + "losses_ref": -6.540703179780394e-05, + "ref_logps/chosen": -98.38591766357422, + "ref_logps/rejected": -94.42469787597656, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.71464538574219, + "rewards/margins": 139.91029357910156, + "rewards/rejected": -63.19563674926758, + "step": 2080, + "u": -2.789773941040039, + "weight": 0.0687519758939743 + }, + { + "diff_generated": -63.01131057739258, + "epoch": 0.6772521062864549, + "grad_norm": 453.6543222237952, + "learning_rate": 7.620899191013438e-07, + "logits/chosen": -2.380980968475342, + "logits/rejected": -2.469796657562256, + "logps/chosen": -24.606945037841797, + "logps/rejected": -157.47056579589844, + "loss": 22.8886, + "losses_ref": -0.0032991624902933836, + "ref_logps/chosen": -103.7176284790039, + "ref_logps/rejected": -94.45926666259766, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.11067962646484, + "rewards/margins": 142.1219940185547, + "rewards/rejected": -63.01131057739258, + "step": 2090, + "u": -2.827118396759033, + "weight": 0.05635886266827583 + }, + { + "diff_generated": -63.28471755981445, + "epoch": 0.6804925469863902, + "grad_norm": 486.84061677943566, + "learning_rate": 7.614464603175717e-07, + "logits/chosen": -2.4669952392578125, + "logits/rejected": -2.4586520195007324, + "logps/chosen": -23.30873680114746, + "logps/rejected": -155.25479125976562, + "loss": 21.5548, + "losses_ref": -3.5264114558231086e-05, + "ref_logps/chosen": -109.42105865478516, + "ref_logps/rejected": -91.97007751464844, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 86.11231231689453, + "rewards/margins": 149.39703369140625, + "rewards/rejected": -63.28471755981445, + "step": 2100, + "u": -2.8272218704223633, + "weight": 0.056250572204589844 + }, + { + "diff_generated": -63.9326057434082, + "epoch": 0.6837329876863253, + "grad_norm": 608.3829410411037, + "learning_rate": 7.607978629353167e-07, + "logits/chosen": -2.4042506217956543, + "logits/rejected": -2.4919590950012207, + "logps/chosen": -23.0515193939209, + "logps/rejected": -159.70335388183594, + "loss": 22.2682, + "losses_ref": -1.2647595212911256e-05, + "ref_logps/chosen": -102.62166595458984, + "ref_logps/rejected": -95.770751953125, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.57013702392578, + "rewards/margins": 143.5027313232422, + "rewards/rejected": -63.9326057434082, + "step": 2110, + "u": -2.8459455966949463, + "weight": 0.050000179558992386 + }, + { + "diff_generated": -66.02317810058594, + "epoch": 0.6869734283862605, + "grad_norm": 575.9626101415919, + "learning_rate": 7.60144136175534e-07, + "logits/chosen": -2.4289004802703857, + "logits/rejected": -2.5169577598571777, + "logps/chosen": -19.686887741088867, + "logps/rejected": -158.76817321777344, + "loss": 21.8352, + "losses_ref": -5.7465281315671746e-06, + "ref_logps/chosen": -103.3939208984375, + "ref_logps/rejected": -92.7449951171875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 83.70702362060547, + "rewards/margins": 149.73019409179688, + "rewards/rejected": -66.02317810058594, + "step": 2120, + "u": -2.8084988594055176, + "weight": 0.06250015646219254 + }, + { + "diff_generated": -62.25181198120117, + "epoch": 0.6902138690861958, + "grad_norm": 539.948074502807, + "learning_rate": 7.594852893321015e-07, + "logits/chosen": -2.391740083694458, + "logits/rejected": -2.504075288772583, + "logps/chosen": -21.70474624633789, + "logps/rejected": -154.61550903320312, + "loss": 22.0856, + "losses_ref": -1.4690643183712382e-05, + "ref_logps/chosen": -100.77751159667969, + "ref_logps/rejected": -92.36370086669922, + "rewards/accuracies": 0.9375, + "rewards/chosen": 79.07275390625, + "rewards/margins": 141.32456970214844, + "rewards/rejected": -62.25181198120117, + "step": 2130, + "u": -2.8084988594055176, + "weight": 0.06250045448541641 + }, + { + "diff_generated": -60.95367431640625, + "epoch": 0.6934543097861309, + "grad_norm": 455.15519241736564, + "learning_rate": 7.588213317716883e-07, + "logits/chosen": -2.3332302570343018, + "logits/rejected": -2.4906795024871826, + "logps/chosen": -20.162681579589844, + "logps/rejected": -153.51536560058594, + "loss": 22.3847, + "losses_ref": -3.23783970088698e-05, + "ref_logps/chosen": -98.51542663574219, + "ref_logps/rejected": -92.56169128417969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.35274505615234, + "rewards/margins": 139.30642700195312, + "rewards/rejected": -60.95367431640625, + "step": 2140, + "u": -2.8084988594055176, + "weight": 0.06250031292438507 + }, + { + "diff_generated": -62.808319091796875, + "epoch": 0.6966947504860661, + "grad_norm": 476.1131594433043, + "learning_rate": 7.581522729336214e-07, + "logits/chosen": -2.358593702316284, + "logits/rejected": -2.4386630058288574, + "logps/chosen": -21.54372787475586, + "logps/rejected": -151.33340454101562, + "loss": 21.9699, + "losses_ref": -0.0032075338531285524, + "ref_logps/chosen": -104.14180755615234, + "ref_logps/rejected": -88.52510070800781, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 82.59809112548828, + "rewards/margins": 145.40640258789062, + "rewards/rejected": -62.808319091796875, + "step": 2150, + "u": -2.78973126411438, + "weight": 0.06879661977291107 + }, + { + "diff_generated": -64.31022644042969, + "epoch": 0.6999351911860013, + "grad_norm": 461.8653155813388, + "learning_rate": 7.574781223297513e-07, + "logits/chosen": -2.4114131927490234, + "logits/rejected": -2.4808099269866943, + "logps/chosen": -23.671085357666016, + "logps/rejected": -158.84121704101562, + "loss": 21.2518, + "losses_ref": -7.819013262633234e-05, + "ref_logps/chosen": -107.03285217285156, + "ref_logps/rejected": -94.53096008300781, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 83.36175537109375, + "rewards/margins": 147.6719970703125, + "rewards/rejected": -64.31022644042969, + "step": 2160, + "u": -2.8272206783294678, + "weight": 0.05625178664922714 + }, + { + "diff_generated": -61.34014892578125, + "epoch": 0.7031756318859365, + "grad_norm": 458.9677086656394, + "learning_rate": 7.567988895443173e-07, + "logits/chosen": -2.403904914855957, + "logits/rejected": -2.456617832183838, + "logps/chosen": -20.48429298400879, + "logps/rejected": -154.02468872070312, + "loss": 22.2045, + "losses_ref": -7.258141704369336e-05, + "ref_logps/chosen": -102.9297866821289, + "ref_logps/rejected": -92.68452453613281, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 82.44548034667969, + "rewards/margins": 143.78562927246094, + "rewards/rejected": -61.34014892578125, + "step": 2170, + "u": -2.7710511684417725, + "weight": 0.07500146329402924 + }, + { + "diff_generated": -63.43732452392578, + "epoch": 0.7064160725858717, + "grad_norm": 461.1658348906773, + "learning_rate": 7.561145842338102e-07, + "logits/chosen": -2.4059627056121826, + "logits/rejected": -2.483588933944702, + "logps/chosen": -22.730510711669922, + "logps/rejected": -157.40292358398438, + "loss": 21.8283, + "losses_ref": -0.002027056645601988, + "ref_logps/chosen": -103.70097351074219, + "ref_logps/rejected": -93.9655990600586, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.97046661376953, + "rewards/margins": 144.4077911376953, + "rewards/rejected": -63.43732452392578, + "step": 2180, + "u": -2.8646457195281982, + "weight": 0.04377438873052597 + }, + { + "diff_generated": -64.67951202392578, + "epoch": 0.7096565132858069, + "grad_norm": 516.5860749311239, + "learning_rate": 7.554252161268365e-07, + "logits/chosen": -2.3907630443573, + "logits/rejected": -2.484009265899658, + "logps/chosen": -23.040851593017578, + "logps/rejected": -158.6120147705078, + "loss": 21.6585, + "losses_ref": -2.7939324354520068e-05, + "ref_logps/chosen": -103.9288101196289, + "ref_logps/rejected": -93.93251037597656, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.88795471191406, + "rewards/margins": 145.5674591064453, + "rewards/rejected": -64.67951202392578, + "step": 2190, + "u": -2.827221632003784, + "weight": 0.05625084042549133 + }, + { + "diff_generated": -63.912109375, + "epoch": 0.712896953985742, + "grad_norm": 490.5482765339361, + "learning_rate": 7.547307950239785e-07, + "logits/chosen": -2.4400482177734375, + "logits/rejected": -2.5055909156799316, + "logps/chosen": -22.498706817626953, + "logps/rejected": -156.80374145507812, + "loss": 22.3672, + "losses_ref": -0.0010926555842161179, + "ref_logps/chosen": -108.22175598144531, + "ref_logps/rejected": -92.89161682128906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.72303771972656, + "rewards/margins": 149.63516235351562, + "rewards/rejected": -63.912109375, + "step": 2200, + "u": -2.8646504878997803, + "weight": 0.04376964271068573 + }, + { + "diff_generated": -60.452056884765625, + "epoch": 0.7161373946856773, + "grad_norm": 506.02258490932047, + "learning_rate": 7.540313307976563e-07, + "logits/chosen": -2.410672664642334, + "logits/rejected": -2.4670870304107666, + "logps/chosen": -21.85871124267578, + "logps/rejected": -146.9501495361328, + "loss": 23.2672, + "losses_ref": -3.913135878974572e-05, + "ref_logps/chosen": -100.46055603027344, + "ref_logps/rejected": -86.49808502197266, + "rewards/accuracies": 0.90625, + "rewards/chosen": 78.60184478759766, + "rewards/margins": 139.05389404296875, + "rewards/rejected": -60.452056884765625, + "step": 2210, + "u": -2.714881420135498, + "weight": 0.09375067800283432 + }, + { + "diff_generated": -64.68046569824219, + "epoch": 0.7193778353856124, + "grad_norm": 426.80190950422735, + "learning_rate": 7.533268333919865e-07, + "logits/chosen": -2.4156734943389893, + "logits/rejected": -2.5220518112182617, + "logps/chosen": -23.024089813232422, + "logps/rejected": -160.01979064941406, + "loss": 21.8487, + "losses_ref": -2.0198485799483024e-06, + "ref_logps/chosen": -105.49406433105469, + "ref_logps/rejected": -95.33931732177734, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.46998596191406, + "rewards/margins": 147.15042114257812, + "rewards/rejected": -64.68046569824219, + "step": 2220, + "u": -2.883392333984375, + "weight": 0.03750001639127731 + }, + { + "diff_generated": -66.26643371582031, + "epoch": 0.7226182760855476, + "grad_norm": 466.1440755552816, + "learning_rate": 7.526173128226416e-07, + "logits/chosen": -2.4000744819641113, + "logits/rejected": -2.5063486099243164, + "logps/chosen": -23.757949829101562, + "logps/rejected": -160.63470458984375, + "loss": 21.44, + "losses_ref": -4.0359450395044405e-06, + "ref_logps/chosen": -102.93437194824219, + "ref_logps/rejected": -94.3682632446289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.17642211914062, + "rewards/margins": 145.44285583496094, + "rewards/rejected": -66.26643371582031, + "step": 2230, + "u": -2.8459458351135254, + "weight": 0.0500001423060894 + }, + { + "diff_generated": -62.00419998168945, + "epoch": 0.7258587167854829, + "grad_norm": 440.4294281394254, + "learning_rate": 7.519027791767069e-07, + "logits/chosen": -2.3969852924346924, + "logits/rejected": -2.445456027984619, + "logps/chosen": -25.067035675048828, + "logps/rejected": -158.9746856689453, + "loss": 22.5539, + "losses_ref": -7.336337148444727e-06, + "ref_logps/chosen": -107.4830093383789, + "ref_logps/rejected": -96.97047424316406, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.41597747802734, + "rewards/margins": 144.420166015625, + "rewards/rejected": -62.00419998168945, + "step": 2240, + "u": -2.864668846130371, + "weight": 0.043750133365392685 + }, + { + "diff_generated": -60.13362503051758, + "epoch": 0.729099157485418, + "grad_norm": 517.9713428292987, + "learning_rate": 7.511832426125375e-07, + "logits/chosen": -2.4315335750579834, + "logits/rejected": -2.459440231323242, + "logps/chosen": -22.896793365478516, + "logps/rejected": -151.60752868652344, + "loss": 22.1816, + "losses_ref": -0.03598102182149887, + "ref_logps/chosen": -105.4724349975586, + "ref_logps/rejected": -91.47391510009766, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 82.57564544677734, + "rewards/margins": 142.70925903320312, + "rewards/rejected": -60.13362503051758, + "step": 2250, + "u": -2.7703163623809814, + "weight": 0.07573078572750092 + }, + { + "diff_generated": -61.66132354736328, + "epoch": 0.7323395981853532, + "grad_norm": 516.4359264166292, + "learning_rate": 7.504587133596141e-07, + "logits/chosen": -2.4692695140838623, + "logits/rejected": -2.5395781993865967, + "logps/chosen": -21.603050231933594, + "logps/rejected": -155.9254150390625, + "loss": 21.559, + "losses_ref": -9.956043504644185e-05, + "ref_logps/chosen": -102.48640441894531, + "ref_logps/rejected": -94.26409912109375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.88334655761719, + "rewards/margins": 142.54466247558594, + "rewards/rejected": -61.66132354736328, + "step": 2260, + "u": -2.7710509300231934, + "weight": 0.07500142604112625 + }, + { + "diff_generated": -69.13394165039062, + "epoch": 0.7355800388852884, + "grad_norm": 495.25176006178754, + "learning_rate": 7.497292017183965e-07, + "logits/chosen": -2.4685215950012207, + "logits/rejected": -2.557250499725342, + "logps/chosen": -21.886051177978516, + "logps/rejected": -162.83438110351562, + "loss": 22.6062, + "losses_ref": -7.937352347653359e-05, + "ref_logps/chosen": -106.17291259765625, + "ref_logps/rejected": -93.70042419433594, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 84.28688049316406, + "rewards/margins": 153.4208221435547, + "rewards/rejected": -69.13394165039062, + "step": 2270, + "u": -2.920837879180908, + "weight": 0.025001097470521927 + }, + { + "diff_generated": -67.47743225097656, + "epoch": 0.7388204795852236, + "grad_norm": 485.1923407466973, + "learning_rate": 7.489947180601791e-07, + "logits/chosen": -2.3922696113586426, + "logits/rejected": -2.482814311981201, + "logps/chosen": -21.585960388183594, + "logps/rejected": -162.74942016601562, + "loss": 20.9103, + "losses_ref": -0.003178620245307684, + "ref_logps/chosen": -101.3736801147461, + "ref_logps/rejected": -95.27201843261719, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.78772735595703, + "rewards/margins": 147.26513671875, + "rewards/rejected": -67.47743225097656, + "step": 2280, + "u": -2.827193021774292, + "weight": 0.05628061294555664 + }, + { + "diff_generated": -67.1082763671875, + "epoch": 0.7420609202851588, + "grad_norm": 521.4494695236634, + "learning_rate": 7.482552728269412e-07, + "logits/chosen": -2.459716320037842, + "logits/rejected": -2.52567195892334, + "logps/chosen": -22.746732711791992, + "logps/rejected": -161.5008544921875, + "loss": 21.7148, + "losses_ref": -2.0488827431108803e-05, + "ref_logps/chosen": -105.77946472167969, + "ref_logps/rejected": -94.39257049560547, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.03275299072266, + "rewards/margins": 150.14102172851562, + "rewards/rejected": -67.1082763671875, + "step": 2290, + "u": -2.845945119857788, + "weight": 0.050000615417957306 + }, + { + "diff_generated": -65.12721252441406, + "epoch": 0.7453013609850939, + "grad_norm": 479.8495051392024, + "learning_rate": 7.475108765312001e-07, + "logits/chosen": -2.4147284030914307, + "logits/rejected": -2.4764370918273926, + "logps/chosen": -22.323062896728516, + "logps/rejected": -157.99337768554688, + "loss": 22.2798, + "losses_ref": -1.4420872503251303e-05, + "ref_logps/chosen": -104.8254165649414, + "ref_logps/rejected": -92.86617279052734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 82.50235748291016, + "rewards/margins": 147.62954711914062, + "rewards/rejected": -65.12721252441406, + "step": 2300, + "u": -2.8084986209869385, + "weight": 0.06250043213367462 + }, + { + "diff_generated": -64.4495849609375, + "epoch": 0.7485418016850292, + "grad_norm": 490.381996304794, + "learning_rate": 7.467615397558613e-07, + "logits/chosen": -2.425992488861084, + "logits/rejected": -2.536778211593628, + "logps/chosen": -22.38633918762207, + "logps/rejected": -158.5471954345703, + "loss": 22.8358, + "losses_ref": -1.2244420759088825e-05, + "ref_logps/chosen": -102.4664077758789, + "ref_logps/rejected": -94.09761047363281, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.08006286621094, + "rewards/margins": 144.5296630859375, + "rewards/rejected": -64.4495849609375, + "step": 2310, + "u": -2.8084990978240967, + "weight": 0.06250017136335373 + }, + { + "diff_generated": -64.47420501708984, + "epoch": 0.7517822423849644, + "grad_norm": 485.0885572896212, + "learning_rate": 7.460072731540676e-07, + "logits/chosen": -2.426664113998413, + "logits/rejected": -2.540827512741089, + "logps/chosen": -20.13199234008789, + "logps/rejected": -158.8080596923828, + "loss": 21.2484, + "losses_ref": -1.036675371324236e-06, + "ref_logps/chosen": -101.04064178466797, + "ref_logps/rejected": -94.33384704589844, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.90865325927734, + "rewards/margins": 145.38284301757812, + "rewards/rejected": -64.47420501708984, + "step": 2320, + "u": -2.8459458351135254, + "weight": 0.05000002309679985 + }, + { + "diff_generated": -64.52836608886719, + "epoch": 0.7550226830848995, + "grad_norm": 496.78201327829345, + "learning_rate": 7.452480874490483e-07, + "logits/chosen": -2.433242082595825, + "logits/rejected": -2.5348401069641113, + "logps/chosen": -20.570690155029297, + "logps/rejected": -159.38168334960938, + "loss": 21.5967, + "losses_ref": -0.17097142338752747, + "ref_logps/chosen": -103.46836853027344, + "ref_logps/rejected": -94.85330963134766, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.8976821899414, + "rewards/margins": 147.42605590820312, + "rewards/rejected": -64.52836608886719, + "step": 2330, + "u": -2.8245105743408203, + "weight": 0.058567456901073456 + }, + { + "diff_generated": -68.34838104248047, + "epoch": 0.7582631237848347, + "grad_norm": 471.99192442956496, + "learning_rate": 7.44483993433966e-07, + "logits/chosen": -2.409271717071533, + "logits/rejected": -2.536099672317505, + "logps/chosen": -18.12385368347168, + "logps/rejected": -163.3461456298828, + "loss": 21.7101, + "losses_ref": -9.695874177850783e-05, + "ref_logps/chosen": -98.65773010253906, + "ref_logps/rejected": -94.99775695800781, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.53387451171875, + "rewards/margins": 148.8822479248047, + "rewards/rejected": -68.34838104248047, + "step": 2340, + "u": -2.8272194862365723, + "weight": 0.05625307559967041 + }, + { + "diff_generated": -64.6845474243164, + "epoch": 0.76150356448477, + "grad_norm": 489.92266490281014, + "learning_rate": 7.437150019717641e-07, + "logits/chosen": -2.3955681324005127, + "logits/rejected": -2.508866310119629, + "logps/chosen": -19.950611114501953, + "logps/rejected": -154.4866180419922, + "loss": 22.6635, + "losses_ref": -7.071851723594591e-05, + "ref_logps/chosen": -100.37982177734375, + "ref_logps/rejected": -89.80207824707031, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.42921447753906, + "rewards/margins": 145.11373901367188, + "rewards/rejected": -64.6845474243164, + "step": 2350, + "u": -2.7710509300231934, + "weight": 0.07500143349170685 + }, + { + "diff_generated": -66.65843963623047, + "epoch": 0.7647440051847051, + "grad_norm": 477.8300325994623, + "learning_rate": 7.429411239950116e-07, + "logits/chosen": -2.4518046379089355, + "logits/rejected": -2.560556411743164, + "logps/chosen": -23.21895980834961, + "logps/rejected": -164.80648803710938, + "loss": 22.1374, + "losses_ref": -4.8610025260131806e-05, + "ref_logps/chosen": -104.90296936035156, + "ref_logps/rejected": -98.14803314208984, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.68400573730469, + "rewards/margins": 148.34243774414062, + "rewards/rejected": -66.65843963623047, + "step": 2360, + "u": -2.845944881439209, + "weight": 0.05000089854001999 + }, + { + "diff_generated": -66.3208236694336, + "epoch": 0.7679844458846403, + "grad_norm": 452.81373646927426, + "learning_rate": 7.421623705057477e-07, + "logits/chosen": -2.4707469940185547, + "logits/rejected": -2.5160088539123535, + "logps/chosen": -18.89572525024414, + "logps/rejected": -159.04458618164062, + "loss": 21.3106, + "losses_ref": -9.07492358237505e-05, + "ref_logps/chosen": -102.90705871582031, + "ref_logps/rejected": -92.72374725341797, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 84.01132202148438, + "rewards/margins": 150.33213806152344, + "rewards/rejected": -66.3208236694336, + "step": 2370, + "u": -2.7710506916046143, + "weight": 0.07500214129686356 + }, + { + "diff_generated": -61.952720642089844, + "epoch": 0.7712248865845756, + "grad_norm": 498.58487987552655, + "learning_rate": 7.413787525753261e-07, + "logits/chosen": -2.402759552001953, + "logits/rejected": -2.4742043018341064, + "logps/chosen": -21.622535705566406, + "logps/rejected": -149.8624725341797, + "loss": 22.742, + "losses_ref": -0.002540594432502985, + "ref_logps/chosen": -99.20856475830078, + "ref_logps/rejected": -87.90975189208984, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 77.58602142333984, + "rewards/margins": 139.53875732421875, + "rewards/rejected": -61.952720642089844, + "step": 2380, + "u": -2.7335448265075684, + "weight": 0.08756421506404877 + }, + { + "diff_generated": -65.39524841308594, + "epoch": 0.7744653272845107, + "grad_norm": 469.90953526285074, + "learning_rate": 7.405902813442564e-07, + "logits/chosen": -2.4351794719696045, + "logits/rejected": -2.5125985145568848, + "logps/chosen": -19.192697525024414, + "logps/rejected": -160.94482421875, + "loss": 20.883, + "losses_ref": -0.00011022499529644847, + "ref_logps/chosen": -103.31660461425781, + "ref_logps/rejected": -95.549560546875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 84.12390899658203, + "rewards/margins": 149.5191650390625, + "rewards/rejected": -65.39524841308594, + "step": 2390, + "u": -2.864665985107422, + "weight": 0.04375321418046951 + }, + { + "diff_generated": -66.54057312011719, + "epoch": 0.7777057679844459, + "grad_norm": 443.37611926249815, + "learning_rate": 7.39796968022047e-07, + "logits/chosen": -2.370534658432007, + "logits/rejected": -2.4970390796661377, + "logps/chosen": -19.731678009033203, + "logps/rejected": -162.2751922607422, + "loss": 20.9988, + "losses_ref": -0.0014762509381398559, + "ref_logps/chosen": -100.18806457519531, + "ref_logps/rejected": -95.73462677001953, + "rewards/accuracies": 0.96875, + "rewards/chosen": 80.45638275146484, + "rewards/margins": 146.9969482421875, + "rewards/rejected": -66.54057312011719, + "step": 2400, + "u": -2.9020659923553467, + "weight": 0.031302180141210556 + }, + { + "diff_generated": -66.64973449707031, + "epoch": 0.780946208684381, + "grad_norm": 486.44332089600374, + "learning_rate": 7.389988238870451e-07, + "logits/chosen": -2.4307727813720703, + "logits/rejected": -2.447831392288208, + "logps/chosen": -23.99560546875, + "logps/rejected": -163.25650024414062, + "loss": 21.7153, + "losses_ref": -0.0023669307120144367, + "ref_logps/chosen": -112.10261535644531, + "ref_logps/rejected": -96.60676574707031, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 88.10700988769531, + "rewards/margins": 154.75674438476562, + "rewards/rejected": -66.64973449707031, + "step": 2410, + "u": -2.883309841156006, + "weight": 0.037586063146591187 + }, + { + "diff_generated": -67.35139465332031, + "epoch": 0.7841866493843163, + "grad_norm": 497.9936829582562, + "learning_rate": 7.381958602862763e-07, + "logits/chosen": -2.425899028778076, + "logits/rejected": -2.4827542304992676, + "logps/chosen": -22.701671600341797, + "logps/rejected": -162.77073669433594, + "loss": 21.7061, + "losses_ref": -0.0016478713368996978, + "ref_logps/chosen": -108.4243392944336, + "ref_logps/rejected": -95.41934204101562, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.72265625, + "rewards/margins": 153.07406616210938, + "rewards/rejected": -67.35139465332031, + "step": 2420, + "u": -2.8833794593811035, + "weight": 0.03751363232731819 + }, + { + "diff_generated": -67.99402618408203, + "epoch": 0.7874270900842515, + "grad_norm": 520.3935841964872, + "learning_rate": 7.373880886352832e-07, + "logits/chosen": -2.4826571941375732, + "logits/rejected": -2.5400936603546143, + "logps/chosen": -24.883174896240234, + "logps/rejected": -164.97372436523438, + "loss": 21.7416, + "losses_ref": -0.0028067389503121376, + "ref_logps/chosen": -110.7287368774414, + "ref_logps/rejected": -96.97972106933594, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.84556579589844, + "rewards/margins": 153.839599609375, + "rewards/rejected": -67.99402618408203, + "step": 2430, + "u": -2.883361339569092, + "weight": 0.0375325009226799 + }, + { + "diff_generated": -68.01702880859375, + "epoch": 0.7906675307841866, + "grad_norm": 485.6200345774653, + "learning_rate": 7.365755204179637e-07, + "logits/chosen": -2.337303638458252, + "logits/rejected": -2.5004334449768066, + "logps/chosen": -21.580249786376953, + "logps/rejected": -164.0987091064453, + "loss": 21.8716, + "losses_ref": -0.0004503504023887217, + "ref_logps/chosen": -100.92903137207031, + "ref_logps/rejected": -96.08168029785156, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.34877014160156, + "rewards/margins": 147.36581420898438, + "rewards/rejected": -68.01702880859375, + "step": 2440, + "u": -2.8646552562713623, + "weight": 0.04376443102955818 + }, + { + "diff_generated": -66.02622985839844, + "epoch": 0.7939079714841218, + "grad_norm": 491.08878187266504, + "learning_rate": 7.357581671864073e-07, + "logits/chosen": -2.3765900135040283, + "logits/rejected": -2.5279154777526855, + "logps/chosen": -21.805896759033203, + "logps/rejected": -162.8059844970703, + "loss": 22.1446, + "losses_ref": -0.00038308973307721317, + "ref_logps/chosen": -104.0298080444336, + "ref_logps/rejected": -96.77973937988281, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.22390747070312, + "rewards/margins": 148.25015258789062, + "rewards/rejected": -66.02622985839844, + "step": 2450, + "u": -2.8272156715393066, + "weight": 0.05625728517770767 + }, + { + "diff_generated": -65.53976440429688, + "epoch": 0.7971484121840571, + "grad_norm": 441.5921689816669, + "learning_rate": 7.349360405607303e-07, + "logits/chosen": -2.3526320457458496, + "logits/rejected": -2.5054166316986084, + "logps/chosen": -18.177913665771484, + "logps/rejected": -153.53543090820312, + "loss": 21.1706, + "losses_ref": -2.2838194126961753e-05, + "ref_logps/chosen": -96.81034851074219, + "ref_logps/rejected": -87.99567413330078, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 78.63243865966797, + "rewards/margins": 144.17221069335938, + "rewards/rejected": -65.53976440429688, + "step": 2460, + "u": -2.73360538482666, + "weight": 0.0875006690621376 + }, + { + "diff_generated": -63.57398223876953, + "epoch": 0.8003888528839922, + "grad_norm": 443.2087952671502, + "learning_rate": 7.341091522289122e-07, + "logits/chosen": -2.4686570167541504, + "logits/rejected": -2.539186954498291, + "logps/chosen": -20.775897979736328, + "logps/rejected": -156.01101684570312, + "loss": 21.1844, + "losses_ref": -0.020325956866145134, + "ref_logps/chosen": -104.54405212402344, + "ref_logps/rejected": -92.4370346069336, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 83.7681655883789, + "rewards/margins": 147.34213256835938, + "rewards/rejected": -63.57398223876953, + "step": 2470, + "u": -2.8642375469207764, + "weight": 0.044188931584358215 + }, + { + "diff_generated": -67.1478271484375, + "epoch": 0.8036292935839274, + "grad_norm": 464.5524810852338, + "learning_rate": 7.332775139466278e-07, + "logits/chosen": -2.5062315464019775, + "logits/rejected": -2.5958425998687744, + "logps/chosen": -20.429386138916016, + "logps/rejected": -166.99806213378906, + "loss": 21.832, + "losses_ref": -4.462396100279875e-05, + "ref_logps/chosen": -106.252685546875, + "ref_logps/rejected": -99.85023498535156, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 85.82329559326172, + "rewards/margins": 152.97113037109375, + "rewards/rejected": -67.1478271484375, + "step": 2480, + "u": -2.939561128616333, + "weight": 0.018751276656985283 + }, + { + "diff_generated": -61.29233932495117, + "epoch": 0.8068697342838627, + "grad_norm": 452.46199016409196, + "learning_rate": 7.324411375370809e-07, + "logits/chosen": -2.3994901180267334, + "logits/rejected": -2.4693729877471924, + "logps/chosen": -21.55607032775879, + "logps/rejected": -152.15652465820312, + "loss": 22.3104, + "losses_ref": -4.651785729947733e-07, + "ref_logps/chosen": -100.73274230957031, + "ref_logps/rejected": -90.86417388916016, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 79.17667388916016, + "rewards/margins": 140.46902465820312, + "rewards/rejected": -61.29233932495117, + "step": 2490, + "u": -2.771052360534668, + "weight": 0.07500000298023224 + }, + { + "diff_generated": -63.45140838623047, + "epoch": 0.8101101749837978, + "grad_norm": 482.5709323861089, + "learning_rate": 7.316000348908365e-07, + "logits/chosen": -2.4395322799682617, + "logits/rejected": -2.494189739227295, + "logps/chosen": -22.55654525756836, + "logps/rejected": -155.06121826171875, + "loss": 21.8039, + "losses_ref": -0.002513646613806486, + "ref_logps/chosen": -102.3602523803711, + "ref_logps/rejected": -91.60980987548828, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 79.80370330810547, + "rewards/margins": 143.25511169433594, + "rewards/rejected": -63.45140838623047, + "step": 2500, + "u": -2.770974636077881, + "weight": 0.07508156448602676 + }, + { + "diff_generated": -65.52870178222656, + "epoch": 0.813350615683733, + "grad_norm": 504.7063906203371, + "learning_rate": 7.307542179656511e-07, + "logits/chosen": -2.433494806289673, + "logits/rejected": -2.512331247329712, + "logps/chosen": -21.219797134399414, + "logps/rejected": -157.972412109375, + "loss": 21.5012, + "losses_ref": -5.332523869583383e-05, + "ref_logps/chosen": -103.51334381103516, + "ref_logps/rejected": -92.44371032714844, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 82.29354858398438, + "rewards/margins": 147.82223510742188, + "rewards/rejected": -65.52870178222656, + "step": 2510, + "u": -2.7710516452789307, + "weight": 0.07500091940164566 + }, + { + "diff_generated": -62.45106887817383, + "epoch": 0.8165910563836681, + "grad_norm": 475.13560592764156, + "learning_rate": 7.29903698786303e-07, + "logits/chosen": -2.4201724529266357, + "logits/rejected": -2.4631454944610596, + "logps/chosen": -22.06174659729004, + "logps/rejected": -152.70657348632812, + "loss": 21.7992, + "losses_ref": -0.003750152885913849, + "ref_logps/chosen": -105.0264892578125, + "ref_logps/rejected": -90.2554931640625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 82.9647445678711, + "rewards/margins": 145.4158172607422, + "rewards/rejected": -62.45106887817383, + "step": 2520, + "u": -2.8458142280578613, + "weight": 0.05013728886842728 + }, + { + "diff_generated": -59.080657958984375, + "epoch": 0.8198314970836034, + "grad_norm": 476.35332307840963, + "learning_rate": 7.290484894444214e-07, + "logits/chosen": -2.3711142539978027, + "logits/rejected": -2.4935173988342285, + "logps/chosen": -19.110456466674805, + "logps/rejected": -148.75198364257812, + "loss": 20.8768, + "losses_ref": -1.9302149212307995e-06, + "ref_logps/chosen": -95.0177001953125, + "ref_logps/rejected": -89.67131042480469, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 75.9072494506836, + "rewards/margins": 134.9879150390625, + "rewards/rejected": -59.080657958984375, + "step": 2530, + "u": -2.65871262550354, + "weight": 0.11250003427267075 + }, + { + "diff_generated": -63.03108596801758, + "epoch": 0.8230719377835386, + "grad_norm": 509.794427016166, + "learning_rate": 7.281886020983144e-07, + "logits/chosen": -2.421325445175171, + "logits/rejected": -2.4717049598693848, + "logps/chosen": -23.188867568969727, + "logps/rejected": -155.51673889160156, + "loss": 20.9843, + "losses_ref": -3.044925324502401e-05, + "ref_logps/chosen": -106.98453521728516, + "ref_logps/rejected": -92.48563385009766, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 83.79566192626953, + "rewards/margins": 146.82675170898438, + "rewards/rejected": -63.03108596801758, + "step": 2540, + "u": -2.883391857147217, + "weight": 0.03750057518482208 + }, + { + "diff_generated": -63.89447784423828, + "epoch": 0.8263123784834737, + "grad_norm": 509.63529206637907, + "learning_rate": 7.273240489727963e-07, + "logits/chosen": -2.4009127616882324, + "logits/rejected": -2.4554474353790283, + "logps/chosen": -22.355283737182617, + "logps/rejected": -153.71575927734375, + "loss": 21.594, + "losses_ref": -0.0010917767649516463, + "ref_logps/chosen": -104.6422348022461, + "ref_logps/rejected": -89.8212890625, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 82.28695678710938, + "rewards/margins": 146.18142700195312, + "rewards/rejected": -63.89447784423828, + "step": 2550, + "u": -2.7897419929504395, + "weight": 0.06878544390201569 + }, + { + "diff_generated": -65.38443756103516, + "epoch": 0.829552819183409, + "grad_norm": 449.2525910292251, + "learning_rate": 7.264548423590133e-07, + "logits/chosen": -2.411677837371826, + "logits/rejected": -2.497188091278076, + "logps/chosen": -20.991043090820312, + "logps/rejected": -158.5404510498047, + "loss": 21.2943, + "losses_ref": -3.5285770536575e-06, + "ref_logps/chosen": -102.8228759765625, + "ref_logps/rejected": -93.15602111816406, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.83184051513672, + "rewards/margins": 147.2162628173828, + "rewards/rejected": -65.38443756103516, + "step": 2560, + "u": -2.8272223472595215, + "weight": 0.056250084191560745 + }, + { + "diff_generated": -64.46638488769531, + "epoch": 0.8327932598833442, + "grad_norm": 482.2496726200671, + "learning_rate": 7.255809946142695e-07, + "logits/chosen": -2.4007577896118164, + "logits/rejected": -2.467890501022339, + "logps/chosen": -22.70101547241211, + "logps/rejected": -158.30746459960938, + "loss": 22.7473, + "losses_ref": -3.9935894164955243e-05, + "ref_logps/chosen": -105.04328918457031, + "ref_logps/rejected": -93.8410873413086, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.3422622680664, + "rewards/margins": 146.80865478515625, + "rewards/rejected": -64.46638488769531, + "step": 2570, + "u": -2.827221632003784, + "weight": 0.056250907480716705 + }, + { + "diff_generated": -66.88107299804688, + "epoch": 0.8360337005832793, + "grad_norm": 480.3455155687526, + "learning_rate": 7.247025181618508e-07, + "logits/chosen": -2.434239387512207, + "logits/rejected": -2.5026652812957764, + "logps/chosen": -22.651269912719727, + "logps/rejected": -163.5029754638672, + "loss": 21.5124, + "losses_ref": -4.334291588747874e-05, + "ref_logps/chosen": -105.74259948730469, + "ref_logps/rejected": -96.62191009521484, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 83.09132385253906, + "rewards/margins": 149.97239685058594, + "rewards/rejected": -66.88107299804688, + "step": 2580, + "u": -2.8833911418914795, + "weight": 0.03750128298997879 + }, + { + "diff_generated": -66.57487487792969, + "epoch": 0.8392741412832145, + "grad_norm": 516.8389468913304, + "learning_rate": 7.238194254908483e-07, + "logits/chosen": -2.364807367324829, + "logits/rejected": -2.4656434059143066, + "logps/chosen": -21.80525779724121, + "logps/rejected": -159.70773315429688, + "loss": 23.2982, + "losses_ref": -4.899716077488847e-05, + "ref_logps/chosen": -102.61576843261719, + "ref_logps/rejected": -93.1328353881836, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.8105239868164, + "rewards/margins": 147.38540649414062, + "rewards/rejected": -66.57487487792969, + "step": 2590, + "u": -2.8272204399108887, + "weight": 0.05625171214342117 + }, + { + "diff_generated": -64.31034851074219, + "epoch": 0.8425145819831497, + "grad_norm": 429.4437523001111, + "learning_rate": 7.229317291559807e-07, + "logits/chosen": -2.421973705291748, + "logits/rejected": -2.543602466583252, + "logps/chosen": -22.478836059570312, + "logps/rejected": -159.15264892578125, + "loss": 21.5832, + "losses_ref": -0.010902591049671173, + "ref_logps/chosen": -102.53382873535156, + "ref_logps/rejected": -94.84230041503906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.05498504638672, + "rewards/margins": 144.36532592773438, + "rewards/rejected": -64.31034851074219, + "step": 2600, + "u": -2.86452054977417, + "weight": 0.04390469565987587 + }, + { + "diff_generated": -63.15277099609375, + "epoch": 0.8457550226830849, + "grad_norm": 475.58022107129483, + "learning_rate": 7.22039441777416e-07, + "logits/chosen": -2.386164665222168, + "logits/rejected": -2.4948296546936035, + "logps/chosen": -20.19105339050293, + "logps/rejected": -155.09671020507812, + "loss": 21.9778, + "losses_ref": -0.5333021879196167, + "ref_logps/chosen": -100.70254516601562, + "ref_logps/rejected": -91.94395446777344, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 80.51148986816406, + "rewards/margins": 143.66427612304688, + "rewards/rejected": -63.15277099609375, + "step": 2610, + "u": -2.774341344833374, + "weight": 0.07484979927539825 + }, + { + "diff_generated": -65.25281524658203, + "epoch": 0.8489954633830201, + "grad_norm": 476.3490711162794, + "learning_rate": 7.21142576040592e-07, + "logits/chosen": -2.4489424228668213, + "logits/rejected": -2.5270750522613525, + "logps/chosen": -23.80867576599121, + "logps/rejected": -158.56993103027344, + "loss": 21.3916, + "losses_ref": -2.8012605071126018e-06, + "ref_logps/chosen": -108.41732025146484, + "ref_logps/rejected": -93.31712341308594, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.60865020751953, + "rewards/margins": 149.86146545410156, + "rewards/rejected": -65.25281524658203, + "step": 2620, + "u": -2.883392333984375, + "weight": 0.03750006482005119 + }, + { + "diff_generated": -71.90721130371094, + "epoch": 0.8522359040829552, + "grad_norm": 460.74924900016555, + "learning_rate": 7.202411446960357e-07, + "logits/chosen": -2.4267380237579346, + "logits/rejected": -2.514688491821289, + "logps/chosen": -23.868526458740234, + "logps/rejected": -172.26315307617188, + "loss": 21.7441, + "losses_ref": -0.0003219700593035668, + "ref_logps/chosen": -109.62545013427734, + "ref_logps/rejected": -100.35595703125, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 85.75691986083984, + "rewards/margins": 157.66412353515625, + "rewards/rejected": -71.90721130371094, + "step": 2630, + "u": -2.939551830291748, + "weight": 0.018760699778795242 + }, + { + "diff_generated": -67.33240509033203, + "epoch": 0.8554763447828905, + "grad_norm": 458.961819298331, + "learning_rate": 7.193351605591825e-07, + "logits/chosen": -2.4317545890808105, + "logits/rejected": -2.5447564125061035, + "logps/chosen": -20.274295806884766, + "logps/rejected": -160.0033416748047, + "loss": 20.6512, + "losses_ref": -0.004495877772569656, + "ref_logps/chosen": -100.54878234863281, + "ref_logps/rejected": -92.67094421386719, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.27449035644531, + "rewards/margins": 147.6068878173828, + "rewards/rejected": -67.33240509033203, + "step": 2640, + "u": -2.845808267593384, + "weight": 0.05014340952038765 + }, + { + "diff_generated": -66.26509857177734, + "epoch": 0.8587167854828257, + "grad_norm": 493.0098324223735, + "learning_rate": 7.184246365101939e-07, + "logits/chosen": -2.4812989234924316, + "logits/rejected": -2.4861576557159424, + "logps/chosen": -23.611316680908203, + "logps/rejected": -164.7738037109375, + "loss": 22.9302, + "losses_ref": -1.1925852959393524e-05, + "ref_logps/chosen": -111.0340805053711, + "ref_logps/rejected": -98.50871276855469, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 87.4227523803711, + "rewards/margins": 153.6878662109375, + "rewards/rejected": -66.26509857177734, + "step": 2650, + "u": -2.883392095565796, + "weight": 0.03750019520521164 + }, + { + "diff_generated": -67.60322570800781, + "epoch": 0.8619572261827608, + "grad_norm": 474.88283722662726, + "learning_rate": 7.175095854937739e-07, + "logits/chosen": -2.430361270904541, + "logits/rejected": -2.506678819656372, + "logps/chosen": -21.14377784729004, + "logps/rejected": -163.33004760742188, + "loss": 22.1799, + "losses_ref": -8.996898941404652e-06, + "ref_logps/chosen": -107.8935775756836, + "ref_logps/rejected": -95.72682189941406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 86.74979400634766, + "rewards/margins": 154.35302734375, + "rewards/rejected": -67.60322570800781, + "step": 2660, + "u": -2.9021155834198, + "weight": 0.031250081956386566 + }, + { + "diff_generated": -65.65187072753906, + "epoch": 0.8651976668826961, + "grad_norm": 476.0503207025247, + "learning_rate": 7.165900205189853e-07, + "logits/chosen": -2.4238972663879395, + "logits/rejected": -2.548886775970459, + "logps/chosen": -20.459117889404297, + "logps/rejected": -162.53985595703125, + "loss": 20.9651, + "losses_ref": -1.285673306483659e-06, + "ref_logps/chosen": -101.68338775634766, + "ref_logps/rejected": -96.88798522949219, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.2242660522461, + "rewards/margins": 146.8761444091797, + "rewards/rejected": -65.65187072753906, + "step": 2670, + "u": -2.8272223472595215, + "weight": 0.05625001713633537 + }, + { + "diff_generated": -68.16822052001953, + "epoch": 0.8684381075826313, + "grad_norm": 441.27316639147307, + "learning_rate": 7.156659546590653e-07, + "logits/chosen": -2.372884511947632, + "logits/rejected": -2.4970219135284424, + "logps/chosen": -19.2420597076416, + "logps/rejected": -163.27047729492188, + "loss": 21.3463, + "losses_ref": -1.6201815014937893e-05, + "ref_logps/chosen": -102.10786437988281, + "ref_logps/rejected": -95.10227966308594, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.86579895019531, + "rewards/margins": 151.03402709960938, + "rewards/rejected": -68.16822052001953, + "step": 2680, + "u": -2.864668607711792, + "weight": 0.04375042766332626 + }, + { + "diff_generated": -62.75067901611328, + "epoch": 0.8716785482825664, + "grad_norm": 477.9844945711301, + "learning_rate": 7.147374010512385e-07, + "logits/chosen": -2.336512327194214, + "logits/rejected": -2.411938190460205, + "logps/chosen": -19.760942459106445, + "logps/rejected": -151.9824981689453, + "loss": 20.9396, + "losses_ref": -0.0028138109482824802, + "ref_logps/chosen": -97.89578247070312, + "ref_logps/rejected": -89.23181915283203, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 78.13484191894531, + "rewards/margins": 140.88552856445312, + "rewards/rejected": -62.75067901611328, + "step": 2690, + "u": -2.7335686683654785, + "weight": 0.0875391885638237 + }, + { + "diff_generated": -64.98628234863281, + "epoch": 0.8749189889825016, + "grad_norm": 463.3283372772628, + "learning_rate": 7.13804372896531e-07, + "logits/chosen": -2.34386944770813, + "logits/rejected": -2.469419240951538, + "logps/chosen": -20.207157135009766, + "logps/rejected": -156.82220458984375, + "loss": 21.6988, + "losses_ref": -8.056328624661546e-06, + "ref_logps/chosen": -100.11888885498047, + "ref_logps/rejected": -91.8359375, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.91172790527344, + "rewards/margins": 144.8979949951172, + "rewards/rejected": -64.98628234863281, + "step": 2700, + "u": -2.8272223472595215, + "weight": 0.056250233203172684 + }, + { + "diff_generated": -65.19478607177734, + "epoch": 0.8781594296824368, + "grad_norm": 451.0588003541249, + "learning_rate": 7.128668834595827e-07, + "logits/chosen": -2.456400156021118, + "logits/rejected": -2.477356433868408, + "logps/chosen": -24.446142196655273, + "logps/rejected": -160.22738647460938, + "loss": 20.6906, + "losses_ref": -0.49245530366897583, + "ref_logps/chosen": -108.24623107910156, + "ref_logps/rejected": -95.0325927734375, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.80009460449219, + "rewards/margins": 148.99490356445312, + "rewards/rejected": -65.19478607177734, + "step": 2710, + "u": -2.8409817218780518, + "weight": 0.05361700803041458 + }, + { + "diff_generated": -65.49759674072266, + "epoch": 0.881399870382372, + "grad_norm": 495.22343761633147, + "learning_rate": 7.119249460684583e-07, + "logits/chosen": -2.367755889892578, + "logits/rejected": -2.434497117996216, + "logps/chosen": -23.03337287902832, + "logps/rejected": -157.77207946777344, + "loss": 22.5919, + "losses_ref": -4.087133675056975e-06, + "ref_logps/chosen": -105.26911926269531, + "ref_logps/rejected": -92.27449035644531, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.2357406616211, + "rewards/margins": 147.7333526611328, + "rewards/rejected": -65.49759674072266, + "step": 2720, + "u": -2.883392095565796, + "weight": 0.037500105798244476 + }, + { + "diff_generated": -65.98580169677734, + "epoch": 0.8846403110823072, + "grad_norm": 444.99842861935, + "learning_rate": 7.109785741144577e-07, + "logits/chosen": -2.3372819423675537, + "logits/rejected": -2.4926247596740723, + "logps/chosen": -22.061023712158203, + "logps/rejected": -161.5064239501953, + "loss": 21.3548, + "losses_ref": -0.0001229167974088341, + "ref_logps/chosen": -99.80843353271484, + "ref_logps/rejected": -95.5206298828125, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.7474136352539, + "rewards/margins": 143.73321533203125, + "rewards/rejected": -65.98580169677734, + "step": 2730, + "u": -2.864666700363159, + "weight": 0.04375254735350609 + }, + { + "diff_generated": -65.17032623291016, + "epoch": 0.8878807517822424, + "grad_norm": 431.9604761609712, + "learning_rate": 7.100277810519264e-07, + "logits/chosen": -2.429638385772705, + "logits/rejected": -2.466818332672119, + "logps/chosen": -21.803356170654297, + "logps/rejected": -160.6989288330078, + "loss": 21.1908, + "losses_ref": -1.469298149459064e-05, + "ref_logps/chosen": -107.4203109741211, + "ref_logps/rejected": -95.52860260009766, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.61695861816406, + "rewards/margins": 150.7872772216797, + "rewards/rejected": -65.17032623291016, + "step": 2740, + "u": -2.864668607711792, + "weight": 0.043750371783971786 + }, + { + "diff_generated": -62.27899169921875, + "epoch": 0.8911211924821776, + "grad_norm": 445.17508131716704, + "learning_rate": 7.090725803980633e-07, + "logits/chosen": -2.3705966472625732, + "logits/rejected": -2.4691929817199707, + "logps/chosen": -20.565372467041016, + "logps/rejected": -153.2864532470703, + "loss": 21.8369, + "losses_ref": -0.284327894449234, + "ref_logps/chosen": -99.65605163574219, + "ref_logps/rejected": -91.00746154785156, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.0906753540039, + "rewards/margins": 141.36965942382812, + "rewards/rejected": -62.27899169921875, + "step": 2750, + "u": -2.769773006439209, + "weight": 0.07507076114416122 + }, + { + "diff_generated": -64.82371520996094, + "epoch": 0.8943616331821128, + "grad_norm": 453.49151697328085, + "learning_rate": 7.081129857327297e-07, + "logits/chosen": -2.4251158237457275, + "logits/rejected": -2.4884116649627686, + "logps/chosen": -20.679590225219727, + "logps/rejected": -155.69390869140625, + "loss": 22.4464, + "losses_ref": -9.631844477553386e-06, + "ref_logps/chosen": -106.25617980957031, + "ref_logps/rejected": -90.87019348144531, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 85.57656860351562, + "rewards/margins": 150.40029907226562, + "rewards/rejected": -64.82371520996094, + "step": 2760, + "u": -2.8459455966949463, + "weight": 0.05000028759241104 + }, + { + "diff_generated": -64.34266662597656, + "epoch": 0.8976020738820479, + "grad_norm": 491.41312846460556, + "learning_rate": 7.071490106982547e-07, + "logits/chosen": -2.413074493408203, + "logits/rejected": -2.484881639480591, + "logps/chosen": -22.38163185119629, + "logps/rejected": -160.21389770507812, + "loss": 22.0316, + "losses_ref": -0.002562170149758458, + "ref_logps/chosen": -105.37538146972656, + "ref_logps/rejected": -95.87122344970703, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 82.99375915527344, + "rewards/margins": 147.33642578125, + "rewards/rejected": -64.34266662597656, + "step": 2770, + "u": -2.7710483074188232, + "weight": 0.07500428706407547 + }, + { + "diff_generated": -67.3309326171875, + "epoch": 0.9008425145819832, + "grad_norm": 502.84459768270165, + "learning_rate": 7.061806689992424e-07, + "logits/chosen": -2.3805642127990723, + "logits/rejected": -2.461560010910034, + "logps/chosen": -22.008333206176758, + "logps/rejected": -161.05548095703125, + "loss": 20.997, + "losses_ref": -1.2658249943342526e-05, + "ref_logps/chosen": -104.16725158691406, + "ref_logps/rejected": -93.72454071044922, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.15892028808594, + "rewards/margins": 149.48983764648438, + "rewards/rejected": -67.3309326171875, + "step": 2780, + "u": -2.8272221088409424, + "weight": 0.056250374764204025 + }, + { + "diff_generated": -68.47444152832031, + "epoch": 0.9040829552819183, + "grad_norm": 472.69960968145017, + "learning_rate": 7.052079744023769e-07, + "logits/chosen": -2.5325608253479004, + "logits/rejected": -2.539520740509033, + "logps/chosen": -25.220869064331055, + "logps/rejected": -161.8907470703125, + "loss": 21.6475, + "losses_ref": -0.0035590652842074633, + "ref_logps/chosen": -113.51033782958984, + "ref_logps/rejected": -93.41632080078125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 88.28947448730469, + "rewards/margins": 156.763916015625, + "rewards/rejected": -68.47444152832031, + "step": 2790, + "u": -2.9020447731018066, + "weight": 0.03132425993680954 + }, + { + "diff_generated": -67.08792877197266, + "epoch": 0.9073233959818535, + "grad_norm": 488.04719352739943, + "learning_rate": 7.042309407362264e-07, + "logits/chosen": -2.373950958251953, + "logits/rejected": -2.4799036979675293, + "logps/chosen": -19.957622528076172, + "logps/rejected": -161.4197998046875, + "loss": 21.9205, + "losses_ref": -0.002272902289405465, + "ref_logps/chosen": -104.90579986572266, + "ref_logps/rejected": -94.3318862915039, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 84.94816589355469, + "rewards/margins": 152.03610229492188, + "rewards/rejected": -67.08792877197266, + "step": 2800, + "u": -2.864635944366455, + "weight": 0.04378475621342659 + }, + { + "diff_generated": -66.03021240234375, + "epoch": 0.9105638366817888, + "grad_norm": 490.3691307236369, + "learning_rate": 7.032495818910462e-07, + "logits/chosen": -2.4491524696350098, + "logits/rejected": -2.489548444747925, + "logps/chosen": -20.264293670654297, + "logps/rejected": -161.38272094726562, + "loss": 21.0841, + "losses_ref": -0.00022527421242557466, + "ref_logps/chosen": -104.36869049072266, + "ref_logps/rejected": -95.35247802734375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 84.10438537597656, + "rewards/margins": 150.13461303710938, + "rewards/rejected": -66.03021240234375, + "step": 2810, + "u": -2.8084964752197266, + "weight": 0.06250251829624176 + }, + { + "diff_generated": -67.41888427734375, + "epoch": 0.9138042773817239, + "grad_norm": 486.8872212427091, + "learning_rate": 7.022639118185819e-07, + "logits/chosen": -2.4307072162628174, + "logits/rejected": -2.4828052520751953, + "logps/chosen": -22.83551597595215, + "logps/rejected": -165.50588989257812, + "loss": 21.1277, + "losses_ref": -1.439565221517114e-05, + "ref_logps/chosen": -109.61708068847656, + "ref_logps/rejected": -98.0870132446289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 86.78156280517578, + "rewards/margins": 154.200439453125, + "rewards/rejected": -67.41888427734375, + "step": 2820, + "u": -2.8459455966949463, + "weight": 0.0500003881752491 + }, + { + "diff_generated": -67.82255554199219, + "epoch": 0.9170447180816591, + "grad_norm": 451.73707471536756, + "learning_rate": 7.012739445318712e-07, + "logits/chosen": -2.4566941261291504, + "logits/rejected": -2.5175421237945557, + "logps/chosen": -22.337270736694336, + "logps/rejected": -162.39474487304688, + "loss": 21.2607, + "losses_ref": -5.0442671636119485e-05, + "ref_logps/chosen": -108.32450866699219, + "ref_logps/rejected": -94.57220458984375, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.98723602294922, + "rewards/margins": 153.80978393554688, + "rewards/rejected": -67.82255554199219, + "step": 2830, + "u": -2.8833909034729004, + "weight": 0.03750161454081535 + }, + { + "diff_generated": -61.91322708129883, + "epoch": 0.9202851587815943, + "grad_norm": 464.9502645006035, + "learning_rate": 7.002796941050435e-07, + "logits/chosen": -2.440680980682373, + "logits/rejected": -2.4862771034240723, + "logps/chosen": -20.703678131103516, + "logps/rejected": -156.7772216796875, + "loss": 22.5431, + "losses_ref": -0.00046888887300156057, + "ref_logps/chosen": -102.28060150146484, + "ref_logps/rejected": -94.86397552490234, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.5769271850586, + "rewards/margins": 143.4901580810547, + "rewards/rejected": -61.91322708129883, + "step": 2840, + "u": -2.808485507965088, + "weight": 0.06251436471939087 + }, + { + "diff_generated": -70.20816802978516, + "epoch": 0.9235255994815295, + "grad_norm": 493.41459172605283, + "learning_rate": 6.992811746731213e-07, + "logits/chosen": -2.441591739654541, + "logits/rejected": -2.5436251163482666, + "logps/chosen": -23.536701202392578, + "logps/rejected": -168.00621032714844, + "loss": 22.1721, + "losses_ref": -5.428822760222829e-07, + "ref_logps/chosen": -107.83604431152344, + "ref_logps/rejected": -97.79803466796875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 84.29934692382812, + "rewards/margins": 154.5075225830078, + "rewards/rejected": -70.20816802978516, + "step": 2850, + "u": -2.86466908454895, + "weight": 0.043750010430812836 + }, + { + "diff_generated": -70.08772277832031, + "epoch": 0.9267660401814647, + "grad_norm": 462.0927338424099, + "learning_rate": 6.98278400431818e-07, + "logits/chosen": -2.4696240425109863, + "logits/rejected": -2.5594918727874756, + "logps/chosen": -23.94564437866211, + "logps/rejected": -169.55609130859375, + "loss": 22.0111, + "losses_ref": -2.789838436001446e-05, + "ref_logps/chosen": -109.51057434082031, + "ref_logps/rejected": -99.4683609008789, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 85.56492614746094, + "rewards/margins": 155.6526641845703, + "rewards/rejected": -70.08772277832031, + "step": 2860, + "u": -2.9208388328552246, + "weight": 0.025000441819429398 + }, + { + "diff_generated": -69.10173034667969, + "epoch": 0.9300064808813999, + "grad_norm": 427.4263201187022, + "learning_rate": 6.972713856373369e-07, + "logits/chosen": -2.4598889350891113, + "logits/rejected": -2.5475592613220215, + "logps/chosen": -21.60628890991211, + "logps/rejected": -163.983154296875, + "loss": 21.4799, + "losses_ref": -2.7808329150502686e-07, + "ref_logps/chosen": -104.75083923339844, + "ref_logps/rejected": -94.88143920898438, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 83.1445541381836, + "rewards/margins": 152.2462921142578, + "rewards/rejected": -69.10173034667969, + "step": 2870, + "u": -2.9208388328552246, + "weight": 0.02500000223517418 + }, + { + "diff_generated": -68.18804931640625, + "epoch": 0.933246921581335, + "grad_norm": 439.3131259211645, + "learning_rate": 6.962601446061681e-07, + "logits/chosen": -2.4569830894470215, + "logits/rejected": -2.500361680984497, + "logps/chosen": -20.999797821044922, + "logps/rejected": -161.28903198242188, + "loss": 21.0596, + "losses_ref": -0.23938922584056854, + "ref_logps/chosen": -106.21049499511719, + "ref_logps/rejected": -93.10096740722656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 85.21070098876953, + "rewards/margins": 153.39874267578125, + "rewards/rejected": -68.18804931640625, + "step": 2880, + "u": -2.898897886276245, + "weight": 0.033899758011102676 + }, + { + "diff_generated": -68.9483642578125, + "epoch": 0.9364873622812703, + "grad_norm": 442.4508979736482, + "learning_rate": 6.952446917148853e-07, + "logits/chosen": -2.4237473011016846, + "logits/rejected": -2.5172266960144043, + "logps/chosen": -21.555978775024414, + "logps/rejected": -165.6626434326172, + "loss": 21.3983, + "losses_ref": -4.073951640748419e-05, + "ref_logps/chosen": -108.49068450927734, + "ref_logps/rejected": -96.71427917480469, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 86.93470764160156, + "rewards/margins": 155.883056640625, + "rewards/rejected": -68.9483642578125, + "step": 2890, + "u": -2.920837879180908, + "weight": 0.025001296773552895 + }, + { + "diff_generated": -69.78572082519531, + "epoch": 0.9397278029812054, + "grad_norm": 467.51746560826535, + "learning_rate": 6.94225041399941e-07, + "logits/chosen": -2.426409959793091, + "logits/rejected": -2.533045768737793, + "logps/chosen": -20.896163940429688, + "logps/rejected": -166.4514617919922, + "loss": 20.4097, + "losses_ref": -1.5812247511348687e-06, + "ref_logps/chosen": -105.7077407836914, + "ref_logps/rejected": -96.66574096679688, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 84.81156921386719, + "rewards/margins": 154.59727478027344, + "rewards/rejected": -69.78572082519531, + "step": 2900, + "u": -2.8272223472595215, + "weight": 0.05625004693865776 + }, + { + "diff_generated": -68.22757720947266, + "epoch": 0.9429682436811406, + "grad_norm": 420.11700035346576, + "learning_rate": 6.932012081574615e-07, + "logits/chosen": -2.45912766456604, + "logits/rejected": -2.5262129306793213, + "logps/chosen": -21.73269271850586, + "logps/rejected": -164.52273559570312, + "loss": 21.6465, + "losses_ref": -2.5883713306029676e-07, + "ref_logps/chosen": -102.88648986816406, + "ref_logps/rejected": -96.295166015625, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.1537857055664, + "rewards/margins": 149.38136291503906, + "rewards/rejected": -68.22757720947266, + "step": 2910, + "u": -2.86466908454895, + "weight": 0.04375000298023224 + }, + { + "diff_generated": -62.90834426879883, + "epoch": 0.9462086843810759, + "grad_norm": 432.40511833107803, + "learning_rate": 6.921732065430411e-07, + "logits/chosen": -2.4034066200256348, + "logits/rejected": -2.5192956924438477, + "logps/chosen": -18.513896942138672, + "logps/rejected": -156.45840454101562, + "loss": 20.4589, + "losses_ref": -1.186371264338959e-05, + "ref_logps/chosen": -99.51529693603516, + "ref_logps/rejected": -93.55006408691406, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 81.00138854980469, + "rewards/margins": 143.9097442626953, + "rewards/rejected": -62.90834426879883, + "step": 2920, + "u": -2.771052360534668, + "weight": 0.07500024884939194 + }, + { + "diff_generated": -63.88426971435547, + "epoch": 0.949449125081011, + "grad_norm": 453.7971832480189, + "learning_rate": 6.911410511715343e-07, + "logits/chosen": -2.4226136207580566, + "logits/rejected": -2.485466718673706, + "logps/chosen": -21.352230072021484, + "logps/rejected": -155.57913208007812, + "loss": 20.7018, + "losses_ref": -6.413492519641295e-05, + "ref_logps/chosen": -101.99363708496094, + "ref_logps/rejected": -91.69486236572266, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.64141082763672, + "rewards/margins": 144.52566528320312, + "rewards/rejected": -63.88426971435547, + "step": 2930, + "u": -2.7710506916046143, + "weight": 0.07500205188989639 + }, + { + "diff_generated": -64.1293716430664, + "epoch": 0.9526895657809462, + "grad_norm": 503.66026246273566, + "learning_rate": 6.901047567168491e-07, + "logits/chosen": -2.485138177871704, + "logits/rejected": -2.535628080368042, + "logps/chosen": -21.61728858947754, + "logps/rejected": -156.21929931640625, + "loss": 21.2944, + "losses_ref": -2.6853744202526286e-05, + "ref_logps/chosen": -103.80733489990234, + "ref_logps/rejected": -92.08992004394531, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 82.19004821777344, + "rewards/margins": 146.31944274902344, + "rewards/rejected": -64.1293716430664, + "step": 2940, + "u": -2.7710518836975098, + "weight": 0.07500077039003372 + }, + { + "diff_generated": -67.72306823730469, + "epoch": 0.9559300064808814, + "grad_norm": 479.0401195393126, + "learning_rate": 6.890643379117374e-07, + "logits/chosen": -2.460814952850342, + "logits/rejected": -2.503139019012451, + "logps/chosen": -21.537755966186523, + "logps/rejected": -164.7609405517578, + "loss": 20.722, + "losses_ref": -3.388463665032759e-05, + "ref_logps/chosen": -109.59577941894531, + "ref_logps/rejected": -97.03787994384766, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 88.05802154541016, + "rewards/margins": 155.7810821533203, + "rewards/rejected": -67.72306823730469, + "step": 2950, + "u": -2.827221393585205, + "weight": 0.05625098943710327 + }, + { + "diff_generated": -67.97636413574219, + "epoch": 0.9591704471808166, + "grad_norm": 493.7459099373087, + "learning_rate": 6.880198095475866e-07, + "logits/chosen": -2.4812498092651367, + "logits/rejected": -2.506908416748047, + "logps/chosen": -25.37700653076172, + "logps/rejected": -164.8048553466797, + "loss": 21.5508, + "losses_ref": -7.404306643365999e-07, + "ref_logps/chosen": -114.54148864746094, + "ref_logps/rejected": -96.8284912109375, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 89.16448974609375, + "rewards/margins": 157.14083862304688, + "rewards/rejected": -67.97636413574219, + "step": 2960, + "u": -2.883392333984375, + "weight": 0.03750002384185791 + }, + { + "diff_generated": -62.93597412109375, + "epoch": 0.9624108878807518, + "grad_norm": 494.96677426188234, + "learning_rate": 6.86971186474208e-07, + "logits/chosen": -2.450610399246216, + "logits/rejected": -2.526500940322876, + "logps/chosen": -21.140817642211914, + "logps/rejected": -156.13851928710938, + "loss": 22.2409, + "losses_ref": -0.014812096953392029, + "ref_logps/chosen": -101.89533996582031, + "ref_logps/rejected": -93.20252990722656, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.75450897216797, + "rewards/margins": 143.69049072265625, + "rewards/rejected": -62.93597412109375, + "step": 2970, + "u": -2.8270161151885986, + "weight": 0.0564638152718544 + }, + { + "diff_generated": -67.33392333984375, + "epoch": 0.9656513285806869, + "grad_norm": 473.1966481975596, + "learning_rate": 6.859184835996271e-07, + "logits/chosen": -2.435161590576172, + "logits/rejected": -2.5541937351226807, + "logps/chosen": -19.948017120361328, + "logps/rejected": -164.65989685058594, + "loss": 21.1082, + "losses_ref": -2.529567791498266e-05, + "ref_logps/chosen": -104.0987548828125, + "ref_logps/rejected": -97.32598114013672, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 84.1507568359375, + "rewards/margins": 151.4846649169922, + "rewards/rejected": -67.33392333984375, + "step": 2980, + "u": -2.864668369293213, + "weight": 0.04375060647726059 + }, + { + "diff_generated": -68.59715270996094, + "epoch": 0.9688917692806222, + "grad_norm": 458.49717269508545, + "learning_rate": 6.848617158898704e-07, + "logits/chosen": -2.430609941482544, + "logits/rejected": -2.5278258323669434, + "logps/chosen": -18.157360076904297, + "logps/rejected": -165.1083984375, + "loss": 20.1239, + "losses_ref": -0.0001408242096658796, + "ref_logps/chosen": -100.29402160644531, + "ref_logps/rejected": -96.51124572753906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.13666534423828, + "rewards/margins": 150.73382568359375, + "rewards/rejected": -68.59715270996094, + "step": 2990, + "u": -2.8646674156188965, + "weight": 0.0437517948448658 + }, + { + "diff_generated": -66.93498229980469, + "epoch": 0.9721322099805574, + "grad_norm": 475.1336735245732, + "learning_rate": 6.838008983687538e-07, + "logits/chosen": -2.413914680480957, + "logits/rejected": -2.5006773471832275, + "logps/chosen": -19.738994598388672, + "logps/rejected": -166.29986572265625, + "loss": 20.8047, + "losses_ref": -0.00012624330702237785, + "ref_logps/chosen": -103.92607116699219, + "ref_logps/rejected": -99.3648681640625, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.18708038330078, + "rewards/margins": 151.1220703125, + "rewards/rejected": -66.93498229980469, + "step": 3000, + "u": -2.883389711380005, + "weight": 0.037502724677324295 + }, + { + "diff_generated": -69.31685638427734, + "epoch": 0.9753726506804925, + "grad_norm": 481.7839747788105, + "learning_rate": 6.827360461176675e-07, + "logits/chosen": -2.4137511253356934, + "logits/rejected": -2.5089802742004395, + "logps/chosen": -22.41954231262207, + "logps/rejected": -162.98382568359375, + "loss": 22.2178, + "losses_ref": -2.792777740978636e-05, + "ref_logps/chosen": -102.8587875366211, + "ref_logps/rejected": -93.6669692993164, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.43924713134766, + "rewards/margins": 149.756103515625, + "rewards/rejected": -69.31685638427734, + "step": 3010, + "u": -2.845945358276367, + "weight": 0.05000030994415283 + }, + { + "diff_generated": -64.78567504882812, + "epoch": 0.9786130913804277, + "grad_norm": 463.0567393908434, + "learning_rate": 6.816671742753636e-07, + "logits/chosen": -2.4083762168884277, + "logits/rejected": -2.4754223823547363, + "logps/chosen": -22.250520706176758, + "logps/rejected": -155.5216064453125, + "loss": 20.263, + "losses_ref": -0.00012122480984544381, + "ref_logps/chosen": -102.7984390258789, + "ref_logps/rejected": -90.7359390258789, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.54792022705078, + "rewards/margins": 145.33358764648438, + "rewards/rejected": -64.78567504882812, + "step": 3020, + "u": -2.7710487842559814, + "weight": 0.07500384747982025 + }, + { + "diff_generated": -66.65394592285156, + "epoch": 0.981853532080363, + "grad_norm": 454.57826847430374, + "learning_rate": 6.80594298037739e-07, + "logits/chosen": -2.421954393386841, + "logits/rejected": -2.501904010772705, + "logps/chosen": -20.70921516418457, + "logps/rejected": -162.4676971435547, + "loss": 21.9045, + "losses_ref": -0.00022600118245463818, + "ref_logps/chosen": -103.9954833984375, + "ref_logps/rejected": -95.81376647949219, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.28627014160156, + "rewards/margins": 149.94021606445312, + "rewards/rejected": -66.65394592285156, + "step": 3030, + "u": -2.8459420204162598, + "weight": 0.05000375956296921 + }, + { + "diff_generated": -68.96112823486328, + "epoch": 0.9850939727802981, + "grad_norm": 467.9486022361592, + "learning_rate": 6.795174326576201e-07, + "logits/chosen": -2.483123302459717, + "logits/rejected": -2.5641913414001465, + "logps/chosen": -21.764116287231445, + "logps/rejected": -166.7112274169922, + "loss": 20.9532, + "losses_ref": -5.619110083898704e-07, + "ref_logps/chosen": -108.0898208618164, + "ref_logps/rejected": -97.75010681152344, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 86.32571411132812, + "rewards/margins": 155.28683471679688, + "rewards/rejected": -68.96112823486328, + "step": 3040, + "u": -2.9208388328552246, + "weight": 0.025000011548399925 + }, + { + "diff_generated": -66.76451110839844, + "epoch": 0.9883344134802333, + "grad_norm": 464.85586881726755, + "learning_rate": 6.784365934445467e-07, + "logits/chosen": -2.3822216987609863, + "logits/rejected": -2.5249252319335938, + "logps/chosen": -19.584186553955078, + "logps/rejected": -161.15478515625, + "loss": 21.1499, + "losses_ref": -8.875102503225207e-05, + "ref_logps/chosen": -98.30550384521484, + "ref_logps/rejected": -94.3902587890625, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 78.7213134765625, + "rewards/margins": 145.48582458496094, + "rewards/rejected": -66.76451110839844, + "step": 3050, + "u": -2.7897732257843018, + "weight": 0.06875281780958176 + }, + { + "diff_generated": -65.75493621826172, + "epoch": 0.9915748541801686, + "grad_norm": 472.29826294791883, + "learning_rate": 6.77351795764553e-07, + "logits/chosen": -2.497274160385132, + "logits/rejected": -2.5820260047912598, + "logps/chosen": -20.913732528686523, + "logps/rejected": -163.86170959472656, + "loss": 21.139, + "losses_ref": -8.552480721846223e-05, + "ref_logps/chosen": -109.77827453613281, + "ref_logps/rejected": -98.10675811767578, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 88.86453247070312, + "rewards/margins": 154.61947631835938, + "rewards/rejected": -65.75493621826172, + "step": 3060, + "u": -2.883389949798584, + "weight": 0.037502508610486984 + }, + { + "diff_generated": -63.22431182861328, + "epoch": 0.9948152948801037, + "grad_norm": 471.64849193039237, + "learning_rate": 6.7626305503995e-07, + "logits/chosen": -2.400494337081909, + "logits/rejected": -2.4999756813049316, + "logps/chosen": -21.634984970092773, + "logps/rejected": -154.18283081054688, + "loss": 20.7493, + "losses_ref": -6.76894069329137e-06, + "ref_logps/chosen": -102.55821228027344, + "ref_logps/rejected": -90.95851135253906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.92323303222656, + "rewards/margins": 144.14755249023438, + "rewards/rejected": -63.22431182861328, + "step": 3070, + "u": -2.864668846130371, + "weight": 0.043750178068876266 + }, + { + "diff_generated": -70.42333984375, + "epoch": 0.9980557355800389, + "grad_norm": 449.27113500553077, + "learning_rate": 6.75170386749106e-07, + "logits/chosen": -2.4183688163757324, + "logits/rejected": -2.507997512817383, + "logps/chosen": -22.62767219543457, + "logps/rejected": -171.69192504882812, + "loss": 20.6119, + "losses_ref": -0.000404975755373016, + "ref_logps/chosen": -107.9837875366211, + "ref_logps/rejected": -101.26859283447266, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 85.35613250732422, + "rewards/margins": 155.7794647216797, + "rewards/rejected": -70.42333984375, + "step": 3080, + "u": -2.9770007133483887, + "weight": 0.006258577108383179 + } + ], + "logging_steps": 10, + "max_steps": 9258, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}