{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 57.237549195872155, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0180665254592896, "logits/rejected": -0.9884552955627441, "logps/chosen": -0.27425095438957214, "logps/rejected": -0.2716319262981415, "loss": 3.1091, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -2.7425098419189453, "rewards/margins": -0.02619057334959507, "rewards/rejected": -2.7163190841674805, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 36.2177280707271, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.047877311706543, "logits/rejected": -0.9804394841194153, "logps/chosen": -0.2944500744342804, "logps/rejected": -0.29980722069740295, "loss": 3.1522, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.944500684738159, "rewards/margins": 0.05357087776064873, "rewards/rejected": -2.9980719089508057, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 51.02954591523818, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9653420448303223, "logits/rejected": -0.9844053983688354, "logps/chosen": -0.26417964696884155, "logps/rejected": -0.30082693696022034, "loss": 3.2048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.641796588897705, "rewards/margins": 0.3664725720882416, "rewards/rejected": -3.0082690715789795, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 56.89476138009963, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9597972631454468, "logits/rejected": -0.9341325759887695, "logps/chosen": -0.27756327390670776, "logps/rejected": -0.2916925251483917, "loss": 3.1321, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.7756330966949463, "rewards/margins": 0.14129219949245453, "rewards/rejected": -2.9169249534606934, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 56.48955746474513, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.001181960105896, "logits/rejected": -0.9730860590934753, "logps/chosen": -0.2715573310852051, "logps/rejected": -0.27819815278053284, "loss": 3.3596, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.7155730724334717, "rewards/margins": 0.06640852242708206, "rewards/rejected": -2.7819817066192627, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 47.66267593497189, "learning_rate": 6.382978723404255e-07, "logits/chosen": -1.0001966953277588, "logits/rejected": -0.9549218416213989, "logps/chosen": -0.2734990119934082, "logps/rejected": -0.2796509861946106, "loss": 2.9655, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -2.734990358352661, "rewards/margins": 0.06151958554983139, "rewards/rejected": -2.7965099811553955, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 57.03302592987705, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0495048761367798, "logits/rejected": -0.9743221998214722, "logps/chosen": -0.2940281331539154, "logps/rejected": -0.31984126567840576, "loss": 3.0572, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.940281391143799, "rewards/margins": 0.25813135504722595, "rewards/rejected": -3.1984126567840576, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 64.29646368113443, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.0000861883163452, "logits/rejected": -0.9559175372123718, "logps/chosen": -0.28027427196502686, "logps/rejected": -0.3249492049217224, "loss": 3.0201, "rewards/accuracies": 0.59375, "rewards/chosen": -2.8027429580688477, "rewards/margins": 0.4467490315437317, "rewards/rejected": -3.2494919300079346, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 34.0521027952876, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.049403429031372, "logits/rejected": -1.0066633224487305, "logps/chosen": -0.3022717535495758, "logps/rejected": -0.355845183134079, "loss": 3.1061, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.0227174758911133, "rewards/margins": 0.5357345342636108, "rewards/rejected": -3.5584518909454346, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 50.184137131794785, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.0293877124786377, "logits/rejected": -0.9806405901908875, "logps/chosen": -0.3117847442626953, "logps/rejected": -0.3513973653316498, "loss": 3.1525, "rewards/accuracies": 0.46875, "rewards/chosen": -3.1178476810455322, "rewards/margins": 0.39612606167793274, "rewards/rejected": -3.5139732360839844, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 139.4899548956689, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.0580527782440186, "logits/rejected": -1.0236852169036865, "logps/chosen": -0.29338452219963074, "logps/rejected": -0.36238163709640503, "loss": 2.8456, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.933845043182373, "rewards/margins": 0.6899713277816772, "rewards/rejected": -3.6238162517547607, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 58.45122397836986, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.0895339250564575, "logits/rejected": -1.0574713945388794, "logps/chosen": -0.33461707830429077, "logps/rejected": -0.35189467668533325, "loss": 2.9738, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.3461709022521973, "rewards/margins": 0.17277587950229645, "rewards/rejected": -3.518946886062622, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 100.62827839328082, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.011530876159668, "logits/rejected": -0.9821838140487671, "logps/chosen": -0.4006083011627197, "logps/rejected": -0.464979887008667, "loss": 2.9379, "rewards/accuracies": 0.59375, "rewards/chosen": -4.0060834884643555, "rewards/margins": 0.6437152624130249, "rewards/rejected": -4.649798393249512, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 39.36526232625554, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.0184242725372314, "logits/rejected": -0.9939621686935425, "logps/chosen": -0.3619542419910431, "logps/rejected": -0.4431312084197998, "loss": 2.9573, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.619542360305786, "rewards/margins": 0.8117697834968567, "rewards/rejected": -4.43131160736084, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 83.11076732917083, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9596433639526367, "logits/rejected": -0.8910166621208191, "logps/chosen": -0.3588549494743347, "logps/rejected": -0.4166484773159027, "loss": 2.9742, "rewards/accuracies": 0.53125, "rewards/chosen": -3.588549852371216, "rewards/margins": 0.5779348015785217, "rewards/rejected": -4.1664838790893555, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 52.281331982276065, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.9426174163818359, "logits/rejected": -0.9289323687553406, "logps/chosen": -0.35129761695861816, "logps/rejected": -0.4580927789211273, "loss": 2.9737, "rewards/accuracies": 0.5625, "rewards/chosen": -3.5129764080047607, "rewards/margins": 1.067950963973999, "rewards/rejected": -4.580927848815918, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 61.53493979772547, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9526857137680054, "logits/rejected": -0.9304324388504028, "logps/chosen": -0.34235039353370667, "logps/rejected": -0.40353184938430786, "loss": 2.7213, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.4235033988952637, "rewards/margins": 0.6118148565292358, "rewards/rejected": -4.035318851470947, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 75.22407650978651, "learning_rate": 9.743592451943998e-07, "logits/chosen": -0.9911141395568848, "logits/rejected": -0.9571215510368347, "logps/chosen": -0.4391642212867737, "logps/rejected": -0.5185960531234741, "loss": 3.0403, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.391642093658447, "rewards/margins": 0.7943190336227417, "rewards/rejected": -5.1859612464904785, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 46.673632090780266, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.079331636428833, "logits/rejected": -0.996097207069397, "logps/chosen": -0.4490174353122711, "logps/rejected": -0.49736976623535156, "loss": 2.8747, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.490174293518066, "rewards/margins": 0.4835231900215149, "rewards/rejected": -4.973697662353516, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 78.85306309497338, "learning_rate": 9.612209208833646e-07, "logits/chosen": -0.9557577967643738, "logits/rejected": -0.9308866262435913, "logps/chosen": -0.4265132546424866, "logps/rejected": -0.4960516393184662, "loss": 2.9809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.265132427215576, "rewards/margins": 0.6953836679458618, "rewards/rejected": -4.960515975952148, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 80.40817210917017, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.9734071493148804, "logits/rejected": -0.9203007817268372, "logps/chosen": -0.4045742154121399, "logps/rejected": -0.5108767747879028, "loss": 2.9566, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.045742034912109, "rewards/margins": 1.063025712966919, "rewards/rejected": -5.108767509460449, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 55.451042957143265, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9206374883651733, "logits/rejected": -0.8604587316513062, "logps/chosen": -0.45949387550354004, "logps/rejected": -0.6004349589347839, "loss": 2.8412, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.594939231872559, "rewards/margins": 1.4094107151031494, "rewards/rejected": -6.004349708557129, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 51.58223883398887, "learning_rate": 9.367041003085648e-07, "logits/chosen": -0.9696682691574097, "logits/rejected": -0.9112384915351868, "logps/chosen": -0.4893345832824707, "logps/rejected": -0.5542086362838745, "loss": 2.7495, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.893345355987549, "rewards/margins": 0.6487414240837097, "rewards/rejected": -5.542087078094482, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 75.59919212642018, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.9438816905021667, "logits/rejected": -0.8547528386116028, "logps/chosen": -0.5028254985809326, "logps/rejected": -0.7035338878631592, "loss": 2.5628, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.028255462646484, "rewards/margins": 2.0070836544036865, "rewards/rejected": -7.03533935546875, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 66.32992513821185, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.016081690788269, "logits/rejected": -0.9737744331359863, "logps/chosen": -0.5569332838058472, "logps/rejected": -0.6537975072860718, "loss": 2.4448, "rewards/accuracies": 0.65625, "rewards/chosen": -5.569332599639893, "rewards/margins": 0.9686424136161804, "rewards/rejected": -6.537975311279297, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 139.11732623143496, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.032061219215393, "logits/rejected": -1.0252352952957153, "logps/chosen": -0.5443070530891418, "logps/rejected": -0.8193408250808716, "loss": 2.4333, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.443070411682129, "rewards/margins": 2.750338077545166, "rewards/rejected": -8.193408012390137, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 107.42202232758989, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.0170912742614746, "logits/rejected": -0.9671396017074585, "logps/chosen": -0.6021947264671326, "logps/rejected": -0.7191929221153259, "loss": 2.517, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.021947383880615, "rewards/margins": 1.169981837272644, "rewards/rejected": -7.191929817199707, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 77.05576180382866, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.0649584531784058, "logits/rejected": -1.0430896282196045, "logps/chosen": -0.6696725487709045, "logps/rejected": -0.8106359243392944, "loss": 2.3997, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.696726322174072, "rewards/margins": 1.4096347093582153, "rewards/rejected": -8.106359481811523, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 68.63585118244188, "learning_rate": 8.71572412738697e-07, "logits/chosen": -0.9915879964828491, "logits/rejected": -0.9645885229110718, "logps/chosen": -0.6888564825057983, "logps/rejected": -0.9088963270187378, "loss": 2.0828, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.8885650634765625, "rewards/margins": 2.2003989219665527, "rewards/rejected": -9.088963508605957, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 77.90508875376052, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.0322893857955933, "logits/rejected": -1.0123205184936523, "logps/chosen": -0.7648183107376099, "logps/rejected": -0.9603475332260132, "loss": 2.2673, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.6481828689575195, "rewards/margins": 1.9552921056747437, "rewards/rejected": -9.603475570678711, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 75.78147375517075, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.0493463277816772, "logits/rejected": -1.0279868841171265, "logps/chosen": -0.8422037363052368, "logps/rejected": -1.0871771574020386, "loss": 2.2922, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.422037124633789, "rewards/margins": 2.4497344493865967, "rewards/rejected": -10.871770858764648, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 86.49849369728787, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.0481699705123901, "logits/rejected": -1.030601143836975, "logps/chosen": -0.9466081857681274, "logps/rejected": -1.3202154636383057, "loss": 2.3755, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.466081619262695, "rewards/margins": 3.736072540283203, "rewards/rejected": -13.202154159545898, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 66.82340849667754, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.0408477783203125, "logits/rejected": -1.0174505710601807, "logps/chosen": -0.9799006581306458, "logps/rejected": -1.3342236280441284, "loss": 2.1125, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.799007415771484, "rewards/margins": 3.5432305335998535, "rewards/rejected": -13.34223747253418, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 97.43908089438905, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.0224933624267578, "logits/rejected": -1.001030683517456, "logps/chosen": -0.9700697064399719, "logps/rejected": -1.358564853668213, "loss": 1.9793, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.70069694519043, "rewards/margins": 3.884951114654541, "rewards/rejected": -13.585647583007812, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 98.62704227490674, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.04830002784729, "logits/rejected": -1.0014127492904663, "logps/chosen": -1.0846463441848755, "logps/rejected": -1.3184218406677246, "loss": 1.8174, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -10.846463203430176, "rewards/margins": 2.3377552032470703, "rewards/rejected": -13.18421745300293, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 74.26153272998572, "learning_rate": 7.723195175075135e-07, "logits/chosen": -0.9784607887268066, "logits/rejected": -0.9590786099433899, "logps/chosen": -1.03909432888031, "logps/rejected": -1.3960068225860596, "loss": 1.8592, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -10.39094352722168, "rewards/margins": 3.5691237449645996, "rewards/rejected": -13.960065841674805, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 88.60468047988923, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.0223743915557861, "logits/rejected": -0.9691470861434937, "logps/chosen": -1.0873353481292725, "logps/rejected": -1.4810540676116943, "loss": 1.8506, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -10.873353958129883, "rewards/margins": 3.937185764312744, "rewards/rejected": -14.810541152954102, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 79.84179637831463, "learning_rate": 7.402210918896689e-07, "logits/chosen": -0.9927349090576172, "logits/rejected": -1.0011526346206665, "logps/chosen": -1.2325414419174194, "logps/rejected": -1.739311933517456, "loss": 1.6742, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -12.325414657592773, "rewards/margins": 5.067704200744629, "rewards/rejected": -17.393117904663086, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 75.55606036176057, "learning_rate": 7.236565741578162e-07, "logits/chosen": -0.9720694422721863, "logits/rejected": -0.9535917043685913, "logps/chosen": -1.2131645679473877, "logps/rejected": -1.5727177858352661, "loss": 1.7343, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -12.131647109985352, "rewards/margins": 3.5955300331115723, "rewards/rejected": -15.727177619934082, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 95.29175169321584, "learning_rate": 7.067792524832603e-07, "logits/chosen": -0.9580856561660767, "logits/rejected": -0.9478925466537476, "logps/chosen": -1.2784286737442017, "logps/rejected": -1.7080621719360352, "loss": 1.7675, "rewards/accuracies": 0.78125, "rewards/chosen": -12.784285545349121, "rewards/margins": 4.296335220336914, "rewards/rejected": -17.08062171936035, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 97.56349101288879, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.003482460975647, "logits/rejected": -0.9547850489616394, "logps/chosen": -1.410736322402954, "logps/rejected": -1.8478959798812866, "loss": 1.8853, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -14.107362747192383, "rewards/margins": 4.371596336364746, "rewards/rejected": -18.478958129882812, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 124.17700452204937, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.0298535823822021, "logits/rejected": -1.020567774772644, "logps/chosen": -1.4878171682357788, "logps/rejected": -1.9283632040023804, "loss": 1.7977, "rewards/accuracies": 0.84375, "rewards/chosen": -14.878171920776367, "rewards/margins": 4.405461311340332, "rewards/rejected": -19.283634185791016, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 122.41736903454225, "learning_rate": 6.545084971874736e-07, "logits/chosen": -0.9621469378471375, "logits/rejected": -0.9473578333854675, "logps/chosen": -1.558885097503662, "logps/rejected": -2.0420405864715576, "loss": 1.712, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.588850021362305, "rewards/margins": 4.8315558433532715, "rewards/rejected": -20.420406341552734, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 118.87466737296252, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.0177868604660034, "logits/rejected": -1.0038330554962158, "logps/chosen": -1.6249806880950928, "logps/rejected": -2.1466097831726074, "loss": 1.6798, "rewards/accuracies": 0.75, "rewards/chosen": -16.249807357788086, "rewards/margins": 5.216291904449463, "rewards/rejected": -21.46609878540039, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 71.45686372104745, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.0141699314117432, "logits/rejected": -0.9860795736312866, "logps/chosen": -1.6077144145965576, "logps/rejected": -2.097548723220825, "loss": 1.5264, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -16.077144622802734, "rewards/margins": 4.898342132568359, "rewards/rejected": -20.975486755371094, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 96.03329426013343, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.1012922525405884, "logits/rejected": -1.0541749000549316, "logps/chosen": -1.536604881286621, "logps/rejected": -1.9562132358551025, "loss": 1.5597, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -15.366048812866211, "rewards/margins": 4.196080207824707, "rewards/rejected": -19.562129974365234, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 92.4386577422302, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.1283349990844727, "logits/rejected": -1.1022907495498657, "logps/chosen": -1.527305245399475, "logps/rejected": -2.0704562664031982, "loss": 1.4964, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -15.273053169250488, "rewards/margins": 5.431510925292969, "rewards/rejected": -20.70456314086914, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 99.65700789182705, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.098283290863037, "logits/rejected": -1.1012353897094727, "logps/chosen": -1.430646538734436, "logps/rejected": -1.9294793605804443, "loss": 1.5595, "rewards/accuracies": 0.8125, "rewards/chosen": -14.306467056274414, "rewards/margins": 4.98832893371582, "rewards/rejected": -19.294795989990234, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 129.78574414242638, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.1896294355392456, "logits/rejected": -1.1353044509887695, "logps/chosen": -1.4528030157089233, "logps/rejected": -2.033853054046631, "loss": 1.475, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -14.52802848815918, "rewards/margins": 5.810500144958496, "rewards/rejected": -20.33852767944336, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 128.20283155514042, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.1316919326782227, "logits/rejected": -1.1171941757202148, "logps/chosen": -1.538417100906372, "logps/rejected": -2.1081037521362305, "loss": 1.4196, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.384170532226562, "rewards/margins": 5.696866512298584, "rewards/rejected": -21.081039428710938, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 297.4344168039998, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.1720324754714966, "logits/rejected": -1.1475986242294312, "logps/chosen": -1.6590303182601929, "logps/rejected": -2.150458335876465, "loss": 1.6319, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.590303421020508, "rewards/margins": 4.914281845092773, "rewards/rejected": -21.50458335876465, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 110.4855476557986, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.145662546157837, "logits/rejected": -1.1171993017196655, "logps/chosen": -1.5531560182571411, "logps/rejected": -2.0303704738616943, "loss": 1.4324, "rewards/accuracies": 0.8125, "rewards/chosen": -15.531560897827148, "rewards/margins": 4.7721452713012695, "rewards/rejected": -20.303707122802734, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 83.14465015789618, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.1167972087860107, "logits/rejected": -1.1012585163116455, "logps/chosen": -1.6575731039047241, "logps/rejected": -2.175945997238159, "loss": 1.5468, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -16.575729370117188, "rewards/margins": 5.183730125427246, "rewards/rejected": -21.75946044921875, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 116.06958067335016, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.1125719547271729, "logits/rejected": -1.0850471258163452, "logps/chosen": -1.4920045137405396, "logps/rejected": -2.0412135124206543, "loss": 1.411, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -14.920045852661133, "rewards/margins": 5.492091655731201, "rewards/rejected": -20.41213607788086, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 93.41636467602738, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.0854105949401855, "logits/rejected": -1.0501768589019775, "logps/chosen": -1.501579999923706, "logps/rejected": -1.966059684753418, "loss": 1.6514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.015800476074219, "rewards/margins": 4.644796848297119, "rewards/rejected": -19.66059684753418, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 76.77398013161601, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.1719205379486084, "logits/rejected": -1.148206114768982, "logps/chosen": -1.5624816417694092, "logps/rejected": -2.106921672821045, "loss": 1.5084, "rewards/accuracies": 0.8125, "rewards/chosen": -15.6248140335083, "rewards/margins": 5.44440221786499, "rewards/rejected": -21.069217681884766, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 111.58387969589569, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.1303155422210693, "logits/rejected": -1.0974434614181519, "logps/chosen": -1.6112607717514038, "logps/rejected": -2.212517261505127, "loss": 1.3065, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -16.112607955932617, "rewards/margins": 6.012566089630127, "rewards/rejected": -22.125173568725586, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 101.64126956685786, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.1603832244873047, "logits/rejected": -1.117941975593567, "logps/chosen": -1.6240203380584717, "logps/rejected": -2.1277661323547363, "loss": 1.5293, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.240203857421875, "rewards/margins": 5.037457466125488, "rewards/rejected": -21.277660369873047, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 107.06654753863799, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.0859363079071045, "logits/rejected": -1.0710818767547607, "logps/chosen": -1.6715633869171143, "logps/rejected": -2.20039701461792, "loss": 1.4051, "rewards/accuracies": 0.8125, "rewards/chosen": -16.715633392333984, "rewards/margins": 5.288336753845215, "rewards/rejected": -22.003969192504883, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 85.09256280069626, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.015700340270996, "logits/rejected": -0.9886563420295715, "logps/chosen": -1.7295385599136353, "logps/rejected": -2.197303056716919, "loss": 1.7183, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -17.295387268066406, "rewards/margins": 4.677645206451416, "rewards/rejected": -21.973031997680664, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 98.45045929238997, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.1253305673599243, "logits/rejected": -1.078313946723938, "logps/chosen": -1.716840386390686, "logps/rejected": -2.1928813457489014, "loss": 1.4901, "rewards/accuracies": 0.78125, "rewards/chosen": -17.16840171813965, "rewards/margins": 4.760410785675049, "rewards/rejected": -21.928813934326172, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 141.52088188995467, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.1675808429718018, "logits/rejected": -1.1469465494155884, "logps/chosen": -1.7660911083221436, "logps/rejected": -2.279519557952881, "loss": 1.6871, "rewards/accuracies": 0.8125, "rewards/chosen": -17.660913467407227, "rewards/margins": 5.134285926818848, "rewards/rejected": -22.795196533203125, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 101.86538699204806, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.153141736984253, "logits/rejected": -1.097063660621643, "logps/chosen": -1.705733299255371, "logps/rejected": -2.247840166091919, "loss": 1.5097, "rewards/accuracies": 0.84375, "rewards/chosen": -17.05733299255371, "rewards/margins": 5.421066761016846, "rewards/rejected": -22.47840118408203, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 122.36169835495791, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.136850357055664, "logits/rejected": -1.1315498352050781, "logps/chosen": -1.7248958349227905, "logps/rejected": -2.382091760635376, "loss": 1.446, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -17.248958587646484, "rewards/margins": 6.571959018707275, "rewards/rejected": -23.8209171295166, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 100.7599593802445, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.158361792564392, "logits/rejected": -1.1068694591522217, "logps/chosen": -1.8045142889022827, "logps/rejected": -2.512817859649658, "loss": 1.4403, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -18.04514503479004, "rewards/margins": 7.083035469055176, "rewards/rejected": -25.128177642822266, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 82.7307588668697, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.0738600492477417, "logits/rejected": -1.030057430267334, "logps/chosen": -1.734301209449768, "logps/rejected": -2.244229793548584, "loss": 1.4411, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.3430118560791, "rewards/margins": 5.099286079406738, "rewards/rejected": -22.442298889160156, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 111.19834108528815, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.1105704307556152, "logits/rejected": -1.092313289642334, "logps/chosen": -1.7023674249649048, "logps/rejected": -2.2848927974700928, "loss": 1.3002, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.023672103881836, "rewards/margins": 5.82525634765625, "rewards/rejected": -22.84893226623535, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 124.14973601872444, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.1307401657104492, "logits/rejected": -1.0960733890533447, "logps/chosen": -1.639500617980957, "logps/rejected": -2.2025198936462402, "loss": 1.4763, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.395008087158203, "rewards/margins": 5.630189895629883, "rewards/rejected": -22.025196075439453, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 89.15665757381706, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.1599509716033936, "logits/rejected": -1.135851502418518, "logps/chosen": -1.72158682346344, "logps/rejected": -2.364271402359009, "loss": 1.5031, "rewards/accuracies": 0.875, "rewards/chosen": -17.215869903564453, "rewards/margins": 6.426844596862793, "rewards/rejected": -23.642711639404297, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 110.91770853185307, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.1471744775772095, "logits/rejected": -1.1494718790054321, "logps/chosen": -1.5995361804962158, "logps/rejected": -2.1568686962127686, "loss": 1.4483, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.995361328125, "rewards/margins": 5.57332706451416, "rewards/rejected": -21.568689346313477, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 92.35558706588404, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.0737619400024414, "logits/rejected": -1.0871598720550537, "logps/chosen": -1.583603858947754, "logps/rejected": -2.1480062007904053, "loss": 1.2137, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -15.836038589477539, "rewards/margins": 5.6440229415893555, "rewards/rejected": -21.480064392089844, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 97.35140511945166, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.1281821727752686, "logits/rejected": -1.1263208389282227, "logps/chosen": -1.5962927341461182, "logps/rejected": -2.2565903663635254, "loss": 1.4883, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -15.962926864624023, "rewards/margins": 6.602975368499756, "rewards/rejected": -22.565902709960938, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 97.26488316442018, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.1745531558990479, "logits/rejected": -1.1077674627304077, "logps/chosen": -1.6627562046051025, "logps/rejected": -2.321105480194092, "loss": 1.4521, "rewards/accuracies": 0.8125, "rewards/chosen": -16.627561569213867, "rewards/margins": 6.583495140075684, "rewards/rejected": -23.211057662963867, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 112.70224926269489, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.0878835916519165, "logits/rejected": -1.075674295425415, "logps/chosen": -1.6165920495986938, "logps/rejected": -2.117642402648926, "loss": 1.4406, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.165922164916992, "rewards/margins": 5.010504245758057, "rewards/rejected": -21.17642593383789, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 116.18618106964092, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.1332778930664062, "logits/rejected": -1.1369507312774658, "logps/chosen": -1.7694313526153564, "logps/rejected": -2.3713538646698, "loss": 1.4837, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -17.694313049316406, "rewards/margins": 6.019228935241699, "rewards/rejected": -23.71354103088379, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 133.07010667525282, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.1531364917755127, "logits/rejected": -1.1320288181304932, "logps/chosen": -1.6155163049697876, "logps/rejected": -2.160113573074341, "loss": 1.5728, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.155162811279297, "rewards/margins": 5.4459710121154785, "rewards/rejected": -21.60113525390625, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 92.80555931457836, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.14119553565979, "logits/rejected": -1.1190695762634277, "logps/chosen": -1.6841446161270142, "logps/rejected": -2.4013619422912598, "loss": 1.2863, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -16.841445922851562, "rewards/margins": 7.172172546386719, "rewards/rejected": -24.013620376586914, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 106.02842713910356, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.164880633354187, "logits/rejected": -1.1471444368362427, "logps/chosen": -1.7512538433074951, "logps/rejected": -2.3633463382720947, "loss": 1.4838, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -17.512537002563477, "rewards/margins": 6.120924949645996, "rewards/rejected": -23.633460998535156, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 91.21482574420814, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.158891201019287, "logits/rejected": -1.1362249851226807, "logps/chosen": -1.6644847393035889, "logps/rejected": -2.2324166297912598, "loss": 1.3638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.644847869873047, "rewards/margins": 5.679316997528076, "rewards/rejected": -22.32416534423828, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 67.57151749023619, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.1628615856170654, "logits/rejected": -1.1462781429290771, "logps/chosen": -1.6841766834259033, "logps/rejected": -2.238058567047119, "loss": 1.2555, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.841764450073242, "rewards/margins": 5.538818359375, "rewards/rejected": -22.380582809448242, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.3520146608352661, "eval_logits/rejected": -1.3613466024398804, "eval_logps/chosen": -1.698158621788025, "eval_logps/rejected": -2.272404670715332, "eval_loss": 1.3605413436889648, "eval_rewards/accuracies": 0.8455284833908081, "eval_rewards/chosen": -16.981586456298828, "eval_rewards/margins": 5.742460250854492, "eval_rewards/rejected": -22.724044799804688, "eval_runtime": 96.7859, "eval_samples_per_second": 20.261, "eval_steps_per_second": 1.271, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 113.07658741149837, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.1250704526901245, "logits/rejected": -1.142858624458313, "logps/chosen": -1.7644565105438232, "logps/rejected": -2.3112316131591797, "loss": 1.3959, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -17.64456558227539, "rewards/margins": 5.467750549316406, "rewards/rejected": -23.112316131591797, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 135.58367842014283, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.148863434791565, "logits/rejected": -1.1359000205993652, "logps/chosen": -1.7182337045669556, "logps/rejected": -2.268091917037964, "loss": 1.5344, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.182336807250977, "rewards/margins": 5.4985833168029785, "rewards/rejected": -22.680919647216797, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 114.17576942626454, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.1382702589035034, "logits/rejected": -1.0858891010284424, "logps/chosen": -1.606128454208374, "logps/rejected": -2.25667142868042, "loss": 1.6467, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.061288833618164, "rewards/margins": 6.505424499511719, "rewards/rejected": -22.56671142578125, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 101.01008878090249, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.1730471849441528, "logits/rejected": -1.1232213973999023, "logps/chosen": -1.5842628479003906, "logps/rejected": -2.213921308517456, "loss": 1.3771, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -15.842630386352539, "rewards/margins": 6.296584129333496, "rewards/rejected": -22.13921356201172, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 111.86270544120462, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.1721917390823364, "logits/rejected": -1.1620614528656006, "logps/chosen": -1.77499258518219, "logps/rejected": -2.305689573287964, "loss": 1.493, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -17.74992561340332, "rewards/margins": 5.306972980499268, "rewards/rejected": -23.05689811706543, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 86.63448539967071, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.1510651111602783, "logits/rejected": -1.1352717876434326, "logps/chosen": -1.7862894535064697, "logps/rejected": -2.381641149520874, "loss": 1.411, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -17.862895965576172, "rewards/margins": 5.953517436981201, "rewards/rejected": -23.816410064697266, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 102.66949123740247, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.1117022037506104, "logits/rejected": -1.0704118013381958, "logps/chosen": -1.6134551763534546, "logps/rejected": -2.211256742477417, "loss": 1.3969, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -16.134552001953125, "rewards/margins": 5.9780168533325195, "rewards/rejected": -22.112567901611328, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 112.7340895477117, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.1366580724716187, "logits/rejected": -1.0769071578979492, "logps/chosen": -1.7496669292449951, "logps/rejected": -2.3226873874664307, "loss": 1.3153, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.49666976928711, "rewards/margins": 5.7302045822143555, "rewards/rejected": -23.226871490478516, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 92.65086353146947, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.0847865343093872, "logits/rejected": -1.0684945583343506, "logps/chosen": -1.726458191871643, "logps/rejected": -2.406322956085205, "loss": 1.2646, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -17.26458168029785, "rewards/margins": 6.798647403717041, "rewards/rejected": -24.063228607177734, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 81.82402808973579, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.1987271308898926, "logits/rejected": -1.1772375106811523, "logps/chosen": -1.6759332418441772, "logps/rejected": -2.2297987937927246, "loss": 1.4218, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.759334564208984, "rewards/margins": 5.5386552810668945, "rewards/rejected": -22.297988891601562, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 114.73913428926, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.1624139547348022, "logits/rejected": -1.1274266242980957, "logps/chosen": -1.695810317993164, "logps/rejected": -2.3724701404571533, "loss": 1.1616, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -16.958105087280273, "rewards/margins": 6.766595363616943, "rewards/rejected": -23.724700927734375, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 105.12442076336163, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.1637624502182007, "logits/rejected": -1.1525938510894775, "logps/chosen": -1.7872707843780518, "logps/rejected": -2.419015407562256, "loss": 1.334, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -17.87270736694336, "rewards/margins": 6.317442893981934, "rewards/rejected": -24.19015121459961, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 103.69467200755768, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.1328219175338745, "logits/rejected": -1.139512300491333, "logps/chosen": -1.773741364479065, "logps/rejected": -2.4201297760009766, "loss": 1.4736, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -17.737415313720703, "rewards/margins": 6.4638848304748535, "rewards/rejected": -24.201297760009766, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.9719815800480975, "train_runtime": 11459.6716, "train_samples_per_second": 5.225, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }