{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6281078251766553, "eval_steps": 400, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010468463752944255, "grad_norm": 424.8911502777972, "learning_rate": 3.125e-08, "loss": 713.6646, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -3.0610547065734863, "rewards/margins": -0.43895024061203003, "rewards/rejected": -2.6221041679382324, "step": 5 }, { "epoch": 0.02093692750588851, "grad_norm": 403.8683081084132, "learning_rate": 6.25e-08, "loss": 717.3508, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.993378162384033, "rewards/margins": -0.3109555244445801, "rewards/rejected": -2.682422637939453, "step": 10 }, { "epoch": 0.031405391258832765, "grad_norm": 487.9238814591701, "learning_rate": 9.375e-08, "loss": 713.6135, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.5931520462036133, "rewards/margins": -0.21937386691570282, "rewards/rejected": -2.3737778663635254, "step": 15 }, { "epoch": 0.04187385501177702, "grad_norm": 540.6870493028796, "learning_rate": 1.25e-07, "loss": 712.8184, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -3.164547920227051, "rewards/margins": -0.7127091288566589, "rewards/rejected": -2.451838970184326, "step": 20 }, { "epoch": 0.05234231876472128, "grad_norm": 427.3180170525652, "learning_rate": 1.5625e-07, "loss": 707.0853, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.7615244388580322, "rewards/margins": -0.16162791848182678, "rewards/rejected": -2.599896192550659, "step": 25 }, { "epoch": 0.06281078251766553, "grad_norm": 383763.7480098094, "learning_rate": 1.875e-07, "loss": 715.4415, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -2.58443546295166, "rewards/margins": -0.2884238660335541, "rewards/rejected": -2.2960116863250732, "step": 30 }, { "epoch": 0.07327924627060979, "grad_norm": 439.43685355063843, "learning_rate": 2.1874999999999997e-07, "loss": 717.8594, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.9699530601501465, "rewards/margins": -0.2793353796005249, "rewards/rejected": -2.690617799758911, "step": 35 }, { "epoch": 0.08374771002355404, "grad_norm": 503.22488533065024, "learning_rate": 2.5e-07, "loss": 710.3533, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.8341801166534424, "rewards/margins": -0.11880241334438324, "rewards/rejected": -2.7153773307800293, "step": 40 }, { "epoch": 0.0942161737764983, "grad_norm": 1337.4413216082382, "learning_rate": 2.8125e-07, "loss": 711.881, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -2.8177173137664795, "rewards/margins": -0.19809791445732117, "rewards/rejected": -2.619619607925415, "step": 45 }, { "epoch": 0.10468463752944256, "grad_norm": 385.7756641011158, "learning_rate": 3.125e-07, "loss": 705.6052, "rewards/accuracies": 0.5625, "rewards/chosen": -2.5368785858154297, "rewards/margins": 0.30438369512557983, "rewards/rejected": -2.8412623405456543, "step": 50 }, { "epoch": 0.11515310128238682, "grad_norm": 422.3765258964566, "learning_rate": 3.4374999999999994e-07, "loss": 706.4469, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -3.026949882507324, "rewards/margins": -0.3166283369064331, "rewards/rejected": -2.7103216648101807, "step": 55 }, { "epoch": 0.12562156503533106, "grad_norm": 394.9174924028097, "learning_rate": 3.75e-07, "loss": 710.1363, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.814709424972534, "rewards/margins": -0.3439286947250366, "rewards/rejected": -2.470780849456787, "step": 60 }, { "epoch": 0.1360900287882753, "grad_norm": 590.9528736566529, "learning_rate": 4.0625000000000003e-07, "loss": 704.8263, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -3.221498489379883, "rewards/margins": -0.13808628916740417, "rewards/rejected": -3.083411931991577, "step": 65 }, { "epoch": 0.14655849254121958, "grad_norm": 666.685124573273, "learning_rate": 4.3749999999999994e-07, "loss": 709.7217, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.896270751953125, "rewards/margins": -0.23611800372600555, "rewards/rejected": -2.6601529121398926, "step": 70 }, { "epoch": 0.15702695629416383, "grad_norm": 422.81236685781573, "learning_rate": 4.6874999999999996e-07, "loss": 701.5896, "rewards/accuracies": 0.4375, "rewards/chosen": -3.1475062370300293, "rewards/margins": -0.23899349570274353, "rewards/rejected": -2.908513069152832, "step": 75 }, { "epoch": 0.16749542004710807, "grad_norm": 1161.6272059916828, "learning_rate": 5e-07, "loss": 712.8695, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.549727439880371, "rewards/margins": 0.05642218515276909, "rewards/rejected": -2.606149673461914, "step": 80 }, { "epoch": 0.17796388380005235, "grad_norm": 691.6756141822095, "learning_rate": 5.3125e-07, "loss": 715.2848, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.962017774581909, "rewards/margins": -0.18228396773338318, "rewards/rejected": -2.779733896255493, "step": 85 }, { "epoch": 0.1884323475529966, "grad_norm": 557.7156194405, "learning_rate": 5.625e-07, "loss": 710.1722, "rewards/accuracies": 0.4375, "rewards/chosen": -3.1114089488983154, "rewards/margins": -0.5688842535018921, "rewards/rejected": -2.542525053024292, "step": 90 }, { "epoch": 0.19890081130594087, "grad_norm": 643.081848366494, "learning_rate": 5.9375e-07, "loss": 708.6848, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.728463888168335, "rewards/margins": 0.1268891543149948, "rewards/rejected": -2.8553528785705566, "step": 95 }, { "epoch": 0.2093692750588851, "grad_norm": 1749.0480774010928, "learning_rate": 5.999678242522831e-07, "loss": 712.9789, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.0832152366638184, "rewards/margins": -0.22872868180274963, "rewards/rejected": -2.8544864654541016, "step": 100 }, { "epoch": 0.21983773881182936, "grad_norm": 10514.346037549345, "learning_rate": 5.998371221059621e-07, "loss": 697.8367, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.205540895462036, "rewards/margins": -0.16175726056098938, "rewards/rejected": -3.043783187866211, "step": 105 }, { "epoch": 0.23030620256477363, "grad_norm": 2830.1739417475483, "learning_rate": 5.996059263493219e-07, "loss": 714.0083, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -3.1492106914520264, "rewards/margins": -0.41294175386428833, "rewards/rejected": -2.736268997192383, "step": 110 }, { "epoch": 0.24077466631771788, "grad_norm": 3697.295888208749, "learning_rate": 5.992743144700869e-07, "loss": 703.9895, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -3.2083535194396973, "rewards/margins": -0.05745415762066841, "rewards/rejected": -3.1508989334106445, "step": 115 }, { "epoch": 0.2512431300706621, "grad_norm": 1170.631987747208, "learning_rate": 5.988423976115163e-07, "loss": 921.5164, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -3.09690523147583, "rewards/margins": -0.07702343910932541, "rewards/rejected": -3.0198817253112793, "step": 120 }, { "epoch": 0.26171159382360637, "grad_norm": 1391.9151015605798, "learning_rate": 5.983103205351532e-07, "loss": 704.9495, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.0641629695892334, "rewards/margins": 0.2125791758298874, "rewards/rejected": -3.2767422199249268, "step": 125 }, { "epoch": 0.2721800575765506, "grad_norm": 39002.795574769065, "learning_rate": 5.976782615723061e-07, "loss": 728.894, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.5640883445739746, "rewards/margins": -0.004037248902022839, "rewards/rejected": -3.5600509643554688, "step": 130 }, { "epoch": 0.2826485213294949, "grad_norm": 1164.3315552399881, "learning_rate": 5.969464325642798e-07, "loss": 700.7844, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.116656541824341, "rewards/margins": 0.2637160122394562, "rewards/rejected": -3.3803725242614746, "step": 135 }, { "epoch": 0.29311698508243916, "grad_norm": 2549527.6361433878, "learning_rate": 5.961150787913738e-07, "loss": 1252.7453, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.451526641845703, "rewards/margins": -0.17801007628440857, "rewards/rejected": -3.2735161781311035, "step": 140 }, { "epoch": 0.3035854488353834, "grad_norm": 2301.1157939792593, "learning_rate": 5.951844788906746e-07, "loss": 749.2581, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -3.379659652709961, "rewards/margins": -0.495597779750824, "rewards/rejected": -2.884061813354492, "step": 145 }, { "epoch": 0.31405391258832765, "grad_norm": 1288.9212020876917, "learning_rate": 5.941549447626671e-07, "loss": 22400296550.4, "rewards/accuracies": 0.53125, "rewards/chosen": -3.371587038040161, "rewards/margins": 0.21983376145362854, "rewards/rejected": -3.5914206504821777, "step": 150 }, { "epoch": 0.3245223763412719, "grad_norm": 4169.091186018576, "learning_rate": 5.930268214666979e-07, "loss": 689.9577, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -3.393592119216919, "rewards/margins": 0.27183833718299866, "rewards/rejected": -3.6654305458068848, "step": 155 }, { "epoch": 0.33499084009421615, "grad_norm": 40987.876210824266, "learning_rate": 5.918004871053251e-07, "loss": 699.906, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.6145877838134766, "rewards/margins": 0.1472555547952652, "rewards/rejected": -3.76184344291687, "step": 160 }, { "epoch": 0.34545930384716045, "grad_norm": 3446.7363874580406, "learning_rate": 5.904763526975934e-07, "loss": 700.4801, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.6712746620178223, "rewards/margins": 0.03212170675396919, "rewards/rejected": -3.7033963203430176, "step": 165 }, { "epoch": 0.3559277676001047, "grad_norm": 4383.523843958487, "learning_rate": 5.890548620412763e-07, "loss": 696.9372, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -4.037501335144043, "rewards/margins": -0.23374083638191223, "rewards/rejected": -3.803760528564453, "step": 170 }, { "epoch": 0.36639623135304894, "grad_norm": 1968.8221017002966, "learning_rate": 5.875364915641322e-07, "loss": 693.6001, "rewards/accuracies": 0.5625, "rewards/chosen": -4.061675071716309, "rewards/margins": 0.16252286732196808, "rewards/rejected": -4.224198818206787, "step": 175 }, { "epoch": 0.3768646951059932, "grad_norm": 4165.273001929711, "learning_rate": 5.859217501642258e-07, "loss": 689.4774, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.6556944847106934, "rewards/margins": 0.33984482288360596, "rewards/rejected": -3.995539903640747, "step": 180 }, { "epoch": 0.38733315885893743, "grad_norm": 7680.7561691485025, "learning_rate": 5.842111790393642e-07, "loss": 690.4501, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.226962566375732, "rewards/margins": 0.14984741806983948, "rewards/rejected": -4.376810073852539, "step": 185 }, { "epoch": 0.39780162261188173, "grad_norm": 3514.664070908699, "learning_rate": 5.824053515057091e-07, "loss": 693.3683, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.8957512378692627, "rewards/margins": 0.18127045035362244, "rewards/rejected": -4.077021598815918, "step": 190 }, { "epoch": 0.408270086364826, "grad_norm": 6684.674851679545, "learning_rate": 5.805048728056245e-07, "loss": 685.6387, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.011441707611084, "rewards/margins": 0.16933482885360718, "rewards/rejected": -4.180776596069336, "step": 195 }, { "epoch": 0.4187385501177702, "grad_norm": 2817.010612327531, "learning_rate": 5.785103799048218e-07, "loss": 691.3805, "rewards/accuracies": 0.53125, "rewards/chosen": -4.5704665184021, "rewards/margins": 0.05775844305753708, "rewards/rejected": -4.628224849700928, "step": 200 }, { "epoch": 0.42920701387071447, "grad_norm": 12460.132704854665, "learning_rate": 5.764225412788754e-07, "loss": 690.0626, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -4.749141693115234, "rewards/margins": 0.10000785440206528, "rewards/rejected": -4.849149703979492, "step": 205 }, { "epoch": 0.4396754776236587, "grad_norm": 3156.835029013167, "learning_rate": 5.742420566891749e-07, "loss": 679.2428, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.9751620292663574, "rewards/margins": 0.5410782098770142, "rewards/rejected": -4.516240119934082, "step": 210 }, { "epoch": 0.45014394137660296, "grad_norm": 4165.789445089526, "learning_rate": 5.719696569483936e-07, "loss": 679.8576, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.488650321960449, "rewards/margins": 0.339005708694458, "rewards/rejected": -4.827655792236328, "step": 215 }, { "epoch": 0.46061240512954726, "grad_norm": 7010.571587146665, "learning_rate": 5.696061036755478e-07, "loss": 685.5709, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.9668354988098145, "rewards/margins": 0.14022143185138702, "rewards/rejected": -5.107056617736816, "step": 220 }, { "epoch": 0.4710808688824915, "grad_norm": 4950.519059974548, "learning_rate": 5.671521890407327e-07, "loss": 680.7437, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.726534843444824, "rewards/margins": 0.5676447749137878, "rewards/rejected": -5.294179916381836, "step": 225 }, { "epoch": 0.48154933263543576, "grad_norm": 7887.8682965510425, "learning_rate": 5.64608735499618e-07, "loss": 674.3191, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.650925159454346, "rewards/margins": 0.5207107067108154, "rewards/rejected": -5.17163610458374, "step": 230 }, { "epoch": 0.49201779638838, "grad_norm": 5967.489228784308, "learning_rate": 5.619765955177932e-07, "loss": 680.9146, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.723302841186523, "rewards/margins": 0.5448298454284668, "rewards/rejected": -5.268132209777832, "step": 235 }, { "epoch": 0.5024862601413242, "grad_norm": 4067.749182919556, "learning_rate": 5.592566512850545e-07, "loss": 677.9534, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -5.117281436920166, "rewards/margins": 0.6115970611572266, "rewards/rejected": -5.728878974914551, "step": 240 }, { "epoch": 0.5129547238942685, "grad_norm": 3495.5145356721982, "learning_rate": 5.564498144197293e-07, "loss": 681.9477, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -5.144923210144043, "rewards/margins": 0.386813759803772, "rewards/rejected": -5.531736850738525, "step": 245 }, { "epoch": 0.5234231876472127, "grad_norm": 4370.4657370961, "learning_rate": 5.535570256631384e-07, "loss": 679.4021, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -5.269853591918945, "rewards/margins": 0.39420580863952637, "rewards/rejected": -5.664059162139893, "step": 250 }, { "epoch": 0.533891651400157, "grad_norm": 5546.329529459924, "learning_rate": 5.505792545642954e-07, "loss": 680.8774, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -5.6696882247924805, "rewards/margins": 0.1662217080593109, "rewards/rejected": -5.835909843444824, "step": 255 }, { "epoch": 0.5443601151531012, "grad_norm": 3396.1113411173433, "learning_rate": 5.475174991549528e-07, "loss": 680.5286, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -5.477304935455322, "rewards/margins": 0.6626185178756714, "rewards/rejected": -6.139924049377441, "step": 260 }, { "epoch": 0.5548285789060455, "grad_norm": 7509.706842299371, "learning_rate": 5.443727856151007e-07, "loss": 667.1712, "rewards/accuracies": 0.59375, "rewards/chosen": -5.695134162902832, "rewards/margins": 0.6776683330535889, "rewards/rejected": -6.372802734375, "step": 265 }, { "epoch": 0.5652970426589898, "grad_norm": 5167.959854781231, "learning_rate": 5.411461679290317e-07, "loss": 678.3353, "rewards/accuracies": 0.5625, "rewards/chosen": -5.676094055175781, "rewards/margins": 0.755618691444397, "rewards/rejected": -6.431711673736572, "step": 270 }, { "epoch": 0.575765506411934, "grad_norm": 3674.961462097515, "learning_rate": 5.378387275320869e-07, "loss": 666.944, "rewards/accuracies": 0.625, "rewards/chosen": -5.402568817138672, "rewards/margins": 0.7821658253669739, "rewards/rejected": -6.18473482131958, "step": 275 }, { "epoch": 0.5862339701648783, "grad_norm": 5634.831880478573, "learning_rate": 5.34451572948201e-07, "loss": 670.9914, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -6.517806053161621, "rewards/margins": 0.6073935627937317, "rewards/rejected": -7.125199794769287, "step": 280 }, { "epoch": 0.5967024339178225, "grad_norm": 10174.679987145297, "learning_rate": 5.309858394183691e-07, "loss": 674.4187, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -6.75530481338501, "rewards/margins": 0.6639969944953918, "rewards/rejected": -7.419301509857178, "step": 285 }, { "epoch": 0.6071708976707668, "grad_norm": 6705.843344302837, "learning_rate": 5.274426885201582e-07, "loss": 680.643, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.8337082862854, "rewards/margins": 0.32110291719436646, "rewards/rejected": -7.154810905456543, "step": 290 }, { "epoch": 0.6176393614237111, "grad_norm": 29305.105895087316, "learning_rate": 5.238233077783925e-07, "loss": 663.5017, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -5.573851585388184, "rewards/margins": 0.6479231715202332, "rewards/rejected": -6.221774578094482, "step": 295 }, { "epoch": 0.6281078251766553, "grad_norm": 4360.840909716472, "learning_rate": 5.201289102671411e-07, "loss": 673.6718, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -6.299983024597168, "rewards/margins": 0.7464480400085449, "rewards/rejected": -7.046431064605713, "step": 300 } ], "logging_steps": 5, "max_steps": 954, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }