{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9987217724755006, "eval_steps": 500, "global_step": 2346, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008521516829995739, "grad_norm": 0.6813573837280273, "learning_rate": 5e-05, "loss": 1.7024, "step": 10 }, { "epoch": 0.017043033659991477, "grad_norm": 0.6651060581207275, "learning_rate": 4.978595890410959e-05, "loss": 1.4561, "step": 20 }, { "epoch": 0.02556455048998722, "grad_norm": 1.1946938037872314, "learning_rate": 4.957191780821918e-05, "loss": 1.4456, "step": 30 }, { "epoch": 0.034086067319982954, "grad_norm": 1.7951576709747314, "learning_rate": 4.9357876712328774e-05, "loss": 1.7177, "step": 40 }, { "epoch": 0.04260758414997869, "grad_norm": 2.754934549331665, "learning_rate": 4.914383561643836e-05, "loss": 1.9612, "step": 50 }, { "epoch": 0.05112910097997444, "grad_norm": 0.9254423975944519, "learning_rate": 4.892979452054795e-05, "loss": 1.3975, "step": 60 }, { "epoch": 0.05965061780997018, "grad_norm": 0.9828574657440186, "learning_rate": 4.871575342465753e-05, "loss": 1.2984, "step": 70 }, { "epoch": 0.06817213463996591, "grad_norm": 1.291460394859314, "learning_rate": 4.850171232876712e-05, "loss": 1.5636, "step": 80 }, { "epoch": 0.07669365146996165, "grad_norm": 2.218918800354004, "learning_rate": 4.8287671232876716e-05, "loss": 1.765, "step": 90 }, { "epoch": 0.08521516829995739, "grad_norm": 2.730905771255493, "learning_rate": 4.8073630136986304e-05, "loss": 1.3425, "step": 100 }, { "epoch": 0.09373668512995313, "grad_norm": 0.9995436668395996, "learning_rate": 4.785958904109589e-05, "loss": 1.2667, "step": 110 }, { "epoch": 0.10225820195994888, "grad_norm": 0.877986490726471, "learning_rate": 4.764554794520548e-05, "loss": 1.2679, "step": 120 }, { "epoch": 0.11077971878994461, "grad_norm": 1.7854726314544678, "learning_rate": 4.743150684931507e-05, "loss": 1.6378, "step": 130 }, { "epoch": 0.11930123561994035, "grad_norm": 2.3612940311431885, "learning_rate": 4.7217465753424664e-05, "loss": 1.7922, "step": 140 }, { "epoch": 0.1278227524499361, "grad_norm": 2.323775053024292, "learning_rate": 4.700342465753425e-05, "loss": 1.2947, "step": 150 }, { "epoch": 0.13634426927993182, "grad_norm": 1.2070249319076538, "learning_rate": 4.678938356164384e-05, "loss": 1.269, "step": 160 }, { "epoch": 0.14486578610992756, "grad_norm": 1.1411925554275513, "learning_rate": 4.657534246575342e-05, "loss": 1.3021, "step": 170 }, { "epoch": 0.1533873029399233, "grad_norm": 1.3789585828781128, "learning_rate": 4.636130136986302e-05, "loss": 1.3572, "step": 180 }, { "epoch": 0.16190881976991905, "grad_norm": 2.145219326019287, "learning_rate": 4.6147260273972605e-05, "loss": 1.6292, "step": 190 }, { "epoch": 0.17043033659991477, "grad_norm": 3.3627982139587402, "learning_rate": 4.5933219178082194e-05, "loss": 1.4358, "step": 200 }, { "epoch": 0.17895185342991052, "grad_norm": 1.116105318069458, "learning_rate": 4.571917808219178e-05, "loss": 1.2568, "step": 210 }, { "epoch": 0.18747337025990626, "grad_norm": 1.2009385824203491, "learning_rate": 4.550513698630137e-05, "loss": 1.1746, "step": 220 }, { "epoch": 0.195994887089902, "grad_norm": 2.0525238513946533, "learning_rate": 4.529109589041096e-05, "loss": 1.5463, "step": 230 }, { "epoch": 0.20451640391989775, "grad_norm": 2.649242639541626, "learning_rate": 4.5077054794520553e-05, "loss": 1.7533, "step": 240 }, { "epoch": 0.21303792074989347, "grad_norm": 3.1068289279937744, "learning_rate": 4.486301369863014e-05, "loss": 1.3994, "step": 250 }, { "epoch": 0.22155943757988922, "grad_norm": 1.2716392278671265, "learning_rate": 4.464897260273973e-05, "loss": 1.2382, "step": 260 }, { "epoch": 0.23008095440988496, "grad_norm": 1.20390784740448, "learning_rate": 4.443493150684932e-05, "loss": 1.1434, "step": 270 }, { "epoch": 0.2386024712398807, "grad_norm": 1.7929986715316772, "learning_rate": 4.422089041095891e-05, "loss": 1.5121, "step": 280 }, { "epoch": 0.24712398806987643, "grad_norm": 2.645399570465088, "learning_rate": 4.4006849315068495e-05, "loss": 1.4523, "step": 290 }, { "epoch": 0.2556455048998722, "grad_norm": 3.460209846496582, "learning_rate": 4.379280821917808e-05, "loss": 1.3166, "step": 300 }, { "epoch": 0.2641670217298679, "grad_norm": 1.2120425701141357, "learning_rate": 4.357876712328767e-05, "loss": 1.2265, "step": 310 }, { "epoch": 0.27268853855986364, "grad_norm": 1.2804960012435913, "learning_rate": 4.336472602739726e-05, "loss": 1.15, "step": 320 }, { "epoch": 0.2812100553898594, "grad_norm": 2.300981044769287, "learning_rate": 4.3150684931506855e-05, "loss": 1.5719, "step": 330 }, { "epoch": 0.2897315722198551, "grad_norm": 2.3201756477355957, "learning_rate": 4.293664383561644e-05, "loss": 1.5963, "step": 340 }, { "epoch": 0.2982530890498509, "grad_norm": 3.4875683784484863, "learning_rate": 4.272260273972603e-05, "loss": 1.1126, "step": 350 }, { "epoch": 0.3067746058798466, "grad_norm": 1.2687772512435913, "learning_rate": 4.250856164383562e-05, "loss": 1.1099, "step": 360 }, { "epoch": 0.31529612270984236, "grad_norm": 1.367256760597229, "learning_rate": 4.229452054794521e-05, "loss": 1.1648, "step": 370 }, { "epoch": 0.3238176395398381, "grad_norm": 1.5560370683670044, "learning_rate": 4.2080479452054796e-05, "loss": 1.3107, "step": 380 }, { "epoch": 0.33233915636983385, "grad_norm": 2.6812076568603516, "learning_rate": 4.1866438356164385e-05, "loss": 1.4966, "step": 390 }, { "epoch": 0.34086067319982954, "grad_norm": 4.098489284515381, "learning_rate": 4.165239726027397e-05, "loss": 1.2831, "step": 400 }, { "epoch": 0.3493821900298253, "grad_norm": 1.2586500644683838, "learning_rate": 4.143835616438356e-05, "loss": 1.152, "step": 410 }, { "epoch": 0.35790370685982104, "grad_norm": 1.3851690292358398, "learning_rate": 4.122431506849315e-05, "loss": 1.2035, "step": 420 }, { "epoch": 0.3664252236898168, "grad_norm": 1.9357365369796753, "learning_rate": 4.1010273972602745e-05, "loss": 1.4145, "step": 430 }, { "epoch": 0.3749467405198125, "grad_norm": 2.84523344039917, "learning_rate": 4.079623287671233e-05, "loss": 1.5173, "step": 440 }, { "epoch": 0.3834682573498083, "grad_norm": 2.4506237506866455, "learning_rate": 4.058219178082192e-05, "loss": 1.1763, "step": 450 }, { "epoch": 0.391989774179804, "grad_norm": 1.1524243354797363, "learning_rate": 4.036815068493151e-05, "loss": 1.316, "step": 460 }, { "epoch": 0.40051129100979976, "grad_norm": 1.289616346359253, "learning_rate": 4.01541095890411e-05, "loss": 1.102, "step": 470 }, { "epoch": 0.4090328078397955, "grad_norm": 1.482630968093872, "learning_rate": 3.9940068493150686e-05, "loss": 1.3212, "step": 480 }, { "epoch": 0.4175543246697912, "grad_norm": 2.3927276134490967, "learning_rate": 3.9726027397260274e-05, "loss": 1.6163, "step": 490 }, { "epoch": 0.42607584149978694, "grad_norm": 3.26775860786438, "learning_rate": 3.951198630136986e-05, "loss": 1.1924, "step": 500 }, { "epoch": 0.4345973583297827, "grad_norm": 1.2278869152069092, "learning_rate": 3.929794520547945e-05, "loss": 1.1865, "step": 510 }, { "epoch": 0.44311887515977844, "grad_norm": 1.3176724910736084, "learning_rate": 3.908390410958904e-05, "loss": 1.2366, "step": 520 }, { "epoch": 0.4516403919897742, "grad_norm": 4.769649505615234, "learning_rate": 3.8869863013698634e-05, "loss": 1.4883, "step": 530 }, { "epoch": 0.4601619088197699, "grad_norm": 3.0388917922973633, "learning_rate": 3.865582191780822e-05, "loss": 1.4851, "step": 540 }, { "epoch": 0.4686834256497657, "grad_norm": 3.100656270980835, "learning_rate": 3.844178082191781e-05, "loss": 1.0437, "step": 550 }, { "epoch": 0.4772049424797614, "grad_norm": 3.1867423057556152, "learning_rate": 3.82277397260274e-05, "loss": 1.1181, "step": 560 }, { "epoch": 0.48572645930975716, "grad_norm": 1.2426332235336304, "learning_rate": 3.801369863013699e-05, "loss": 1.1024, "step": 570 }, { "epoch": 0.49424797613975285, "grad_norm": 1.2760354280471802, "learning_rate": 3.779965753424658e-05, "loss": 1.1686, "step": 580 }, { "epoch": 0.5027694929697486, "grad_norm": 2.6802961826324463, "learning_rate": 3.7585616438356164e-05, "loss": 1.4582, "step": 590 }, { "epoch": 0.5112910097997444, "grad_norm": 3.5295393466949463, "learning_rate": 3.737157534246575e-05, "loss": 1.1565, "step": 600 }, { "epoch": 0.5198125266297401, "grad_norm": 1.3644486665725708, "learning_rate": 3.715753424657534e-05, "loss": 1.0841, "step": 610 }, { "epoch": 0.5283340434597358, "grad_norm": 1.268506407737732, "learning_rate": 3.6943493150684936e-05, "loss": 1.0859, "step": 620 }, { "epoch": 0.5368555602897316, "grad_norm": 2.1879653930664062, "learning_rate": 3.6729452054794524e-05, "loss": 1.415, "step": 630 }, { "epoch": 0.5453770771197273, "grad_norm": 2.482619047164917, "learning_rate": 3.651541095890411e-05, "loss": 1.5743, "step": 640 }, { "epoch": 0.5538985939497231, "grad_norm": 3.3282759189605713, "learning_rate": 3.63013698630137e-05, "loss": 1.203, "step": 650 }, { "epoch": 0.5624201107797188, "grad_norm": 1.288682460784912, "learning_rate": 3.608732876712329e-05, "loss": 1.1415, "step": 660 }, { "epoch": 0.5709416276097146, "grad_norm": 1.232367753982544, "learning_rate": 3.587328767123288e-05, "loss": 1.0992, "step": 670 }, { "epoch": 0.5794631444397103, "grad_norm": 2.4475719928741455, "learning_rate": 3.565924657534247e-05, "loss": 1.6888, "step": 680 }, { "epoch": 0.587984661269706, "grad_norm": 2.75734543800354, "learning_rate": 3.5445205479452054e-05, "loss": 1.479, "step": 690 }, { "epoch": 0.5965061780997017, "grad_norm": 3.1947202682495117, "learning_rate": 3.523116438356164e-05, "loss": 1.0327, "step": 700 }, { "epoch": 0.6050276949296974, "grad_norm": 1.2636553049087524, "learning_rate": 3.501712328767123e-05, "loss": 1.0756, "step": 710 }, { "epoch": 0.6135492117596932, "grad_norm": 1.2693008184432983, "learning_rate": 3.4803082191780825e-05, "loss": 1.2125, "step": 720 }, { "epoch": 0.6220707285896889, "grad_norm": 2.964484691619873, "learning_rate": 3.4589041095890414e-05, "loss": 1.3954, "step": 730 }, { "epoch": 0.6305922454196847, "grad_norm": 2.6809933185577393, "learning_rate": 3.4375e-05, "loss": 1.3599, "step": 740 }, { "epoch": 0.6391137622496804, "grad_norm": 3.059370994567871, "learning_rate": 3.416095890410959e-05, "loss": 1.2158, "step": 750 }, { "epoch": 0.6476352790796762, "grad_norm": 1.3807661533355713, "learning_rate": 3.394691780821918e-05, "loss": 1.0973, "step": 760 }, { "epoch": 0.6561567959096719, "grad_norm": 1.525608777999878, "learning_rate": 3.373287671232877e-05, "loss": 1.1659, "step": 770 }, { "epoch": 0.6646783127396677, "grad_norm": 2.267289161682129, "learning_rate": 3.351883561643836e-05, "loss": 1.6013, "step": 780 }, { "epoch": 0.6731998295696634, "grad_norm": 2.4192111492156982, "learning_rate": 3.330479452054795e-05, "loss": 1.5129, "step": 790 }, { "epoch": 0.6817213463996591, "grad_norm": 4.045669078826904, "learning_rate": 3.309075342465753e-05, "loss": 1.1527, "step": 800 }, { "epoch": 0.6902428632296549, "grad_norm": 1.4586423635482788, "learning_rate": 3.287671232876712e-05, "loss": 1.0746, "step": 810 }, { "epoch": 0.6987643800596506, "grad_norm": 1.3621727228164673, "learning_rate": 3.2662671232876715e-05, "loss": 1.0083, "step": 820 }, { "epoch": 0.7072858968896464, "grad_norm": 2.002037763595581, "learning_rate": 3.2448630136986303e-05, "loss": 1.4296, "step": 830 }, { "epoch": 0.7158074137196421, "grad_norm": 24.786273956298828, "learning_rate": 3.223458904109589e-05, "loss": 1.5759, "step": 840 }, { "epoch": 0.7243289305496379, "grad_norm": 3.176041841506958, "learning_rate": 3.202054794520548e-05, "loss": 1.34, "step": 850 }, { "epoch": 0.7328504473796336, "grad_norm": 1.4174227714538574, "learning_rate": 3.180650684931507e-05, "loss": 1.0625, "step": 860 }, { "epoch": 0.7413719642096294, "grad_norm": 1.2621195316314697, "learning_rate": 3.1592465753424663e-05, "loss": 1.0369, "step": 870 }, { "epoch": 0.749893481039625, "grad_norm": 2.297351360321045, "learning_rate": 3.137842465753425e-05, "loss": 1.4817, "step": 880 }, { "epoch": 0.7584149978696207, "grad_norm": 2.9399614334106445, "learning_rate": 3.116438356164384e-05, "loss": 1.4127, "step": 890 }, { "epoch": 0.7669365146996165, "grad_norm": 3.5914995670318604, "learning_rate": 3.095034246575342e-05, "loss": 1.0546, "step": 900 }, { "epoch": 0.7754580315296122, "grad_norm": 1.2652535438537598, "learning_rate": 3.073630136986301e-05, "loss": 1.1101, "step": 910 }, { "epoch": 0.783979548359608, "grad_norm": 1.644640564918518, "learning_rate": 3.0522260273972605e-05, "loss": 1.1791, "step": 920 }, { "epoch": 0.7925010651896037, "grad_norm": 2.482304096221924, "learning_rate": 3.0308219178082193e-05, "loss": 1.512, "step": 930 }, { "epoch": 0.8010225820195995, "grad_norm": 2.763471841812134, "learning_rate": 3.009417808219178e-05, "loss": 1.477, "step": 940 }, { "epoch": 0.8095440988495952, "grad_norm": 3.675570011138916, "learning_rate": 2.988013698630137e-05, "loss": 1.1064, "step": 950 }, { "epoch": 0.818065615679591, "grad_norm": 1.302946925163269, "learning_rate": 2.966609589041096e-05, "loss": 1.0189, "step": 960 }, { "epoch": 0.8265871325095867, "grad_norm": 1.492253303527832, "learning_rate": 2.945205479452055e-05, "loss": 1.0817, "step": 970 }, { "epoch": 0.8351086493395824, "grad_norm": 3.072866439819336, "learning_rate": 2.923801369863014e-05, "loss": 1.4999, "step": 980 }, { "epoch": 0.8436301661695782, "grad_norm": 2.7219836711883545, "learning_rate": 2.902397260273973e-05, "loss": 1.4183, "step": 990 }, { "epoch": 0.8521516829995739, "grad_norm": 3.425384044647217, "learning_rate": 2.8809931506849318e-05, "loss": 1.0067, "step": 1000 }, { "epoch": 0.8606731998295697, "grad_norm": 1.6666113138198853, "learning_rate": 2.8595890410958903e-05, "loss": 1.1709, "step": 1010 }, { "epoch": 0.8691947166595654, "grad_norm": 1.3914527893066406, "learning_rate": 2.838184931506849e-05, "loss": 1.0248, "step": 1020 }, { "epoch": 0.8777162334895612, "grad_norm": 2.082874298095703, "learning_rate": 2.8167808219178083e-05, "loss": 1.2266, "step": 1030 }, { "epoch": 0.8862377503195569, "grad_norm": 2.5057151317596436, "learning_rate": 2.795376712328767e-05, "loss": 1.4517, "step": 1040 }, { "epoch": 0.8947592671495527, "grad_norm": 3.3986401557922363, "learning_rate": 2.7739726027397263e-05, "loss": 1.1483, "step": 1050 }, { "epoch": 0.9032807839795484, "grad_norm": 1.3847638368606567, "learning_rate": 2.752568493150685e-05, "loss": 1.0509, "step": 1060 }, { "epoch": 0.911802300809544, "grad_norm": 1.515759825706482, "learning_rate": 2.731164383561644e-05, "loss": 1.1062, "step": 1070 }, { "epoch": 0.9203238176395399, "grad_norm": 1.9089744091033936, "learning_rate": 2.709760273972603e-05, "loss": 1.2536, "step": 1080 }, { "epoch": 0.9288453344695355, "grad_norm": 3.3122973442077637, "learning_rate": 2.688356164383562e-05, "loss": 1.6613, "step": 1090 }, { "epoch": 0.9373668512995313, "grad_norm": 4.078606128692627, "learning_rate": 2.6669520547945208e-05, "loss": 1.1644, "step": 1100 }, { "epoch": 0.945888368129527, "grad_norm": 1.3247941732406616, "learning_rate": 2.6455479452054793e-05, "loss": 0.999, "step": 1110 }, { "epoch": 0.9544098849595228, "grad_norm": 1.617491364479065, "learning_rate": 2.6241438356164384e-05, "loss": 1.0226, "step": 1120 }, { "epoch": 0.9629314017895185, "grad_norm": 2.2223851680755615, "learning_rate": 2.6027397260273973e-05, "loss": 1.5095, "step": 1130 }, { "epoch": 0.9714529186195143, "grad_norm": 2.7325618267059326, "learning_rate": 2.581335616438356e-05, "loss": 1.5794, "step": 1140 }, { "epoch": 0.97997443544951, "grad_norm": 3.270749568939209, "learning_rate": 2.5599315068493153e-05, "loss": 1.2044, "step": 1150 }, { "epoch": 0.9884959522795057, "grad_norm": 1.6444586515426636, "learning_rate": 2.538527397260274e-05, "loss": 1.136, "step": 1160 }, { "epoch": 0.9970174691095015, "grad_norm": 3.6478381156921387, "learning_rate": 2.517123287671233e-05, "loss": 1.3373, "step": 1170 }, { "epoch": 1.0051129100979974, "grad_norm": 1.398633599281311, "learning_rate": 2.495719178082192e-05, "loss": 0.9056, "step": 1180 }, { "epoch": 1.013634426927993, "grad_norm": 1.3859210014343262, "learning_rate": 2.4743150684931506e-05, "loss": 0.9212, "step": 1190 }, { "epoch": 1.022155943757989, "grad_norm": 2.0104875564575195, "learning_rate": 2.4529109589041097e-05, "loss": 1.0748, "step": 1200 }, { "epoch": 1.0306774605879847, "grad_norm": 3.378269672393799, "learning_rate": 2.4315068493150686e-05, "loss": 1.1882, "step": 1210 }, { "epoch": 1.0391989774179804, "grad_norm": 3.0901708602905273, "learning_rate": 2.4101027397260274e-05, "loss": 0.8874, "step": 1220 }, { "epoch": 1.047720494247976, "grad_norm": 1.6910735368728638, "learning_rate": 2.3886986301369866e-05, "loss": 0.888, "step": 1230 }, { "epoch": 1.0562420110779718, "grad_norm": 1.5407036542892456, "learning_rate": 2.367294520547945e-05, "loss": 0.9438, "step": 1240 }, { "epoch": 1.0647635279079677, "grad_norm": 1.9005461931228638, "learning_rate": 2.3458904109589042e-05, "loss": 0.9209, "step": 1250 }, { "epoch": 1.0732850447379634, "grad_norm": 2.394400119781494, "learning_rate": 2.324486301369863e-05, "loss": 1.2207, "step": 1260 }, { "epoch": 1.081806561567959, "grad_norm": 2.9217231273651123, "learning_rate": 2.3030821917808222e-05, "loss": 1.0554, "step": 1270 }, { "epoch": 1.0903280783979548, "grad_norm": 1.6549851894378662, "learning_rate": 2.281678082191781e-05, "loss": 0.9341, "step": 1280 }, { "epoch": 1.0988495952279507, "grad_norm": 1.7708053588867188, "learning_rate": 2.2602739726027396e-05, "loss": 0.9768, "step": 1290 }, { "epoch": 1.1073711120579464, "grad_norm": 1.953326940536499, "learning_rate": 2.2388698630136987e-05, "loss": 0.9628, "step": 1300 }, { "epoch": 1.115892628887942, "grad_norm": 3.4446678161621094, "learning_rate": 2.2174657534246575e-05, "loss": 1.2085, "step": 1310 }, { "epoch": 1.1244141457179377, "grad_norm": 3.5323126316070557, "learning_rate": 2.1960616438356167e-05, "loss": 1.124, "step": 1320 }, { "epoch": 1.1329356625479337, "grad_norm": 1.861324429512024, "learning_rate": 2.1746575342465755e-05, "loss": 0.8991, "step": 1330 }, { "epoch": 1.1414571793779293, "grad_norm": 1.6703088283538818, "learning_rate": 2.1532534246575344e-05, "loss": 0.9776, "step": 1340 }, { "epoch": 1.149978696207925, "grad_norm": 1.834876537322998, "learning_rate": 2.1318493150684932e-05, "loss": 0.9773, "step": 1350 }, { "epoch": 1.1585002130379207, "grad_norm": 3.600705862045288, "learning_rate": 2.110445205479452e-05, "loss": 1.2392, "step": 1360 }, { "epoch": 1.1670217298679164, "grad_norm": 3.9731180667877197, "learning_rate": 2.0890410958904112e-05, "loss": 0.9947, "step": 1370 }, { "epoch": 1.1755432466979123, "grad_norm": 2.0116519927978516, "learning_rate": 2.06763698630137e-05, "loss": 0.9025, "step": 1380 }, { "epoch": 1.184064763527908, "grad_norm": 1.8098217248916626, "learning_rate": 2.046232876712329e-05, "loss": 0.8956, "step": 1390 }, { "epoch": 1.1925862803579037, "grad_norm": 2.4669318199157715, "learning_rate": 2.0248287671232877e-05, "loss": 1.0979, "step": 1400 }, { "epoch": 1.2011077971878994, "grad_norm": 3.1824028491973877, "learning_rate": 2.0034246575342465e-05, "loss": 1.2918, "step": 1410 }, { "epoch": 1.2096293140178953, "grad_norm": 3.60017991065979, "learning_rate": 1.9820205479452057e-05, "loss": 0.9279, "step": 1420 }, { "epoch": 1.218150830847891, "grad_norm": 6.929809093475342, "learning_rate": 1.9606164383561645e-05, "loss": 0.8606, "step": 1430 }, { "epoch": 1.2266723476778867, "grad_norm": 2.342393636703491, "learning_rate": 1.9392123287671233e-05, "loss": 0.963, "step": 1440 }, { "epoch": 1.2351938645078824, "grad_norm": 2.6814820766448975, "learning_rate": 1.9178082191780822e-05, "loss": 1.0806, "step": 1450 }, { "epoch": 1.243715381337878, "grad_norm": 4.10792350769043, "learning_rate": 1.896404109589041e-05, "loss": 1.1623, "step": 1460 }, { "epoch": 1.2522368981678738, "grad_norm": 3.0713534355163574, "learning_rate": 1.8750000000000002e-05, "loss": 0.8909, "step": 1470 }, { "epoch": 1.2607584149978697, "grad_norm": 1.9831465482711792, "learning_rate": 1.853595890410959e-05, "loss": 0.8631, "step": 1480 }, { "epoch": 1.2692799318278654, "grad_norm": 1.8906506299972534, "learning_rate": 1.832191780821918e-05, "loss": 0.9484, "step": 1490 }, { "epoch": 1.277801448657861, "grad_norm": 2.1991984844207764, "learning_rate": 1.8107876712328767e-05, "loss": 1.0584, "step": 1500 }, { "epoch": 1.286322965487857, "grad_norm": 3.851630449295044, "learning_rate": 1.7893835616438355e-05, "loss": 1.3179, "step": 1510 }, { "epoch": 1.2948444823178527, "grad_norm": 4.507850646972656, "learning_rate": 1.7679794520547947e-05, "loss": 1.0782, "step": 1520 }, { "epoch": 1.3033659991478483, "grad_norm": 1.7349963188171387, "learning_rate": 1.7465753424657535e-05, "loss": 0.8217, "step": 1530 }, { "epoch": 1.311887515977844, "grad_norm": 1.8865768909454346, "learning_rate": 1.7251712328767127e-05, "loss": 0.9012, "step": 1540 }, { "epoch": 1.3204090328078397, "grad_norm": 1.9257206916809082, "learning_rate": 1.703767123287671e-05, "loss": 0.8876, "step": 1550 }, { "epoch": 1.3289305496378354, "grad_norm": 3.2287306785583496, "learning_rate": 1.68236301369863e-05, "loss": 1.2605, "step": 1560 }, { "epoch": 1.3374520664678313, "grad_norm": 3.4038004875183105, "learning_rate": 1.660958904109589e-05, "loss": 0.9881, "step": 1570 }, { "epoch": 1.345973583297827, "grad_norm": 1.9077774286270142, "learning_rate": 1.639554794520548e-05, "loss": 0.913, "step": 1580 }, { "epoch": 1.3544951001278227, "grad_norm": 2.548600196838379, "learning_rate": 1.618150684931507e-05, "loss": 0.9404, "step": 1590 }, { "epoch": 1.3630166169578186, "grad_norm": 2.9931490421295166, "learning_rate": 1.596746575342466e-05, "loss": 1.0795, "step": 1600 }, { "epoch": 1.3715381337878143, "grad_norm": 4.5465216636657715, "learning_rate": 1.5753424657534248e-05, "loss": 1.2289, "step": 1610 }, { "epoch": 1.38005965061781, "grad_norm": 4.3074631690979, "learning_rate": 1.5539383561643836e-05, "loss": 0.8165, "step": 1620 }, { "epoch": 1.3885811674478057, "grad_norm": 1.9459553956985474, "learning_rate": 1.5325342465753425e-05, "loss": 0.8087, "step": 1630 }, { "epoch": 1.3971026842778014, "grad_norm": 1.8323142528533936, "learning_rate": 1.5111301369863015e-05, "loss": 0.8715, "step": 1640 }, { "epoch": 1.405624201107797, "grad_norm": 2.3462753295898438, "learning_rate": 1.4897260273972605e-05, "loss": 0.8655, "step": 1650 }, { "epoch": 1.414145717937793, "grad_norm": 3.0974302291870117, "learning_rate": 1.4683219178082191e-05, "loss": 1.1704, "step": 1660 }, { "epoch": 1.4226672347677887, "grad_norm": 3.1823673248291016, "learning_rate": 1.4469178082191781e-05, "loss": 0.9444, "step": 1670 }, { "epoch": 1.4311887515977844, "grad_norm": 2.3283894062042236, "learning_rate": 1.4255136986301371e-05, "loss": 0.8847, "step": 1680 }, { "epoch": 1.4397102684277803, "grad_norm": 1.8633939027786255, "learning_rate": 1.404109589041096e-05, "loss": 0.8843, "step": 1690 }, { "epoch": 1.448231785257776, "grad_norm": 2.870725393295288, "learning_rate": 1.382705479452055e-05, "loss": 0.9102, "step": 1700 }, { "epoch": 1.4567533020877717, "grad_norm": 3.6491854190826416, "learning_rate": 1.3613013698630136e-05, "loss": 1.1743, "step": 1710 }, { "epoch": 1.4652748189177673, "grad_norm": 4.350872039794922, "learning_rate": 1.3398972602739726e-05, "loss": 0.9763, "step": 1720 }, { "epoch": 1.473796335747763, "grad_norm": 2.080191135406494, "learning_rate": 1.3184931506849316e-05, "loss": 0.8528, "step": 1730 }, { "epoch": 1.4823178525777587, "grad_norm": 2.2544877529144287, "learning_rate": 1.2970890410958906e-05, "loss": 0.8987, "step": 1740 }, { "epoch": 1.4908393694077546, "grad_norm": 2.3497424125671387, "learning_rate": 1.2756849315068494e-05, "loss": 1.0171, "step": 1750 }, { "epoch": 1.4993608862377503, "grad_norm": 3.6866824626922607, "learning_rate": 1.2542808219178081e-05, "loss": 1.1774, "step": 1760 }, { "epoch": 1.507882403067746, "grad_norm": 4.5758891105651855, "learning_rate": 1.2328767123287671e-05, "loss": 0.9464, "step": 1770 }, { "epoch": 1.516403919897742, "grad_norm": 2.159677743911743, "learning_rate": 1.2114726027397261e-05, "loss": 0.8089, "step": 1780 }, { "epoch": 1.5249254367277376, "grad_norm": 2.345613479614258, "learning_rate": 1.1900684931506851e-05, "loss": 0.9699, "step": 1790 }, { "epoch": 1.5334469535577333, "grad_norm": 2.187382936477661, "learning_rate": 1.168664383561644e-05, "loss": 1.0533, "step": 1800 }, { "epoch": 1.541968470387729, "grad_norm": 6.11014986038208, "learning_rate": 1.1472602739726027e-05, "loss": 1.3963, "step": 1810 }, { "epoch": 1.5504899872177247, "grad_norm": 4.699113368988037, "learning_rate": 1.1258561643835617e-05, "loss": 0.8968, "step": 1820 }, { "epoch": 1.5590115040477204, "grad_norm": 1.8584600687026978, "learning_rate": 1.1044520547945206e-05, "loss": 0.8661, "step": 1830 }, { "epoch": 1.5675330208777163, "grad_norm": 2.1716010570526123, "learning_rate": 1.0830479452054796e-05, "loss": 0.9446, "step": 1840 }, { "epoch": 1.576054537707712, "grad_norm": 2.649498701095581, "learning_rate": 1.0616438356164384e-05, "loss": 0.9392, "step": 1850 }, { "epoch": 1.5845760545377077, "grad_norm": 5.051261901855469, "learning_rate": 1.0402397260273972e-05, "loss": 1.4026, "step": 1860 }, { "epoch": 1.5930975713677036, "grad_norm": 4.0868425369262695, "learning_rate": 1.0188356164383562e-05, "loss": 1.0371, "step": 1870 }, { "epoch": 1.6016190881976993, "grad_norm": 2.242595672607422, "learning_rate": 9.97431506849315e-06, "loss": 0.7793, "step": 1880 }, { "epoch": 1.610140605027695, "grad_norm": 2.0395429134368896, "learning_rate": 9.76027397260274e-06, "loss": 0.9659, "step": 1890 }, { "epoch": 1.6186621218576907, "grad_norm": 3.6967060565948486, "learning_rate": 9.54623287671233e-06, "loss": 1.3609, "step": 1900 }, { "epoch": 1.6271836386876863, "grad_norm": 3.8495938777923584, "learning_rate": 9.332191780821919e-06, "loss": 0.9838, "step": 1910 }, { "epoch": 1.635705155517682, "grad_norm": 4.630374431610107, "learning_rate": 9.118150684931507e-06, "loss": 0.9471, "step": 1920 }, { "epoch": 1.6442266723476777, "grad_norm": 2.2313318252563477, "learning_rate": 8.904109589041095e-06, "loss": 0.8377, "step": 1930 }, { "epoch": 1.6527481891776736, "grad_norm": 2.623538017272949, "learning_rate": 8.690068493150685e-06, "loss": 0.9168, "step": 1940 }, { "epoch": 1.6612697060076693, "grad_norm": 2.4369919300079346, "learning_rate": 8.476027397260275e-06, "loss": 1.1145, "step": 1950 }, { "epoch": 1.6697912228376652, "grad_norm": 3.5387771129608154, "learning_rate": 8.261986301369864e-06, "loss": 1.2709, "step": 1960 }, { "epoch": 1.678312739667661, "grad_norm": 3.519103527069092, "learning_rate": 8.047945205479452e-06, "loss": 0.8001, "step": 1970 }, { "epoch": 1.6868342564976566, "grad_norm": 2.0191633701324463, "learning_rate": 7.83390410958904e-06, "loss": 0.9001, "step": 1980 }, { "epoch": 1.6953557733276523, "grad_norm": 2.1604294776916504, "learning_rate": 7.61986301369863e-06, "loss": 0.8404, "step": 1990 }, { "epoch": 1.703877290157648, "grad_norm": 2.4887239933013916, "learning_rate": 7.40582191780822e-06, "loss": 1.0109, "step": 2000 }, { "epoch": 1.7123988069876437, "grad_norm": 3.2208356857299805, "learning_rate": 7.191780821917809e-06, "loss": 1.135, "step": 2010 }, { "epoch": 1.7209203238176394, "grad_norm": 3.4077229499816895, "learning_rate": 6.977739726027398e-06, "loss": 1.0031, "step": 2020 }, { "epoch": 1.7294418406476353, "grad_norm": 1.9562416076660156, "learning_rate": 6.763698630136987e-06, "loss": 0.8598, "step": 2030 }, { "epoch": 1.737963357477631, "grad_norm": 2.2411820888519287, "learning_rate": 6.549657534246575e-06, "loss": 0.8489, "step": 2040 }, { "epoch": 1.7464848743076269, "grad_norm": 3.026580333709717, "learning_rate": 6.335616438356165e-06, "loss": 1.0326, "step": 2050 }, { "epoch": 1.7550063911376226, "grad_norm": 4.257705211639404, "learning_rate": 6.121575342465754e-06, "loss": 1.2792, "step": 2060 }, { "epoch": 1.7635279079676183, "grad_norm": 2.8991031646728516, "learning_rate": 5.907534246575343e-06, "loss": 0.9655, "step": 2070 }, { "epoch": 1.772049424797614, "grad_norm": 2.1200027465820312, "learning_rate": 5.693493150684932e-06, "loss": 0.9003, "step": 2080 }, { "epoch": 1.7805709416276096, "grad_norm": 2.8092291355133057, "learning_rate": 5.479452054794521e-06, "loss": 0.9486, "step": 2090 }, { "epoch": 1.7890924584576053, "grad_norm": 2.481590747833252, "learning_rate": 5.26541095890411e-06, "loss": 0.8927, "step": 2100 }, { "epoch": 1.797613975287601, "grad_norm": 4.080635070800781, "learning_rate": 5.051369863013699e-06, "loss": 1.0988, "step": 2110 }, { "epoch": 1.806135492117597, "grad_norm": 5.39976692199707, "learning_rate": 4.8373287671232874e-06, "loss": 0.923, "step": 2120 }, { "epoch": 1.8146570089475926, "grad_norm": 2.044147253036499, "learning_rate": 4.623287671232877e-06, "loss": 0.914, "step": 2130 }, { "epoch": 1.8231785257775885, "grad_norm": 2.0422821044921875, "learning_rate": 4.4092465753424666e-06, "loss": 0.8518, "step": 2140 }, { "epoch": 1.8317000426075842, "grad_norm": 2.9914565086364746, "learning_rate": 4.195205479452055e-06, "loss": 1.1411, "step": 2150 }, { "epoch": 1.84022155943758, "grad_norm": 3.7009713649749756, "learning_rate": 3.981164383561644e-06, "loss": 1.1454, "step": 2160 }, { "epoch": 1.8487430762675756, "grad_norm": 5.263967514038086, "learning_rate": 3.7671232876712327e-06, "loss": 0.8432, "step": 2170 }, { "epoch": 1.8572645930975713, "grad_norm": 2.337414026260376, "learning_rate": 3.5530821917808223e-06, "loss": 0.8604, "step": 2180 }, { "epoch": 1.865786109927567, "grad_norm": 2.2294089794158936, "learning_rate": 3.3390410958904114e-06, "loss": 0.933, "step": 2190 }, { "epoch": 1.8743076267575627, "grad_norm": 2.332831621170044, "learning_rate": 3.125e-06, "loss": 0.9133, "step": 2200 }, { "epoch": 1.8828291435875586, "grad_norm": 3.756347179412842, "learning_rate": 2.910958904109589e-06, "loss": 1.3043, "step": 2210 }, { "epoch": 1.8913506604175543, "grad_norm": 4.380275249481201, "learning_rate": 2.6969178082191784e-06, "loss": 0.9552, "step": 2220 }, { "epoch": 1.8998721772475502, "grad_norm": 2.2383973598480225, "learning_rate": 2.482876712328767e-06, "loss": 0.8783, "step": 2230 }, { "epoch": 1.9083936940775459, "grad_norm": 2.0851542949676514, "learning_rate": 2.2688356164383563e-06, "loss": 0.8612, "step": 2240 }, { "epoch": 1.9169152109075416, "grad_norm": 2.3459975719451904, "learning_rate": 2.054794520547945e-06, "loss": 0.9008, "step": 2250 }, { "epoch": 1.9254367277375373, "grad_norm": 4.169739246368408, "learning_rate": 1.8407534246575344e-06, "loss": 1.2114, "step": 2260 }, { "epoch": 1.933958244567533, "grad_norm": 4.250594615936279, "learning_rate": 1.6267123287671233e-06, "loss": 1.0262, "step": 2270 }, { "epoch": 1.9424797613975286, "grad_norm": 2.187901020050049, "learning_rate": 1.4126712328767122e-06, "loss": 0.8497, "step": 2280 }, { "epoch": 1.9510012782275243, "grad_norm": 2.139758348464966, "learning_rate": 1.1986301369863014e-06, "loss": 0.8855, "step": 2290 }, { "epoch": 1.9595227950575203, "grad_norm": 3.2545394897460938, "learning_rate": 9.845890410958905e-07, "loss": 1.0541, "step": 2300 }, { "epoch": 1.968044311887516, "grad_norm": 4.898044586181641, "learning_rate": 7.705479452054794e-07, "loss": 1.1848, "step": 2310 }, { "epoch": 1.9765658287175119, "grad_norm": 5.535640716552734, "learning_rate": 5.565068493150685e-07, "loss": 0.8679, "step": 2320 }, { "epoch": 1.9850873455475075, "grad_norm": 1.8863285779953003, "learning_rate": 3.4246575342465755e-07, "loss": 0.7762, "step": 2330 }, { "epoch": 1.9936088623775032, "grad_norm": 4.670785427093506, "learning_rate": 1.2842465753424656e-07, "loss": 0.9951, "step": 2340 } ], "logging_steps": 10, "max_steps": 2346, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4355734448273408e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }