|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 5238, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0057273768613974796, |
|
"grad_norm": 1.1893006680700424, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7164, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.011454753722794959, |
|
"grad_norm": 1.04026748119074, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6034, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01718213058419244, |
|
"grad_norm": 0.947476798322524, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5621, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.022909507445589918, |
|
"grad_norm": 0.8781018637437837, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5341, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0286368843069874, |
|
"grad_norm": 0.9580875402301294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.526, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03436426116838488, |
|
"grad_norm": 0.8781914311552192, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5078, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04009163802978236, |
|
"grad_norm": 0.671566687192882, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5056, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.045819014891179836, |
|
"grad_norm": 0.6015476511437352, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05154639175257732, |
|
"grad_norm": 0.6233828429107251, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5062, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0572737686139748, |
|
"grad_norm": 0.5502218388168, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4995, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06300114547537228, |
|
"grad_norm": 0.5727700188508118, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5018, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06872852233676977, |
|
"grad_norm": 0.5825399902463676, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4897, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07445589919816724, |
|
"grad_norm": 0.5794095605103436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4921, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08018327605956473, |
|
"grad_norm": 0.5905275386374104, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4872, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0859106529209622, |
|
"grad_norm": 0.5323846340490436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4771, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09163802978235967, |
|
"grad_norm": 0.5827507453646251, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4923, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09736540664375716, |
|
"grad_norm": 0.6052222320099511, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4889, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10309278350515463, |
|
"grad_norm": 0.5461900186435344, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4906, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.10882016036655212, |
|
"grad_norm": 0.5446312759280234, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4792, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1145475372279496, |
|
"grad_norm": 0.5282278298362526, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4749, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12027491408934708, |
|
"grad_norm": 0.5564949237418149, |
|
"learning_rate": 5e-06, |
|
"loss": 0.476, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12600229095074456, |
|
"grad_norm": 0.5511775080408522, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4795, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13172966781214204, |
|
"grad_norm": 0.5328672926489202, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4768, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13745704467353953, |
|
"grad_norm": 0.5278974935096795, |
|
"learning_rate": 5e-06, |
|
"loss": 0.471, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.143184421534937, |
|
"grad_norm": 0.5354440170377812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4713, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.14891179839633448, |
|
"grad_norm": 0.5438446163693844, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4696, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15463917525773196, |
|
"grad_norm": 0.5496174748297591, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4655, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16036655211912945, |
|
"grad_norm": 0.5926866056184342, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4747, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1660939289805269, |
|
"grad_norm": 0.5141247582508096, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4648, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1718213058419244, |
|
"grad_norm": 0.5536073278932672, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4659, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1775486827033219, |
|
"grad_norm": 0.5234564239929281, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4581, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.18327605956471935, |
|
"grad_norm": 0.5219512867012481, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4577, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.18900343642611683, |
|
"grad_norm": 0.5474679242801981, |
|
"learning_rate": 5e-06, |
|
"loss": 0.463, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.19473081328751432, |
|
"grad_norm": 0.516308993975195, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4606, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2004581901489118, |
|
"grad_norm": 0.5401317271229032, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4722, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.5234223381679171, |
|
"learning_rate": 5e-06, |
|
"loss": 0.461, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.21191294387170675, |
|
"grad_norm": 0.5475015406952914, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4527, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.21764032073310424, |
|
"grad_norm": 0.5376680018928803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4618, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.22336769759450173, |
|
"grad_norm": 0.5410529332865766, |
|
"learning_rate": 5e-06, |
|
"loss": 0.454, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2290950744558992, |
|
"grad_norm": 0.5652526379991017, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4656, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.23482245131729668, |
|
"grad_norm": 0.5555688465804705, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4594, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.24054982817869416, |
|
"grad_norm": 0.5701732999281934, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4662, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.24627720504009165, |
|
"grad_norm": 0.5512026939695381, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4683, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2520045819014891, |
|
"grad_norm": 0.553814123316092, |
|
"learning_rate": 5e-06, |
|
"loss": 0.463, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.25773195876288657, |
|
"grad_norm": 0.5200195587347526, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4609, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2634593356242841, |
|
"grad_norm": 0.550688588791354, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4457, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.26918671248568155, |
|
"grad_norm": 0.5863844516267137, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4632, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.27491408934707906, |
|
"grad_norm": 0.5729329474663841, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4434, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.2806414662084765, |
|
"grad_norm": 0.5523269014403068, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4433, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.286368843069874, |
|
"grad_norm": 0.5116583829791455, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4562, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2920962199312715, |
|
"grad_norm": 0.5787963201989073, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4654, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.29782359679266895, |
|
"grad_norm": 0.5447902232207782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4298, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3035509736540664, |
|
"grad_norm": 0.5423741210811989, |
|
"learning_rate": 5e-06, |
|
"loss": 0.463, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.30927835051546393, |
|
"grad_norm": 0.5232608979728346, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4534, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3150057273768614, |
|
"grad_norm": 0.5745290999609551, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4514, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3207331042382589, |
|
"grad_norm": 0.5974645168072564, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4606, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.32646048109965636, |
|
"grad_norm": 0.5487734496746278, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4607, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3321878579610538, |
|
"grad_norm": 0.5441562004543612, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4507, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.33791523482245134, |
|
"grad_norm": 0.5573672899202607, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4441, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.3436426116838488, |
|
"grad_norm": 0.5308370142895599, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4518, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.34936998854524626, |
|
"grad_norm": 0.5117603002146386, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4583, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3550973654066438, |
|
"grad_norm": 0.5574954390885013, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4603, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.36082474226804123, |
|
"grad_norm": 0.5606001506682383, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4514, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3665521191294387, |
|
"grad_norm": 0.5569042632837081, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4537, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3722794959908362, |
|
"grad_norm": 0.5585555273085546, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4626, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.37800687285223367, |
|
"grad_norm": 0.5029368495012907, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4438, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3837342497136312, |
|
"grad_norm": 0.5901403440941212, |
|
"learning_rate": 5e-06, |
|
"loss": 0.456, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.38946162657502864, |
|
"grad_norm": 0.5195926126583429, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4445, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3951890034364261, |
|
"grad_norm": 0.5225881930388002, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4439, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4009163802978236, |
|
"grad_norm": 0.5279076911326633, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4518, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4066437571592211, |
|
"grad_norm": 0.5178025129238295, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4444, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 0.5391197058842394, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4417, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.41809851088201605, |
|
"grad_norm": 0.5693350216341284, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4531, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.4238258877434135, |
|
"grad_norm": 0.5272168668986746, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4507, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.42955326460481097, |
|
"grad_norm": 0.5601282100265634, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4395, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4352806414662085, |
|
"grad_norm": 0.5283833355102217, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4392, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.44100801832760594, |
|
"grad_norm": 0.5348435308559183, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4571, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.44673539518900346, |
|
"grad_norm": 0.5431552168715813, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4532, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4524627720504009, |
|
"grad_norm": 0.545120281271903, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4534, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4581901489117984, |
|
"grad_norm": 0.5387987858480611, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4536, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4639175257731959, |
|
"grad_norm": 0.5430847301042485, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4469, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.46964490263459335, |
|
"grad_norm": 0.5311236600330211, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4428, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4753722794959908, |
|
"grad_norm": 0.5560451338696027, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4502, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.48109965635738833, |
|
"grad_norm": 0.5470174916746855, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4482, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.4868270332187858, |
|
"grad_norm": 0.563263919253685, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4446, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4925544100801833, |
|
"grad_norm": 0.573113515782645, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4508, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.49828178694158076, |
|
"grad_norm": 0.5231438549151345, |
|
"learning_rate": 5e-06, |
|
"loss": 0.45, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5040091638029782, |
|
"grad_norm": 0.5176148074624103, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4348, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5097365406643757, |
|
"grad_norm": 0.5309016268362335, |
|
"learning_rate": 5e-06, |
|
"loss": 0.443, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.5154639175257731, |
|
"grad_norm": 0.5021579023128154, |
|
"learning_rate": 5e-06, |
|
"loss": 0.448, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5211912943871707, |
|
"grad_norm": 0.559779127609092, |
|
"learning_rate": 5e-06, |
|
"loss": 0.444, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5269186712485682, |
|
"grad_norm": 0.5789927290923392, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4493, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5326460481099656, |
|
"grad_norm": 0.5690148664602116, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4541, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5383734249713631, |
|
"grad_norm": 0.5018200538904565, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4437, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5441008018327605, |
|
"grad_norm": 0.5626094146626068, |
|
"learning_rate": 5e-06, |
|
"loss": 0.438, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5498281786941581, |
|
"grad_norm": 0.5453326758465812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4382, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 0.5731487204099626, |
|
"learning_rate": 5e-06, |
|
"loss": 0.443, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.561282932416953, |
|
"grad_norm": 0.5469415847599124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4541, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5670103092783505, |
|
"grad_norm": 0.5216017929376741, |
|
"learning_rate": 5e-06, |
|
"loss": 0.447, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.572737686139748, |
|
"grad_norm": 0.5633524684965274, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4371, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5784650630011455, |
|
"grad_norm": 0.5293660880153032, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4383, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.584192439862543, |
|
"grad_norm": 0.5399877214818077, |
|
"learning_rate": 5e-06, |
|
"loss": 0.456, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5899198167239404, |
|
"grad_norm": 0.5249271107308078, |
|
"learning_rate": 5e-06, |
|
"loss": 0.459, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5956471935853379, |
|
"grad_norm": 0.5341902718265518, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4379, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.6013745704467354, |
|
"grad_norm": 0.5681490785135558, |
|
"learning_rate": 5e-06, |
|
"loss": 0.445, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6071019473081328, |
|
"grad_norm": 0.5894093830458618, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4353, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.6128293241695304, |
|
"grad_norm": 0.5221738673506199, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4412, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"grad_norm": 0.5797474198144454, |
|
"learning_rate": 5e-06, |
|
"loss": 0.443, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.6242840778923253, |
|
"grad_norm": 0.5717734893270968, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4435, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.6300114547537228, |
|
"grad_norm": 0.4981452424201392, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4294, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6357388316151202, |
|
"grad_norm": 0.5341189709831258, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4443, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6414662084765178, |
|
"grad_norm": 0.5337916570686927, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4388, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6471935853379153, |
|
"grad_norm": 0.5191012616320769, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4278, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6529209621993127, |
|
"grad_norm": 0.5532648229910325, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4445, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.6586483390607102, |
|
"grad_norm": 0.5376131317705378, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4443, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6643757159221076, |
|
"grad_norm": 0.5163417459667795, |
|
"learning_rate": 5e-06, |
|
"loss": 0.449, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6701030927835051, |
|
"grad_norm": 0.5750744468716225, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4424, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6758304696449027, |
|
"grad_norm": 0.5691426147058902, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4409, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.6815578465063001, |
|
"grad_norm": 0.572169838625891, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4408, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.6872852233676976, |
|
"grad_norm": 0.562085920947936, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4539, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.693012600229095, |
|
"grad_norm": 0.549680800307771, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4408, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.6987399770904925, |
|
"grad_norm": 0.5268961102121411, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4318, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.7044673539518901, |
|
"grad_norm": 0.546084380380316, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4274, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.7101947308132875, |
|
"grad_norm": 0.5789125996714688, |
|
"learning_rate": 5e-06, |
|
"loss": 0.426, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.715922107674685, |
|
"grad_norm": 0.5552812309484606, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4433, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.7216494845360825, |
|
"grad_norm": 0.5490841525640856, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4347, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.7273768613974799, |
|
"grad_norm": 0.5287976784408353, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4404, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.7331042382588774, |
|
"grad_norm": 0.5303023254262377, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4195, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.738831615120275, |
|
"grad_norm": 0.5615661953108583, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4399, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.7445589919816724, |
|
"grad_norm": 0.5392244469382096, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4416, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.7502863688430699, |
|
"grad_norm": 0.5631926544247716, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4458, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.7560137457044673, |
|
"grad_norm": 0.5321981965200037, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4425, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.7617411225658648, |
|
"grad_norm": 0.5472839772157344, |
|
"learning_rate": 5e-06, |
|
"loss": 0.438, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.7674684994272624, |
|
"grad_norm": 0.5148846097662794, |
|
"learning_rate": 5e-06, |
|
"loss": 0.441, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.7731958762886598, |
|
"grad_norm": 0.5814136125437906, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4422, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7789232531500573, |
|
"grad_norm": 0.4865290153302818, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4195, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7846506300114547, |
|
"grad_norm": 0.5729362641564091, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4252, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.7903780068728522, |
|
"grad_norm": 0.513721699658904, |
|
"learning_rate": 5e-06, |
|
"loss": 0.428, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.7961053837342497, |
|
"grad_norm": 0.5404966717471676, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4321, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.8018327605956472, |
|
"grad_norm": 0.5705888817719007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4292, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8075601374570447, |
|
"grad_norm": 0.5433900974318504, |
|
"learning_rate": 5e-06, |
|
"loss": 0.436, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.8132875143184422, |
|
"grad_norm": 0.5233288735400002, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4347, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.8190148911798396, |
|
"grad_norm": 0.5315754725482877, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4343, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"grad_norm": 0.540898996728165, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4385, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.8304696449026346, |
|
"grad_norm": 0.4918837151161211, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4187, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.8361970217640321, |
|
"grad_norm": 0.5167559153442192, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4354, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.8419243986254296, |
|
"grad_norm": 0.5388428750370872, |
|
"learning_rate": 5e-06, |
|
"loss": 0.442, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.847651775486827, |
|
"grad_norm": 0.5478670963710769, |
|
"learning_rate": 5e-06, |
|
"loss": 0.432, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.8533791523482245, |
|
"grad_norm": 0.5551238913425003, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4349, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.8591065292096219, |
|
"grad_norm": 0.5096809451223056, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4337, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8648339060710195, |
|
"grad_norm": 0.5485551648513652, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4304, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.870561282932417, |
|
"grad_norm": 0.5439471352377651, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4291, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.8762886597938144, |
|
"grad_norm": 0.5541220552945509, |
|
"learning_rate": 5e-06, |
|
"loss": 0.435, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.8820160366552119, |
|
"grad_norm": 0.5549824280277013, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4455, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.8877434135166093, |
|
"grad_norm": 0.5160198263139852, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4246, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.8934707903780069, |
|
"grad_norm": 0.5789101227791154, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4459, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.8991981672394044, |
|
"grad_norm": 0.5619060606532335, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4227, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.9049255441008018, |
|
"grad_norm": 0.5131288466424669, |
|
"learning_rate": 5e-06, |
|
"loss": 0.425, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.9106529209621993, |
|
"grad_norm": 0.5378774111576103, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4339, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.9163802978235968, |
|
"grad_norm": 0.5496777521069192, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4351, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.9221076746849943, |
|
"grad_norm": 0.5090132737788183, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4439, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.9278350515463918, |
|
"grad_norm": 0.5537237167714347, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4315, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.9335624284077892, |
|
"grad_norm": 0.5083065954607388, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4209, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.9392898052691867, |
|
"grad_norm": 0.5504634632628532, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4255, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.9450171821305842, |
|
"grad_norm": 0.497466661980522, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4359, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.9507445589919816, |
|
"grad_norm": 0.5728710662746097, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4298, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.9564719358533792, |
|
"grad_norm": 0.5382940747387163, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4321, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.9621993127147767, |
|
"grad_norm": 0.5602645110237611, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4376, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.9679266895761741, |
|
"grad_norm": 0.500936047855792, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4138, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.9736540664375716, |
|
"grad_norm": 0.5586135425680717, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4348, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.979381443298969, |
|
"grad_norm": 0.5493075698232323, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4235, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.9851088201603666, |
|
"grad_norm": 0.5618879613810053, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4331, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.9908361970217641, |
|
"grad_norm": 0.5518636911221633, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4324, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.9965635738831615, |
|
"grad_norm": 0.540041363098471, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4292, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.4308567941188812, |
|
"eval_runtime": 442.3894, |
|
"eval_samples_per_second": 26.587, |
|
"eval_steps_per_second": 0.416, |
|
"step": 1746 |
|
}, |
|
{ |
|
"epoch": 1.002290950744559, |
|
"grad_norm": 0.5315755979624516, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4118, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.0080183276059564, |
|
"grad_norm": 0.5480247806796007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3744, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.013745704467354, |
|
"grad_norm": 0.5149756078279293, |
|
"learning_rate": 5e-06, |
|
"loss": 0.378, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.0194730813287514, |
|
"grad_norm": 0.5171540890283103, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3702, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.0252004581901488, |
|
"grad_norm": 0.5125282145044338, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3711, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.0309278350515463, |
|
"grad_norm": 0.5207422414484574, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3665, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.036655211912944, |
|
"grad_norm": 0.5786010896530626, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3787, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.0423825887743414, |
|
"grad_norm": 0.5464322830590085, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3701, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.0481099656357389, |
|
"grad_norm": 0.5205995452066101, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3775, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.0538373424971363, |
|
"grad_norm": 0.519243616807458, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3796, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.0595647193585338, |
|
"grad_norm": 0.516879079058521, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3682, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.0652920962199313, |
|
"grad_norm": 0.5222769824122077, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3727, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.0710194730813287, |
|
"grad_norm": 0.5445361815315836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3808, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.0767468499427262, |
|
"grad_norm": 0.5036623685893024, |
|
"learning_rate": 5e-06, |
|
"loss": 0.372, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.0824742268041236, |
|
"grad_norm": 0.5348337621107334, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3732, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.088201603665521, |
|
"grad_norm": 0.5060065384189831, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3846, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.0939289805269188, |
|
"grad_norm": 0.5079630313899987, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3734, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.0996563573883162, |
|
"grad_norm": 0.5280819989664591, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3795, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.1053837342497137, |
|
"grad_norm": 0.5170704766163384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3813, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.490210122517552, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3675, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.1168384879725086, |
|
"grad_norm": 0.5132872988639963, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3725, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.122565864833906, |
|
"grad_norm": 0.5345634201153158, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3717, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.1282932416953035, |
|
"grad_norm": 0.509614500177815, |
|
"learning_rate": 5e-06, |
|
"loss": 0.376, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.134020618556701, |
|
"grad_norm": 0.533789127642796, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3813, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.1397479954180985, |
|
"grad_norm": 0.5430074485285918, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3836, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.145475372279496, |
|
"grad_norm": 0.5313196763751518, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3733, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.1512027491408934, |
|
"grad_norm": 0.5044508200904723, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3783, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.1569301260022908, |
|
"grad_norm": 0.5686318202507047, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3752, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.1626575028636885, |
|
"grad_norm": 0.5467468838281937, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3762, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.168384879725086, |
|
"grad_norm": 0.5149539692191945, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3743, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.1741122565864834, |
|
"grad_norm": 0.5312244954262079, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3746, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.179839633447881, |
|
"grad_norm": 0.5219006331104148, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3723, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.1855670103092784, |
|
"grad_norm": 0.538164681785682, |
|
"learning_rate": 5e-06, |
|
"loss": 0.375, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.1912943871706758, |
|
"grad_norm": 0.5113415155626444, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3808, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.1970217640320733, |
|
"grad_norm": 0.5321490376681038, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3699, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.2027491408934707, |
|
"grad_norm": 0.5024285063567219, |
|
"learning_rate": 5e-06, |
|
"loss": 0.37, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.2084765177548682, |
|
"grad_norm": 0.500252002027109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3782, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.2142038946162657, |
|
"grad_norm": 0.4924458421349769, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3743, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.2199312714776633, |
|
"grad_norm": 0.4994293087338354, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3758, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.2256586483390608, |
|
"grad_norm": 0.5610101096098137, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3805, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.2313860252004583, |
|
"grad_norm": 0.537084984018735, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3803, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.2371134020618557, |
|
"grad_norm": 0.5371742151025276, |
|
"learning_rate": 5e-06, |
|
"loss": 0.38, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.2428407789232532, |
|
"grad_norm": 0.5349028198444167, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3808, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.2485681557846506, |
|
"grad_norm": 0.5168637085023853, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3762, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.254295532646048, |
|
"grad_norm": 0.521040039917101, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3724, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.2600229095074456, |
|
"grad_norm": 0.547586207928514, |
|
"learning_rate": 5e-06, |
|
"loss": 0.379, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.265750286368843, |
|
"grad_norm": 0.5217616276100534, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3801, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.2714776632302405, |
|
"grad_norm": 0.5119693142628771, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3717, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.277205040091638, |
|
"grad_norm": 0.49171715528514415, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3733, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.2829324169530354, |
|
"grad_norm": 0.5583188472803077, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3848, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.2886597938144329, |
|
"grad_norm": 0.5384351613915103, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3708, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.2943871706758305, |
|
"grad_norm": 0.5283801778798911, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3737, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.300114547537228, |
|
"grad_norm": 0.516459149172914, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3778, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.3058419243986255, |
|
"grad_norm": 0.5064553031768843, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3803, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.311569301260023, |
|
"grad_norm": 0.4992554472760309, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3644, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.3172966781214204, |
|
"grad_norm": 0.5351102245997288, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3754, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.3230240549828178, |
|
"grad_norm": 0.5744380836159825, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3773, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.3287514318442153, |
|
"grad_norm": 0.5390997556454836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3702, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.3344788087056128, |
|
"grad_norm": 0.5159422766312324, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3662, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.3402061855670104, |
|
"grad_norm": 0.5122893955215309, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3662, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.345933562428408, |
|
"grad_norm": 0.5435516572685336, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3692, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.3516609392898054, |
|
"grad_norm": 0.506519754552173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3775, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.3573883161512028, |
|
"grad_norm": 0.5382970585180483, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3744, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.3631156930126003, |
|
"grad_norm": 0.5098385678364493, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3817, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.3688430698739977, |
|
"grad_norm": 0.554626835720726, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3846, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.3745704467353952, |
|
"grad_norm": 0.5148678581013812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3673, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.3802978235967927, |
|
"grad_norm": 0.5160690729715556, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3733, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.38602520045819, |
|
"grad_norm": 0.49317947843655263, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3676, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.3917525773195876, |
|
"grad_norm": 0.491680558889149, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3726, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.397479954180985, |
|
"grad_norm": 0.5159563830253588, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3722, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.4032073310423825, |
|
"grad_norm": 0.5153913293053918, |
|
"learning_rate": 5e-06, |
|
"loss": 0.378, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.40893470790378, |
|
"grad_norm": 0.5264135232150384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3788, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.4146620847651776, |
|
"grad_norm": 0.5382150315757761, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3728, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.420389461626575, |
|
"grad_norm": 0.5310778600445981, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3725, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.4261168384879725, |
|
"grad_norm": 0.5190694903204963, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3708, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.43184421534937, |
|
"grad_norm": 0.5074266048391012, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3816, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.4375715922107675, |
|
"grad_norm": 0.527668984455016, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3692, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.443298969072165, |
|
"grad_norm": 0.5676081369199447, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3726, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.4490263459335624, |
|
"grad_norm": 0.734177855082109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3626, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.4547537227949598, |
|
"grad_norm": 0.5303161196964384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3736, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.4604810996563573, |
|
"grad_norm": 0.4844944790072633, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3717, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.466208476517755, |
|
"grad_norm": 0.5315356528221638, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3739, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.4719358533791524, |
|
"grad_norm": 0.4772067027942158, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3812, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.47766323024055, |
|
"grad_norm": 0.4939040642118836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3672, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.4833906071019474, |
|
"grad_norm": 0.508946651609973, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3746, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.4891179839633448, |
|
"grad_norm": 0.5298078020553034, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3733, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.4948453608247423, |
|
"grad_norm": 0.5432646094312185, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3758, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.5005727376861397, |
|
"grad_norm": 0.5105782450661664, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3805, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.5063001145475372, |
|
"grad_norm": 0.5382093019088371, |
|
"learning_rate": 5e-06, |
|
"loss": 0.381, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.5120274914089347, |
|
"grad_norm": 0.5204647981548546, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3787, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.5177548682703321, |
|
"grad_norm": 0.5407064147138322, |
|
"learning_rate": 5e-06, |
|
"loss": 0.379, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.5234822451317296, |
|
"grad_norm": 0.5213399797673937, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3815, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.529209621993127, |
|
"grad_norm": 0.5213816003555798, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3881, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.5349369988545245, |
|
"grad_norm": 0.5142228705336312, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3776, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.540664375715922, |
|
"grad_norm": 0.520697436811985, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3806, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.5463917525773194, |
|
"grad_norm": 0.5637817682476384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3715, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.552119129438717, |
|
"grad_norm": 0.4915517576152862, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3727, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.5578465063001146, |
|
"grad_norm": 0.5237728548563729, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3709, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.563573883161512, |
|
"grad_norm": 0.7190217505740812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3803, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.5693012600229095, |
|
"grad_norm": 0.5209643317738942, |
|
"learning_rate": 5e-06, |
|
"loss": 0.371, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.575028636884307, |
|
"grad_norm": 0.5247668495585847, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3808, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.5807560137457046, |
|
"grad_norm": 0.532703132334951, |
|
"learning_rate": 5e-06, |
|
"loss": 0.378, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.586483390607102, |
|
"grad_norm": 0.5117006044420915, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3695, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.5922107674684995, |
|
"grad_norm": 0.5272216030753158, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3666, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.597938144329897, |
|
"grad_norm": 0.5392842250351713, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3783, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.6036655211912945, |
|
"grad_norm": 0.5211103103323186, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3809, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.609392898052692, |
|
"grad_norm": 0.5434144298333962, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3839, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.6151202749140894, |
|
"grad_norm": 0.6244778503333394, |
|
"learning_rate": 5e-06, |
|
"loss": 0.375, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.6208476517754868, |
|
"grad_norm": 0.7929144005943337, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3843, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.6265750286368843, |
|
"grad_norm": 0.5076072639573785, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3764, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.6323024054982818, |
|
"grad_norm": 0.499119817733024, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3814, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.6380297823596792, |
|
"grad_norm": 0.5427237791683881, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3732, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.6437571592210767, |
|
"grad_norm": 0.5072517280886356, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3716, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.6494845360824741, |
|
"grad_norm": 0.5424242226752932, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3725, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.6552119129438716, |
|
"grad_norm": 0.5239482084133175, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3797, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.660939289805269, |
|
"grad_norm": 0.4877340871651499, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3797, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.5187696419666292, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3732, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.6723940435280642, |
|
"grad_norm": 0.5165454581988194, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3761, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.6781214203894617, |
|
"grad_norm": 0.5295455488567519, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3732, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.6838487972508591, |
|
"grad_norm": 0.5356558582460169, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3759, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.6895761741122566, |
|
"grad_norm": 0.5382234203233621, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3809, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.695303550973654, |
|
"grad_norm": 0.5426839059978352, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3818, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.7010309278350515, |
|
"grad_norm": 0.5589401799758412, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3735, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.7067583046964492, |
|
"grad_norm": 0.5400070080828028, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3715, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.7124856815578466, |
|
"grad_norm": 0.5125880859835712, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3692, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.718213058419244, |
|
"grad_norm": 0.521750795912995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3734, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.7239404352806416, |
|
"grad_norm": 0.49977791549900996, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3714, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.729667812142039, |
|
"grad_norm": 0.517176383552139, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3798, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.7353951890034365, |
|
"grad_norm": 0.5569996765160103, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3704, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.741122565864834, |
|
"grad_norm": 0.5152384355166261, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3687, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.7468499427262314, |
|
"grad_norm": 0.5152215076607375, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3711, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.7525773195876289, |
|
"grad_norm": 0.5209367759111013, |
|
"learning_rate": 5e-06, |
|
"loss": 0.367, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.7583046964490263, |
|
"grad_norm": 0.72478206720126, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3753, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.7640320733104238, |
|
"grad_norm": 0.5108998808880466, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3776, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.7697594501718212, |
|
"grad_norm": 0.49545669464589914, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3691, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.7754868270332187, |
|
"grad_norm": 0.5370814607474198, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3746, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.7812142038946162, |
|
"grad_norm": 0.6072170324437888, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3729, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.7869415807560136, |
|
"grad_norm": 0.5271754924806036, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3588, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.792668957617411, |
|
"grad_norm": 0.5754508589270141, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3776, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.7983963344788088, |
|
"grad_norm": 0.5403755251129506, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3762, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.8041237113402062, |
|
"grad_norm": 0.5138981496909732, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3711, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.8098510882016037, |
|
"grad_norm": 0.534919240445055, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3763, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.8155784650630011, |
|
"grad_norm": 0.5146135891570391, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3806, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.8213058419243986, |
|
"grad_norm": 0.505605657209453, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3766, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.827033218785796, |
|
"grad_norm": 0.5239153217495089, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3659, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.8327605956471937, |
|
"grad_norm": 0.9509826091058188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3792, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.8384879725085912, |
|
"grad_norm": 0.5059729711091226, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3721, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.8442153493699887, |
|
"grad_norm": 0.5245359643059282, |
|
"learning_rate": 5e-06, |
|
"loss": 0.379, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.8499427262313861, |
|
"grad_norm": 0.5050698399570109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3739, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.8556701030927836, |
|
"grad_norm": 0.5065697206422057, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3849, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.861397479954181, |
|
"grad_norm": 0.5209454651612013, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3707, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.8671248568155785, |
|
"grad_norm": 0.5430271570033155, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3696, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.872852233676976, |
|
"grad_norm": 0.5281297299023378, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3712, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.8785796105383734, |
|
"grad_norm": 0.5292724874313788, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3726, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.8843069873997709, |
|
"grad_norm": 0.5350000049801282, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3723, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.8900343642611683, |
|
"grad_norm": 0.5208368353063128, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3677, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.8957617411225658, |
|
"grad_norm": 0.5279268625494672, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3753, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.9014891179839633, |
|
"grad_norm": 0.5144354886651064, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3741, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.9072164948453607, |
|
"grad_norm": 0.5082140336402234, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3635, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.9129438717067582, |
|
"grad_norm": 0.5252002760881638, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3787, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.9186712485681556, |
|
"grad_norm": 0.5070002443248173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3718, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.9243986254295533, |
|
"grad_norm": 0.5316148384707632, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3664, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.9301260022909508, |
|
"grad_norm": 0.5193287345279981, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3684, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.9358533791523482, |
|
"grad_norm": 0.506552724781655, |
|
"learning_rate": 5e-06, |
|
"loss": 0.372, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.9415807560137457, |
|
"grad_norm": 0.571442610460597, |
|
"learning_rate": 5e-06, |
|
"loss": 0.379, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.9473081328751431, |
|
"grad_norm": 0.5688327247671221, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3687, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.9530355097365406, |
|
"grad_norm": 0.5042217231517675, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3738, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 1.9587628865979383, |
|
"grad_norm": 0.5321243972550566, |
|
"learning_rate": 5e-06, |
|
"loss": 0.37, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.9644902634593358, |
|
"grad_norm": 0.5188108633782941, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3766, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 1.9702176403207332, |
|
"grad_norm": 0.49989426974744805, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3703, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.9759450171821307, |
|
"grad_norm": 0.5428089945263836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3689, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.9816723940435281, |
|
"grad_norm": 0.5404439065398382, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3729, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.9873997709049256, |
|
"grad_norm": 0.5410379700662915, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3726, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 1.993127147766323, |
|
"grad_norm": 0.5225961306378605, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3782, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.9988545246277205, |
|
"grad_norm": 0.5535365053498117, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3757, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.42091062664985657, |
|
"eval_runtime": 444.2503, |
|
"eval_samples_per_second": 26.476, |
|
"eval_steps_per_second": 0.414, |
|
"step": 3492 |
|
}, |
|
{ |
|
"epoch": 2.004581901489118, |
|
"grad_norm": 0.5715227391793876, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3266, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.0103092783505154, |
|
"grad_norm": 0.5115288441622716, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3123, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.016036655211913, |
|
"grad_norm": 0.5114186476378918, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3064, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.0217640320733103, |
|
"grad_norm": 0.5174383277753903, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3115, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.027491408934708, |
|
"grad_norm": 0.5180676439991899, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3092, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.0332187857961053, |
|
"grad_norm": 0.5521743990569499, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3154, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.0389461626575027, |
|
"grad_norm": 0.5325564269068459, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3139, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.0446735395189, |
|
"grad_norm": 0.5307121051642008, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3143, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.0504009163802976, |
|
"grad_norm": 0.5039022580188123, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3139, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.056128293241695, |
|
"grad_norm": 0.5386599812235912, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3274, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.0618556701030926, |
|
"grad_norm": 0.5510122632798343, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3218, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.0675830469644905, |
|
"grad_norm": 0.5314262243630897, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3126, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.073310423825888, |
|
"grad_norm": 0.4939184680389998, |
|
"learning_rate": 5e-06, |
|
"loss": 0.313, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.0790378006872854, |
|
"grad_norm": 0.5312396764234006, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3105, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.084765177548683, |
|
"grad_norm": 0.4876119734890598, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3124, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.0904925544100803, |
|
"grad_norm": 0.5103227941442228, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3242, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.0962199312714778, |
|
"grad_norm": 0.5042566969256604, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3184, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.1019473081328752, |
|
"grad_norm": 0.5141761625174609, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3143, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.1076746849942727, |
|
"grad_norm": 0.5382583222094659, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3166, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.11340206185567, |
|
"grad_norm": 0.4804864664629375, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3122, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.1191294387170676, |
|
"grad_norm": 0.5165262098916743, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3135, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.124856815578465, |
|
"grad_norm": 0.5253973489118585, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3103, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.1305841924398625, |
|
"grad_norm": 0.5153233857956726, |
|
"learning_rate": 5e-06, |
|
"loss": 0.319, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.13631156930126, |
|
"grad_norm": 0.5024911676484453, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3153, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.1420389461626574, |
|
"grad_norm": 0.48649497978161954, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3125, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.147766323024055, |
|
"grad_norm": 0.5398906998916857, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3207, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.1534936998854524, |
|
"grad_norm": 0.5130668071668567, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3141, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.15922107674685, |
|
"grad_norm": 0.5518874676047263, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3184, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.1649484536082473, |
|
"grad_norm": 0.5160046944915151, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3172, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.1706758304696447, |
|
"grad_norm": 0.48834912906902556, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3234, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.176403207331042, |
|
"grad_norm": 0.5171110867595475, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3151, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.1821305841924397, |
|
"grad_norm": 0.5180778501865309, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3157, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.1878579610538376, |
|
"grad_norm": 0.5068334930400914, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3126, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.193585337915235, |
|
"grad_norm": 0.5383783523767623, |
|
"learning_rate": 5e-06, |
|
"loss": 0.313, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.1993127147766325, |
|
"grad_norm": 0.5231488887255323, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3199, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.20504009163803, |
|
"grad_norm": 0.5130199004591844, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3168, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.2107674684994274, |
|
"grad_norm": 0.5482589908800253, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3223, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.216494845360825, |
|
"grad_norm": 0.53120848952022, |
|
"learning_rate": 5e-06, |
|
"loss": 0.321, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.5071013057492006, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3203, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.22794959908362, |
|
"grad_norm": 0.531266327465737, |
|
"learning_rate": 5e-06, |
|
"loss": 0.324, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.2336769759450172, |
|
"grad_norm": 0.507052566002429, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3137, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.2394043528064147, |
|
"grad_norm": 0.5271639744898703, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3159, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.245131729667812, |
|
"grad_norm": 0.5388005656911042, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3228, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.2508591065292096, |
|
"grad_norm": 0.5060194168439476, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3124, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.256586483390607, |
|
"grad_norm": 0.48546648398689385, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3136, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.2623138602520045, |
|
"grad_norm": 0.5076224304840579, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3167, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.268041237113402, |
|
"grad_norm": 0.5296050756375043, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3204, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.2737686139747995, |
|
"grad_norm": 0.5177826901430715, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3232, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 2.279495990836197, |
|
"grad_norm": 0.5132840661554632, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3158, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 2.2852233676975944, |
|
"grad_norm": 0.543452264339846, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3192, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 2.290950744558992, |
|
"grad_norm": 0.5139552493291161, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3209, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.2966781214203893, |
|
"grad_norm": 0.4969492673393391, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3153, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 2.3024054982817868, |
|
"grad_norm": 0.5143134020150514, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3201, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 2.308132875143184, |
|
"grad_norm": 0.4939050512552889, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3106, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 2.3138602520045817, |
|
"grad_norm": 0.4886273394938644, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3225, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 2.319587628865979, |
|
"grad_norm": 0.5380016012501289, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3183, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.325315005727377, |
|
"grad_norm": 0.5160375481982886, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3127, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.3310423825887745, |
|
"grad_norm": 0.5217824879654166, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3168, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.336769759450172, |
|
"grad_norm": 0.4987920921308887, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3243, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.3424971363115694, |
|
"grad_norm": 0.540718540329614, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3267, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.348224513172967, |
|
"grad_norm": 0.4928104897116783, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3154, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.3539518900343643, |
|
"grad_norm": 0.5332905023045372, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3294, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 2.359679266895762, |
|
"grad_norm": 0.5190878954413758, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3135, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 2.3654066437571593, |
|
"grad_norm": 0.49770844545756987, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3217, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 2.3711340206185567, |
|
"grad_norm": 0.5158902735624868, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3182, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 2.376861397479954, |
|
"grad_norm": 0.5258004488461884, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3211, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.3825887743413516, |
|
"grad_norm": 0.5006246348277473, |
|
"learning_rate": 5e-06, |
|
"loss": 0.311, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 2.388316151202749, |
|
"grad_norm": 0.5152750751947476, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3219, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 2.3940435280641466, |
|
"grad_norm": 0.52068469477404, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3164, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 2.399770904925544, |
|
"grad_norm": 0.5323767239444738, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3208, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 2.4054982817869415, |
|
"grad_norm": 0.5003101909687979, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3228, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.411225658648339, |
|
"grad_norm": 0.5111739731033206, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3271, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 2.4169530355097364, |
|
"grad_norm": 0.5131151305511839, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3147, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 2.422680412371134, |
|
"grad_norm": 0.5163340601407542, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3194, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 2.4284077892325313, |
|
"grad_norm": 0.4958432705018228, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3188, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 2.434135166093929, |
|
"grad_norm": 0.5468740104263736, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3249, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.4398625429553267, |
|
"grad_norm": 0.5410705485869296, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3278, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 2.445589919816724, |
|
"grad_norm": 0.5368673433466151, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3076, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 2.4513172966781216, |
|
"grad_norm": 0.5128839768898594, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3184, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 2.457044673539519, |
|
"grad_norm": 0.5068335080164907, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3169, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 2.4627720504009165, |
|
"grad_norm": 0.4997641306590389, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3182, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.468499427262314, |
|
"grad_norm": 0.5191935789555608, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3216, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 2.4742268041237114, |
|
"grad_norm": 0.5256779474029062, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3233, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 2.479954180985109, |
|
"grad_norm": 0.4865715340962184, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3204, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 2.4856815578465064, |
|
"grad_norm": 0.526248237049682, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3205, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 2.491408934707904, |
|
"grad_norm": 0.5130011897003942, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3259, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.4971363115693013, |
|
"grad_norm": 0.5162374829270555, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3195, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 2.5028636884306987, |
|
"grad_norm": 0.5014077765674124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3228, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.508591065292096, |
|
"grad_norm": 0.5026646925273252, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3207, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.5143184421534936, |
|
"grad_norm": 0.489097601061641, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3176, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 2.520045819014891, |
|
"grad_norm": 0.5078735499696359, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3224, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.5257731958762886, |
|
"grad_norm": 0.5271652940063883, |
|
"learning_rate": 5e-06, |
|
"loss": 0.323, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 2.531500572737686, |
|
"grad_norm": 0.507078592732912, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3203, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.5372279495990835, |
|
"grad_norm": 0.5337080175384938, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3228, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 2.542955326460481, |
|
"grad_norm": 0.50176908036031, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3202, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 2.5486827033218784, |
|
"grad_norm": 0.5120277803126527, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3197, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.554410080183276, |
|
"grad_norm": 0.5366578152814068, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3244, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 2.5601374570446733, |
|
"grad_norm": 0.522159840101665, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3185, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 2.565864833906071, |
|
"grad_norm": 0.48196371586373893, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3173, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.5715922107674682, |
|
"grad_norm": 0.5255302877210084, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3278, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 2.5773195876288657, |
|
"grad_norm": 0.5089869190508646, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3198, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.5830469644902636, |
|
"grad_norm": 0.530959409102577, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3304, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 2.588774341351661, |
|
"grad_norm": 0.5531110752439229, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3215, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 2.5945017182130585, |
|
"grad_norm": 0.5260126942389958, |
|
"learning_rate": 5e-06, |
|
"loss": 0.316, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 2.600229095074456, |
|
"grad_norm": 0.5160707506662007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.32, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 2.6059564719358534, |
|
"grad_norm": 0.5410876892071711, |
|
"learning_rate": 5e-06, |
|
"loss": 0.324, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 2.611683848797251, |
|
"grad_norm": 0.5149983976318435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3168, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 2.6174112256586484, |
|
"grad_norm": 0.5066882681673974, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3164, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 2.623138602520046, |
|
"grad_norm": 0.5033788230089447, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3163, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 2.6288659793814433, |
|
"grad_norm": 0.5247195850361597, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3175, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 2.6345933562428407, |
|
"grad_norm": 0.5282100918029693, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3272, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.640320733104238, |
|
"grad_norm": 0.52717520655256, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3203, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 2.6460481099656357, |
|
"grad_norm": 0.5022071775606154, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3244, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 2.651775486827033, |
|
"grad_norm": 0.5211994501987708, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3131, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 2.6575028636884306, |
|
"grad_norm": 0.5124524380744641, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3253, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 2.663230240549828, |
|
"grad_norm": 0.5060206463796821, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3173, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 2.6689576174112255, |
|
"grad_norm": 0.5162650933726732, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3198, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 2.6746849942726234, |
|
"grad_norm": 0.524408740926564, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3302, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 2.680412371134021, |
|
"grad_norm": 0.5146510600522526, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3234, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 2.6861397479954183, |
|
"grad_norm": 0.5092928068009863, |
|
"learning_rate": 5e-06, |
|
"loss": 0.322, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 2.691867124856816, |
|
"grad_norm": 0.5757942628503555, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3117, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.6975945017182132, |
|
"grad_norm": 0.4889008213010037, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3231, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 2.7033218785796107, |
|
"grad_norm": 0.5302974244163768, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3178, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 2.709049255441008, |
|
"grad_norm": 0.52479839402123, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3217, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 2.7147766323024056, |
|
"grad_norm": 0.5039321357679906, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3257, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 2.720504009163803, |
|
"grad_norm": 0.5101102471452367, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3222, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 2.7262313860252005, |
|
"grad_norm": 0.5430506058097, |
|
"learning_rate": 5e-06, |
|
"loss": 0.319, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 2.731958762886598, |
|
"grad_norm": 0.5355617323855306, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3269, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 2.7376861397479955, |
|
"grad_norm": 0.4986561926249133, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3149, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 2.743413516609393, |
|
"grad_norm": 0.5053547200537913, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3308, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 2.7491408934707904, |
|
"grad_norm": 0.48632254517347523, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3185, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.754868270332188, |
|
"grad_norm": 0.486358386605019, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3103, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 2.7605956471935853, |
|
"grad_norm": 0.5374470855558577, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3298, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 2.7663230240549828, |
|
"grad_norm": 0.5153628917876052, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3233, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 2.77205040091638, |
|
"grad_norm": 0.5054154722646462, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3275, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 0.5149761064711591, |
|
"learning_rate": 5e-06, |
|
"loss": 0.317, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 2.783505154639175, |
|
"grad_norm": 0.5148931201825926, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3233, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 2.7892325315005726, |
|
"grad_norm": 0.5191662793295072, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3184, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 2.79495990836197, |
|
"grad_norm": 0.5675073234527983, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3251, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 2.8006872852233675, |
|
"grad_norm": 0.5038525244128136, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3232, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 2.806414662084765, |
|
"grad_norm": 0.4918634907852925, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3175, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.8121420389461624, |
|
"grad_norm": 0.5284779685936459, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3208, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 2.81786941580756, |
|
"grad_norm": 0.5108078870269828, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3272, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 2.8235967926689574, |
|
"grad_norm": 0.4902747564813571, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3257, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 2.8293241695303553, |
|
"grad_norm": 0.5030822928949704, |
|
"learning_rate": 5e-06, |
|
"loss": 0.32, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 2.8350515463917527, |
|
"grad_norm": 0.5272339203419648, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3288, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 2.84077892325315, |
|
"grad_norm": 0.5340619443879843, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3277, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 2.8465063001145476, |
|
"grad_norm": 0.5139875993534403, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3167, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 2.852233676975945, |
|
"grad_norm": 0.5197113622112696, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3221, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 2.8579610538373426, |
|
"grad_norm": 0.4932910708419312, |
|
"learning_rate": 5e-06, |
|
"loss": 0.319, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 2.86368843069874, |
|
"grad_norm": 0.48961874681440903, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3216, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.8694158075601375, |
|
"grad_norm": 0.5306801859906654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3294, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 2.875143184421535, |
|
"grad_norm": 0.5245022306778138, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3238, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 2.8808705612829324, |
|
"grad_norm": 0.5294236720116192, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3172, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 2.88659793814433, |
|
"grad_norm": 0.49805002487820915, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3154, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 2.8923253150057273, |
|
"grad_norm": 0.5118585995041246, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3264, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 2.8980526918671248, |
|
"grad_norm": 0.4778628715181792, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3215, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 2.9037800687285222, |
|
"grad_norm": 0.502141421148047, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3195, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 2.9095074455899197, |
|
"grad_norm": 0.5216144225169003, |
|
"learning_rate": 5e-06, |
|
"loss": 0.322, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 2.915234822451317, |
|
"grad_norm": 0.5061317551186302, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3181, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 2.9209621993127146, |
|
"grad_norm": 0.5258296449599809, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3196, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.9266895761741125, |
|
"grad_norm": 0.5205129759914494, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3298, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 2.93241695303551, |
|
"grad_norm": 0.5441889203090563, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3247, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 2.9381443298969074, |
|
"grad_norm": 0.5039522474134192, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3181, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 2.943871706758305, |
|
"grad_norm": 0.5232711939948509, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3196, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 2.9495990836197024, |
|
"grad_norm": 0.5119250727874893, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3131, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 2.9553264604811, |
|
"grad_norm": 0.5050585849003654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3248, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 2.9610538373424973, |
|
"grad_norm": 0.5164002352592711, |
|
"learning_rate": 5e-06, |
|
"loss": 0.32, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 2.9667812142038947, |
|
"grad_norm": 0.5133798037929324, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3185, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 2.972508591065292, |
|
"grad_norm": 0.5131629955273521, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3169, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 2.9782359679266897, |
|
"grad_norm": 0.5354127604897547, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3265, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.983963344788087, |
|
"grad_norm": 0.486875785468725, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3185, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 2.9896907216494846, |
|
"grad_norm": 0.524969500670149, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3161, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 2.995418098510882, |
|
"grad_norm": 0.5109832301608017, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3268, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.42792943120002747, |
|
"eval_runtime": 443.2838, |
|
"eval_samples_per_second": 26.534, |
|
"eval_steps_per_second": 0.415, |
|
"step": 5238 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 5238, |
|
"total_flos": 2745782658662400.0, |
|
"train_loss": 0.3825414228548572, |
|
"train_runtime": 71090.721, |
|
"train_samples_per_second": 9.431, |
|
"train_steps_per_second": 0.074 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5238, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2745782658662400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|