{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9953917050691246,
  "eval_steps": 500,
  "global_step": 758,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0026333113890717576,
      "grad_norm": 31.375,
      "learning_rate": 3.947368421052631e-06,
      "loss": 2.687,
      "step": 1
    },
    {
      "epoch": 0.013166556945358789,
      "grad_norm": 11.375,
      "learning_rate": 1.9736842105263155e-05,
      "loss": 2.3265,
      "step": 5
    },
    {
      "epoch": 0.026333113890717578,
      "grad_norm": 3.5625,
      "learning_rate": 3.947368421052631e-05,
      "loss": 1.834,
      "step": 10
    },
    {
      "epoch": 0.03949967083607637,
      "grad_norm": 2.828125,
      "learning_rate": 5.921052631578947e-05,
      "loss": 1.6157,
      "step": 15
    },
    {
      "epoch": 0.052666227781435156,
      "grad_norm": 7.34375,
      "learning_rate": 7.894736842105262e-05,
      "loss": 1.5546,
      "step": 20
    },
    {
      "epoch": 0.06583278472679395,
      "grad_norm": 1.6796875,
      "learning_rate": 9.868421052631579e-05,
      "loss": 1.561,
      "step": 25
    },
    {
      "epoch": 0.07899934167215274,
      "grad_norm": 1.875,
      "learning_rate": 0.00011842105263157894,
      "loss": 1.4433,
      "step": 30
    },
    {
      "epoch": 0.09216589861751152,
      "grad_norm": 2.0,
      "learning_rate": 0.0001381578947368421,
      "loss": 1.5053,
      "step": 35
    },
    {
      "epoch": 0.10533245556287031,
      "grad_norm": 4.9375,
      "learning_rate": 0.00015789473684210524,
      "loss": 1.5204,
      "step": 40
    },
    {
      "epoch": 0.1184990125082291,
      "grad_norm": 2.03125,
      "learning_rate": 0.00017763157894736838,
      "loss": 1.5645,
      "step": 45
    },
    {
      "epoch": 0.1316655694535879,
      "grad_norm": 2.140625,
      "learning_rate": 0.00019736842105263157,
      "loss": 1.5742,
      "step": 50
    },
    {
      "epoch": 0.1448321263989467,
      "grad_norm": 1.9765625,
      "learning_rate": 0.00021710526315789472,
      "loss": 1.6198,
      "step": 55
    },
    {
      "epoch": 0.15799868334430547,
      "grad_norm": 2.125,
      "learning_rate": 0.00023684210526315788,
      "loss": 1.6436,
      "step": 60
    },
    {
      "epoch": 0.17116524028966426,
      "grad_norm": 2.125,
      "learning_rate": 0.00025657894736842105,
      "loss": 1.6867,
      "step": 65
    },
    {
      "epoch": 0.18433179723502305,
      "grad_norm": 2.359375,
      "learning_rate": 0.0002763157894736842,
      "loss": 1.7356,
      "step": 70
    },
    {
      "epoch": 0.19749835418038184,
      "grad_norm": 3.859375,
      "learning_rate": 0.00029605263157894733,
      "loss": 1.7819,
      "step": 75
    },
    {
      "epoch": 0.21066491112574062,
      "grad_norm": 5.40625,
      "learning_rate": 0.0002999745375637391,
      "loss": 1.9272,
      "step": 80
    },
    {
      "epoch": 0.2238314680710994,
      "grad_norm": 5.15625,
      "learning_rate": 0.00029987111123173417,
      "loss": 2.0363,
      "step": 85
    },
    {
      "epoch": 0.2369980250164582,
      "grad_norm": 3.109375,
      "learning_rate": 0.00029968818442293417,
      "loss": 1.8288,
      "step": 90
    },
    {
      "epoch": 0.250164581961817,
      "grad_norm": 2.84375,
      "learning_rate": 0.00029942585417250744,
      "loss": 1.8436,
      "step": 95
    },
    {
      "epoch": 0.2633311389071758,
      "grad_norm": 2.40625,
      "learning_rate": 0.00029908425963589115,
      "loss": 1.7724,
      "step": 100
    },
    {
      "epoch": 0.2764976958525346,
      "grad_norm": 1.875,
      "learning_rate": 0.00029866358201497474,
      "loss": 1.7534,
      "step": 105
    },
    {
      "epoch": 0.2896642527978934,
      "grad_norm": 1.9765625,
      "learning_rate": 0.0002981640444619799,
      "loss": 1.7532,
      "step": 110
    },
    {
      "epoch": 0.30283080974325216,
      "grad_norm": 2.796875,
      "learning_rate": 0.00029758591196108743,
      "loss": 1.7545,
      "step": 115
    },
    {
      "epoch": 0.31599736668861095,
      "grad_norm": 2.015625,
      "learning_rate": 0.00029692949118787415,
      "loss": 1.8269,
      "step": 120
    },
    {
      "epoch": 0.32916392363396973,
      "grad_norm": 2.125,
      "learning_rate": 0.0002961951303466338,
      "loss": 1.7823,
      "step": 125
    },
    {
      "epoch": 0.3423304805793285,
      "grad_norm": 2.71875,
      "learning_rate": 0.0002953832189856691,
      "loss": 1.7371,
      "step": 130
    },
    {
      "epoch": 0.3554970375246873,
      "grad_norm": 1.9921875,
      "learning_rate": 0.00029449418779065257,
      "loss": 1.7607,
      "step": 135
    },
    {
      "epoch": 0.3686635944700461,
      "grad_norm": 2.109375,
      "learning_rate": 0.00029352850835616504,
      "loss": 1.7956,
      "step": 140
    },
    {
      "epoch": 0.3818301514154049,
      "grad_norm": 2.0625,
      "learning_rate": 0.00029248669293553437,
      "loss": 1.7176,
      "step": 145
    },
    {
      "epoch": 0.39499670836076367,
      "grad_norm": 1.9453125,
      "learning_rate": 0.0002913692941691059,
      "loss": 1.843,
      "step": 150
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 1.828125,
      "learning_rate": 0.0002901769047910895,
      "loss": 1.7918,
      "step": 155
    },
    {
      "epoch": 0.42132982225148125,
      "grad_norm": 1.5625,
      "learning_rate": 0.0002889101573151384,
      "loss": 1.7714,
      "step": 160
    },
    {
      "epoch": 0.43449637919684003,
      "grad_norm": 2.671875,
      "learning_rate": 0.00028756972369882667,
      "loss": 1.8033,
      "step": 165
    },
    {
      "epoch": 0.4476629361421988,
      "grad_norm": 1.6015625,
      "learning_rate": 0.0002861563149872031,
      "loss": 1.8409,
      "step": 170
    },
    {
      "epoch": 0.4608294930875576,
      "grad_norm": 1.8203125,
      "learning_rate": 0.0002846706809356112,
      "loss": 1.8259,
      "step": 175
    },
    {
      "epoch": 0.4739960500329164,
      "grad_norm": 1.8125,
      "learning_rate": 0.0002831136096119747,
      "loss": 1.7612,
      "step": 180
    },
    {
      "epoch": 0.4871626069782752,
      "grad_norm": 1.796875,
      "learning_rate": 0.0002814859269787596,
      "loss": 1.7649,
      "step": 185
    },
    {
      "epoch": 0.500329163923634,
      "grad_norm": 1.8359375,
      "learning_rate": 0.0002797884964548353,
      "loss": 1.7443,
      "step": 190
    },
    {
      "epoch": 0.5134957208689928,
      "grad_norm": 1.5703125,
      "learning_rate": 0.0002780222184574662,
      "loss": 1.7219,
      "step": 195
    },
    {
      "epoch": 0.5266622778143516,
      "grad_norm": 1.6328125,
      "learning_rate": 0.0002761880299246772,
      "loss": 1.7409,
      "step": 200
    },
    {
      "epoch": 0.5398288347597103,
      "grad_norm": 1.59375,
      "learning_rate": 0.00027428690381824637,
      "loss": 1.7043,
      "step": 205
    },
    {
      "epoch": 0.5529953917050692,
      "grad_norm": 1.6171875,
      "learning_rate": 0.00027231984860758907,
      "loss": 1.6709,
      "step": 210
    },
    {
      "epoch": 0.5661619486504279,
      "grad_norm": 2.0,
      "learning_rate": 0.000270287907734806,
      "loss": 1.7417,
      "step": 215
    },
    {
      "epoch": 0.5793285055957867,
      "grad_norm": 1.65625,
      "learning_rate": 0.0002681921590611799,
      "loss": 1.66,
      "step": 220
    },
    {
      "epoch": 0.5924950625411455,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0002660337142954145,
      "loss": 1.732,
      "step": 225
    },
    {
      "epoch": 0.6056616194865043,
      "grad_norm": 1.5234375,
      "learning_rate": 0.0002638137184039186,
      "loss": 1.6964,
      "step": 230
    },
    {
      "epoch": 0.618828176431863,
      "grad_norm": 1.625,
      "learning_rate": 0.00026153334900344853,
      "loss": 1.648,
      "step": 235
    },
    {
      "epoch": 0.6319947333772219,
      "grad_norm": 1.375,
      "learning_rate": 0.0002591938157364303,
      "loss": 1.6197,
      "step": 240
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 1.7109375,
      "learning_rate": 0.00025679635962929455,
      "loss": 1.701,
      "step": 245
    },
    {
      "epoch": 0.6583278472679395,
      "grad_norm": 1.640625,
      "learning_rate": 0.00025434225243416234,
      "loss": 1.7649,
      "step": 250
    },
    {
      "epoch": 0.6714944042132982,
      "grad_norm": 1.6328125,
      "learning_rate": 0.0002518327959542333,
      "loss": 1.712,
      "step": 255
    },
    {
      "epoch": 0.684660961158657,
      "grad_norm": 1.796875,
      "learning_rate": 0.0002492693213532321,
      "loss": 1.6628,
      "step": 260
    },
    {
      "epoch": 0.6978275181040158,
      "grad_norm": 2.015625,
      "learning_rate": 0.0002466531884492808,
      "loss": 1.6714,
      "step": 265
    },
    {
      "epoch": 0.7109940750493746,
      "grad_norm": 1.9921875,
      "learning_rate": 0.0002439857849935712,
      "loss": 1.6833,
      "step": 270
    },
    {
      "epoch": 0.7241606319947334,
      "grad_norm": 1.703125,
      "learning_rate": 0.00024126852593421967,
      "loss": 1.7174,
      "step": 275
    },
    {
      "epoch": 0.7373271889400922,
      "grad_norm": 1.6015625,
      "learning_rate": 0.0002385028526656952,
      "loss": 1.6437,
      "step": 280
    },
    {
      "epoch": 0.7504937458854509,
      "grad_norm": 1.7109375,
      "learning_rate": 0.00023569023226421883,
      "loss": 1.6515,
      "step": 285
    },
    {
      "epoch": 0.7636603028308098,
      "grad_norm": 1.6015625,
      "learning_rate": 0.0002328321567095398,
      "loss": 1.6352,
      "step": 290
    },
    {
      "epoch": 0.7768268597761685,
      "grad_norm": 1.625,
      "learning_rate": 0.00022993014209350167,
      "loss": 1.6205,
      "step": 295
    },
    {
      "epoch": 0.7899934167215273,
      "grad_norm": 1.546875,
      "learning_rate": 0.00022698572781581757,
      "loss": 1.6508,
      "step": 300
    },
    {
      "epoch": 0.8031599736668861,
      "grad_norm": 1.453125,
      "learning_rate": 0.0002240004757674819,
      "loss": 1.5989,
      "step": 305
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 1.8046875,
      "learning_rate": 0.00022097596950225134,
      "loss": 1.6176,
      "step": 310
    },
    {
      "epoch": 0.8294930875576036,
      "grad_norm": 1.3671875,
      "learning_rate": 0.00021791381339663423,
      "loss": 1.6204,
      "step": 315
    },
    {
      "epoch": 0.8426596445029625,
      "grad_norm": 1.5390625,
      "learning_rate": 0.00021481563179883502,
      "loss": 1.5592,
      "step": 320
    },
    {
      "epoch": 0.8558262014483212,
      "grad_norm": 1.3125,
      "learning_rate": 0.00021168306816710393,
      "loss": 1.5973,
      "step": 325
    },
    {
      "epoch": 0.8689927583936801,
      "grad_norm": 1.421875,
      "learning_rate": 0.0002085177841979498,
      "loss": 1.5367,
      "step": 330
    },
    {
      "epoch": 0.8821593153390388,
      "grad_norm": 1.6796875,
      "learning_rate": 0.00020532145894467828,
      "loss": 1.5283,
      "step": 335
    },
    {
      "epoch": 0.8953258722843976,
      "grad_norm": 1.46875,
      "learning_rate": 0.000202095787926723,
      "loss": 1.5374,
      "step": 340
    },
    {
      "epoch": 0.9084924292297564,
      "grad_norm": 1.515625,
      "learning_rate": 0.00019884248223024203,
      "loss": 1.5021,
      "step": 345
    },
    {
      "epoch": 0.9216589861751152,
      "grad_norm": 1.3046875,
      "learning_rate": 0.00019556326760045658,
      "loss": 1.5345,
      "step": 350
    },
    {
      "epoch": 0.934825543120474,
      "grad_norm": 1.328125,
      "learning_rate": 0.00019225988352621445,
      "loss": 1.5164,
      "step": 355
    },
    {
      "epoch": 0.9479921000658328,
      "grad_norm": 1.3984375,
      "learning_rate": 0.0001889340823172622,
      "loss": 1.4778,
      "step": 360
    },
    {
      "epoch": 0.9611586570111915,
      "grad_norm": 1.2734375,
      "learning_rate": 0.00018558762817471678,
      "loss": 1.5624,
      "step": 365
    },
    {
      "epoch": 0.9743252139565504,
      "grad_norm": 1.453125,
      "learning_rate": 0.00018222229625522928,
      "loss": 1.527,
      "step": 370
    },
    {
      "epoch": 0.9874917709019092,
      "grad_norm": 1.609375,
      "learning_rate": 0.00017883987172933707,
      "loss": 1.4608,
      "step": 375
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.3671875,
      "learning_rate": 0.0001754421488345041,
      "loss": 1.4084,
      "step": 380
    },
    {
      "epoch": 1.0131665569453587,
      "grad_norm": 1.4375,
      "learning_rate": 0.00017203092992335137,
      "loss": 1.013,
      "step": 385
    },
    {
      "epoch": 1.0263331138907177,
      "grad_norm": 1.3984375,
      "learning_rate": 0.0001686080245075831,
      "loss": 1.0124,
      "step": 390
    },
    {
      "epoch": 1.0394996708360764,
      "grad_norm": 1.53125,
      "learning_rate": 0.0001651752482981148,
      "loss": 1.0275,
      "step": 395
    },
    {
      "epoch": 1.0526662277814351,
      "grad_norm": 1.2265625,
      "learning_rate": 0.00016173442224191309,
      "loss": 0.9538,
      "step": 400
    },
    {
      "epoch": 1.0658327847267939,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00015828737155605804,
      "loss": 0.9683,
      "step": 405
    },
    {
      "epoch": 1.0789993416721528,
      "grad_norm": 1.21875,
      "learning_rate": 0.0001548359247595405,
      "loss": 1.0414,
      "step": 410
    },
    {
      "epoch": 1.0921658986175116,
      "grad_norm": 1.2578125,
      "learning_rate": 0.00015138191270330773,
      "loss": 0.9749,
      "step": 415
    },
    {
      "epoch": 1.1053324555628703,
      "grad_norm": 1.4921875,
      "learning_rate": 0.00014792716759907186,
      "loss": 0.9802,
      "step": 420
    },
    {
      "epoch": 1.118499012508229,
      "grad_norm": 1.34375,
      "learning_rate": 0.00014447352204739712,
      "loss": 0.9399,
      "step": 425
    },
    {
      "epoch": 1.131665569453588,
      "grad_norm": 1.21875,
      "learning_rate": 0.00014102280806558006,
      "loss": 1.0111,
      "step": 430
    },
    {
      "epoch": 1.1448321263989467,
      "grad_norm": 1.2890625,
      "learning_rate": 0.00013757685611583983,
      "loss": 0.9483,
      "step": 435
    },
    {
      "epoch": 1.1579986833443054,
      "grad_norm": 1.15625,
      "learning_rate": 0.00013413749413433273,
      "loss": 0.9546,
      "step": 440
    },
    {
      "epoch": 1.1711652402896642,
      "grad_norm": 1.2734375,
      "learning_rate": 0.0001307065465615073,
      "loss": 0.9294,
      "step": 445
    },
    {
      "epoch": 1.1843317972350231,
      "grad_norm": 1.2265625,
      "learning_rate": 0.00012728583337431353,
      "loss": 0.9498,
      "step": 450
    },
    {
      "epoch": 1.1974983541803819,
      "grad_norm": 1.296875,
      "learning_rate": 0.0001238771691207795,
      "loss": 0.942,
      "step": 455
    },
    {
      "epoch": 1.2106649111257406,
      "grad_norm": 1.4375,
      "learning_rate": 0.00012048236195746822,
      "loss": 0.9069,
      "step": 460
    },
    {
      "epoch": 1.2238314680710993,
      "grad_norm": 1.5078125,
      "learning_rate": 0.00011710321269032502,
      "loss": 0.9452,
      "step": 465
    },
    {
      "epoch": 1.2369980250164583,
      "grad_norm": 1.3984375,
      "learning_rate": 0.00011374151381942327,
      "loss": 0.9533,
      "step": 470
    },
    {
      "epoch": 1.250164581961817,
      "grad_norm": 1.375,
      "learning_rate": 0.00011039904858811712,
      "loss": 0.9229,
      "step": 475
    },
    {
      "epoch": 1.2633311389071757,
      "grad_norm": 1.1015625,
      "learning_rate": 0.00010707759003710384,
      "loss": 0.8528,
      "step": 480
    },
    {
      "epoch": 1.2764976958525347,
      "grad_norm": 1.328125,
      "learning_rate": 0.00010377890006389856,
      "loss": 0.8836,
      "step": 485
    },
    {
      "epoch": 1.2896642527978934,
      "grad_norm": 1.3203125,
      "learning_rate": 0.00010050472848821968,
      "loss": 0.9177,
      "step": 490
    },
    {
      "epoch": 1.3028308097432522,
      "grad_norm": 1.296875,
      "learning_rate": 9.725681212378167e-05,
      "loss": 0.8867,
      "step": 495
    },
    {
      "epoch": 1.315997366688611,
      "grad_norm": 1.2421875,
      "learning_rate": 9.403687385698632e-05,
      "loss": 0.9074,
      "step": 500
    },
    {
      "epoch": 1.3291639236339696,
      "grad_norm": 1.1796875,
      "learning_rate": 9.084662173300223e-05,
      "loss": 0.8652,
      "step": 505
    },
    {
      "epoch": 1.3423304805793286,
      "grad_norm": 1.1796875,
      "learning_rate": 8.768774804971705e-05,
      "loss": 0.8758,
      "step": 510
    },
    {
      "epoch": 1.3554970375246873,
      "grad_norm": 3.953125,
      "learning_rate": 8.456192846004275e-05,
      "loss": 0.8357,
      "step": 515
    },
    {
      "epoch": 1.368663594470046,
      "grad_norm": 1.0703125,
      "learning_rate": 8.147082108305058e-05,
      "loss": 0.8258,
      "step": 520
    },
    {
      "epoch": 1.381830151415405,
      "grad_norm": 1.1015625,
      "learning_rate": 7.84160656244067e-05,
      "loss": 0.906,
      "step": 525
    },
    {
      "epoch": 1.3949967083607637,
      "grad_norm": 1.1484375,
      "learning_rate": 7.539928250657594e-05,
      "loss": 0.809,
      "step": 530
    },
    {
      "epoch": 1.4081632653061225,
      "grad_norm": 1.15625,
      "learning_rate": 7.242207200925383e-05,
      "loss": 0.7685,
      "step": 535
    },
    {
      "epoch": 1.4213298222514812,
      "grad_norm": 1.1171875,
      "learning_rate": 6.948601342048397e-05,
      "loss": 0.8473,
      "step": 540
    },
    {
      "epoch": 1.43449637919684,
      "grad_norm": 1.1015625,
      "learning_rate": 6.65926641989106e-05,
      "loss": 0.8022,
      "step": 545
    },
    {
      "epoch": 1.4476629361421989,
      "grad_norm": 1.15625,
      "learning_rate": 6.374355914761062e-05,
      "loss": 0.7762,
      "step": 550
    },
    {
      "epoch": 1.4608294930875576,
      "grad_norm": 1.109375,
      "learning_rate": 6.094020959994336e-05,
      "loss": 0.862,
      "step": 555
    },
    {
      "epoch": 1.4739960500329163,
      "grad_norm": 1.0703125,
      "learning_rate": 5.818410261785056e-05,
      "loss": 0.793,
      "step": 560
    },
    {
      "epoch": 1.4871626069782753,
      "grad_norm": 1.0625,
      "learning_rate": 5.5476700203030643e-05,
      "loss": 0.7979,
      "step": 565
    },
    {
      "epoch": 1.500329163923634,
      "grad_norm": 1.28125,
      "learning_rate": 5.281943852140697e-05,
      "loss": 0.8223,
      "step": 570
    },
    {
      "epoch": 1.5134957208689928,
      "grad_norm": 1.0078125,
      "learning_rate": 5.021372714130087e-05,
      "loss": 0.84,
      "step": 575
    },
    {
      "epoch": 1.5266622778143515,
      "grad_norm": 1.21875,
      "learning_rate": 4.766094828571313e-05,
      "loss": 0.7897,
      "step": 580
    },
    {
      "epoch": 1.5398288347597102,
      "grad_norm": 1.0859375,
      "learning_rate": 4.516245609911161e-05,
      "loss": 0.7917,
      "step": 585
    },
    {
      "epoch": 1.5529953917050692,
      "grad_norm": 1.1015625,
      "learning_rate": 4.271957592911325e-05,
      "loss": 0.7691,
      "step": 590
    },
    {
      "epoch": 1.566161948650428,
      "grad_norm": 1.1875,
      "learning_rate": 4.033360362344117e-05,
      "loss": 0.8063,
      "step": 595
    },
    {
      "epoch": 1.5793285055957869,
      "grad_norm": 1.1640625,
      "learning_rate": 3.800580484253105e-05,
      "loss": 0.7744,
      "step": 600
    },
    {
      "epoch": 1.5924950625411456,
      "grad_norm": 1.1015625,
      "learning_rate": 3.5737414388149785e-05,
      "loss": 0.7701,
      "step": 605
    },
    {
      "epoch": 1.6056616194865043,
      "grad_norm": 1.046875,
      "learning_rate": 3.352963554838402e-05,
      "loss": 0.7414,
      "step": 610
    },
    {
      "epoch": 1.618828176431863,
      "grad_norm": 1.2109375,
      "learning_rate": 3.138363945934523e-05,
      "loss": 0.7739,
      "step": 615
    },
    {
      "epoch": 1.6319947333772218,
      "grad_norm": 1.0859375,
      "learning_rate": 2.9300564483929852e-05,
      "loss": 0.794,
      "step": 620
    },
    {
      "epoch": 1.6451612903225805,
      "grad_norm": 1.3203125,
      "learning_rate": 2.728151560796454e-05,
      "loss": 0.8121,
      "step": 625
    },
    {
      "epoch": 1.6583278472679395,
      "grad_norm": 1.0546875,
      "learning_rate": 2.5327563854056714e-05,
      "loss": 0.7925,
      "step": 630
    },
    {
      "epoch": 1.6714944042132982,
      "grad_norm": 1.2109375,
      "learning_rate": 2.3439745713460624e-05,
      "loss": 0.8124,
      "step": 635
    },
    {
      "epoch": 1.6846609611586572,
      "grad_norm": 1.109375,
      "learning_rate": 2.1619062596261583e-05,
      "loss": 0.7899,
      "step": 640
    },
    {
      "epoch": 1.6978275181040159,
      "grad_norm": 1.046875,
      "learning_rate": 1.9866480300168885e-05,
      "loss": 0.7489,
      "step": 645
    },
    {
      "epoch": 1.7109940750493746,
      "grad_norm": 1.03125,
      "learning_rate": 1.8182928498199634e-05,
      "loss": 0.7739,
      "step": 650
    },
    {
      "epoch": 1.7241606319947334,
      "grad_norm": 0.99609375,
      "learning_rate": 1.6569300245525457e-05,
      "loss": 0.7311,
      "step": 655
    },
    {
      "epoch": 1.737327188940092,
      "grad_norm": 1.15625,
      "learning_rate": 1.5026451505743408e-05,
      "loss": 0.7321,
      "step": 660
    },
    {
      "epoch": 1.7504937458854508,
      "grad_norm": 1.1171875,
      "learning_rate": 1.3555200696822232e-05,
      "loss": 0.7963,
      "step": 665
    },
    {
      "epoch": 1.7636603028308098,
      "grad_norm": 1.1171875,
      "learning_rate": 1.215632825696541e-05,
      "loss": 0.7587,
      "step": 670
    },
    {
      "epoch": 1.7768268597761685,
      "grad_norm": 1.171875,
      "learning_rate": 1.0830576230620492e-05,
      "loss": 0.7989,
      "step": 675
    },
    {
      "epoch": 1.7899934167215275,
      "grad_norm": 1.078125,
      "learning_rate": 9.578647874855095e-06,
      "loss": 0.8169,
      "step": 680
    },
    {
      "epoch": 1.8031599736668862,
      "grad_norm": 1.09375,
      "learning_rate": 8.401207286307881e-06,
      "loss": 0.7674,
      "step": 685
    },
    {
      "epoch": 1.816326530612245,
      "grad_norm": 1.0234375,
      "learning_rate": 7.2988790489124424e-06,
      "loss": 0.8234,
      "step": 690
    },
    {
      "epoch": 1.8294930875576036,
      "grad_norm": 1.15625,
      "learning_rate": 6.272247902581201e-06,
      "loss": 0.7603,
      "step": 695
    },
    {
      "epoch": 1.8426596445029624,
      "grad_norm": 1.015625,
      "learning_rate": 5.3218584330249e-06,
      "loss": 0.795,
      "step": 700
    },
    {
      "epoch": 1.8558262014483211,
      "grad_norm": 1.0546875,
      "learning_rate": 4.448214782872134e-06,
      "loss": 0.759,
      "step": 705
    },
    {
      "epoch": 1.86899275839368,
      "grad_norm": 1.125,
      "learning_rate": 3.6517803842424474e-06,
      "loss": 0.7344,
      "step": 710
    },
    {
      "epoch": 1.8821593153390388,
      "grad_norm": 1.0546875,
      "learning_rate": 2.932977712914586e-06,
      "loss": 0.7102,
      "step": 715
    },
    {
      "epoch": 1.8953258722843978,
      "grad_norm": 1.046875,
      "learning_rate": 2.292188064220374e-06,
      "loss": 0.7783,
      "step": 720
    },
    {
      "epoch": 1.9084924292297565,
      "grad_norm": 1.125,
      "learning_rate": 1.7297513507832927e-06,
      "loss": 0.7961,
      "step": 725
    },
    {
      "epoch": 1.9216589861751152,
      "grad_norm": 44.75,
      "learning_rate": 1.2459659222086304e-06,
      "loss": 0.7633,
      "step": 730
    },
    {
      "epoch": 1.934825543120474,
      "grad_norm": 1.0859375,
      "learning_rate": 8.410884068213941e-07,
      "loss": 0.7727,
      "step": 735
    },
    {
      "epoch": 1.9479921000658327,
      "grad_norm": 1.09375,
      "learning_rate": 5.153335755354038e-07,
      "loss": 0.7779,
      "step": 740
    },
    {
      "epoch": 1.9611586570111914,
      "grad_norm": 1.0234375,
      "learning_rate": 2.688742279261913e-07,
      "loss": 0.7058,
      "step": 745
    },
    {
      "epoch": 1.9743252139565504,
      "grad_norm": 2.1875,
      "learning_rate": 1.0184110056790651e-07,
      "loss": 0.8194,
      "step": 750
    },
    {
      "epoch": 1.9874917709019093,
      "grad_norm": 1.0546875,
      "learning_rate": 1.432279768290856e-08,
      "loss": 0.7634,
      "step": 755
    },
    {
      "epoch": 1.9953917050691246,
      "step": 758,
      "total_flos": 1.449790274661253e+17,
      "train_loss": 1.2643018703032924,
      "train_runtime": 2142.5178,
      "train_samples_per_second": 11.339,
      "train_steps_per_second": 0.354
    }
  ],
  "logging_steps": 5,
  "max_steps": 758,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.449790274661253e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}