|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 2505, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.029940119760479042, |
|
"grad_norm": 1.6695125102996826, |
|
"learning_rate": 2.98804780876494e-06, |
|
"loss": 1.1663, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.059880239520958084, |
|
"grad_norm": 1.2466001510620117, |
|
"learning_rate": 5.97609561752988e-06, |
|
"loss": 1.0872, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08982035928143713, |
|
"grad_norm": 1.3031201362609863, |
|
"learning_rate": 8.964143426294822e-06, |
|
"loss": 1.0409, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.11976047904191617, |
|
"grad_norm": 1.2915183305740356, |
|
"learning_rate": 1.195219123505976e-05, |
|
"loss": 0.9647, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1497005988023952, |
|
"grad_norm": 1.2883236408233643, |
|
"learning_rate": 1.4940239043824702e-05, |
|
"loss": 0.8845, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.17964071856287425, |
|
"grad_norm": 1.2346601486206055, |
|
"learning_rate": 1.7928286852589643e-05, |
|
"loss": 0.7993, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.20958083832335328, |
|
"grad_norm": 1.6535353660583496, |
|
"learning_rate": 2.0916334661354585e-05, |
|
"loss": 0.7358, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.23952095808383234, |
|
"grad_norm": 1.7455912828445435, |
|
"learning_rate": 2.390438247011952e-05, |
|
"loss": 0.7181, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2694610778443114, |
|
"grad_norm": 1.9179027080535889, |
|
"learning_rate": 2.6892430278884462e-05, |
|
"loss": 0.6732, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2994011976047904, |
|
"grad_norm": 1.792811393737793, |
|
"learning_rate": 2.9880478087649403e-05, |
|
"loss": 0.6535, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.32934131736526945, |
|
"grad_norm": 1.7675724029541016, |
|
"learning_rate": 2.9680567879325643e-05, |
|
"loss": 0.6498, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3592814371257485, |
|
"grad_norm": 1.713584303855896, |
|
"learning_rate": 2.9347826086956523e-05, |
|
"loss": 0.6196, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.38922155688622756, |
|
"grad_norm": 1.8329033851623535, |
|
"learning_rate": 2.90150842945874e-05, |
|
"loss": 0.6241, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.41916167664670656, |
|
"grad_norm": 1.9062732458114624, |
|
"learning_rate": 2.868234250221828e-05, |
|
"loss": 0.6262, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4491017964071856, |
|
"grad_norm": 2.1072585582733154, |
|
"learning_rate": 2.834960070984916e-05, |
|
"loss": 0.6337, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.47904191616766467, |
|
"grad_norm": 1.9632734060287476, |
|
"learning_rate": 2.8016858917480035e-05, |
|
"loss": 0.6225, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5089820359281437, |
|
"grad_norm": 1.9905979633331299, |
|
"learning_rate": 2.7684117125110915e-05, |
|
"loss": 0.6079, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.5389221556886228, |
|
"grad_norm": 1.9922906160354614, |
|
"learning_rate": 2.735137533274179e-05, |
|
"loss": 0.615, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5688622754491018, |
|
"grad_norm": 1.898781418800354, |
|
"learning_rate": 2.701863354037267e-05, |
|
"loss": 0.612, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5988023952095808, |
|
"grad_norm": 2.038163423538208, |
|
"learning_rate": 2.668589174800355e-05, |
|
"loss": 0.6307, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6287425149700598, |
|
"grad_norm": 1.7456048727035522, |
|
"learning_rate": 2.635314995563443e-05, |
|
"loss": 0.6021, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.6586826347305389, |
|
"grad_norm": 1.9026494026184082, |
|
"learning_rate": 2.6020408163265307e-05, |
|
"loss": 0.6123, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.688622754491018, |
|
"grad_norm": 1.8563848733901978, |
|
"learning_rate": 2.5687666370896187e-05, |
|
"loss": 0.6272, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.718562874251497, |
|
"grad_norm": 1.7864805459976196, |
|
"learning_rate": 2.5354924578527063e-05, |
|
"loss": 0.6147, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7485029940119761, |
|
"grad_norm": 1.581784963607788, |
|
"learning_rate": 2.5022182786157943e-05, |
|
"loss": 0.6003, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.7784431137724551, |
|
"grad_norm": 1.7364259958267212, |
|
"learning_rate": 2.4689440993788823e-05, |
|
"loss": 0.5972, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8083832335329342, |
|
"grad_norm": 1.889833927154541, |
|
"learning_rate": 2.43566992014197e-05, |
|
"loss": 0.6206, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.8383233532934131, |
|
"grad_norm": 1.6451694965362549, |
|
"learning_rate": 2.402395740905058e-05, |
|
"loss": 0.6084, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8682634730538922, |
|
"grad_norm": 1.9670981168746948, |
|
"learning_rate": 2.3691215616681455e-05, |
|
"loss": 0.58, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.8982035928143712, |
|
"grad_norm": 1.8126428127288818, |
|
"learning_rate": 2.3358473824312335e-05, |
|
"loss": 0.6039, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9281437125748503, |
|
"grad_norm": 1.6846369504928589, |
|
"learning_rate": 2.302573203194321e-05, |
|
"loss": 0.6032, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.9580838323353293, |
|
"grad_norm": 1.777023434638977, |
|
"learning_rate": 2.269299023957409e-05, |
|
"loss": 0.5884, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9880239520958084, |
|
"grad_norm": 1.7603051662445068, |
|
"learning_rate": 2.2360248447204967e-05, |
|
"loss": 0.6098, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.0179640718562875, |
|
"grad_norm": 1.8388431072235107, |
|
"learning_rate": 2.2027506654835847e-05, |
|
"loss": 0.5802, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0479041916167664, |
|
"grad_norm": 1.9411472082138062, |
|
"learning_rate": 2.1694764862466724e-05, |
|
"loss": 0.596, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.0778443113772456, |
|
"grad_norm": 1.8858859539031982, |
|
"learning_rate": 2.1362023070097603e-05, |
|
"loss": 0.5776, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.1077844311377245, |
|
"grad_norm": 1.9276236295700073, |
|
"learning_rate": 2.1029281277728483e-05, |
|
"loss": 0.5673, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.1377245508982037, |
|
"grad_norm": 2.1130690574645996, |
|
"learning_rate": 2.0696539485359363e-05, |
|
"loss": 0.5859, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.1676646706586826, |
|
"grad_norm": 2.248739004135132, |
|
"learning_rate": 2.0363797692990243e-05, |
|
"loss": 0.5856, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.1976047904191618, |
|
"grad_norm": 2.193042516708374, |
|
"learning_rate": 2.003105590062112e-05, |
|
"loss": 0.5803, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2275449101796407, |
|
"grad_norm": 1.9028452634811401, |
|
"learning_rate": 1.9698314108252e-05, |
|
"loss": 0.5721, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.2574850299401197, |
|
"grad_norm": 2.1685469150543213, |
|
"learning_rate": 1.9365572315882875e-05, |
|
"loss": 0.5854, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.2874251497005988, |
|
"grad_norm": 1.913358449935913, |
|
"learning_rate": 1.9032830523513755e-05, |
|
"loss": 0.5739, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.3173652694610778, |
|
"grad_norm": 2.101280450820923, |
|
"learning_rate": 1.870008873114463e-05, |
|
"loss": 0.5757, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.347305389221557, |
|
"grad_norm": 1.8946870565414429, |
|
"learning_rate": 1.836734693877551e-05, |
|
"loss": 0.5652, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.377245508982036, |
|
"grad_norm": 1.9929522275924683, |
|
"learning_rate": 1.8034605146406388e-05, |
|
"loss": 0.5717, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.407185628742515, |
|
"grad_norm": 1.9487309455871582, |
|
"learning_rate": 1.7701863354037267e-05, |
|
"loss": 0.5633, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.437125748502994, |
|
"grad_norm": 2.029733896255493, |
|
"learning_rate": 1.7369121561668147e-05, |
|
"loss": 0.566, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.467065868263473, |
|
"grad_norm": 2.0024046897888184, |
|
"learning_rate": 1.7036379769299024e-05, |
|
"loss": 0.5707, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.4970059880239521, |
|
"grad_norm": 2.075111150741577, |
|
"learning_rate": 1.6703637976929903e-05, |
|
"loss": 0.5707, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.5269461077844313, |
|
"grad_norm": 2.050729751586914, |
|
"learning_rate": 1.637089618456078e-05, |
|
"loss": 0.5769, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.55688622754491, |
|
"grad_norm": 1.8547407388687134, |
|
"learning_rate": 1.603815439219166e-05, |
|
"loss": 0.5821, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5868263473053892, |
|
"grad_norm": 1.965280294418335, |
|
"learning_rate": 1.5705412599822536e-05, |
|
"loss": 0.5611, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.6167664670658684, |
|
"grad_norm": 2.0741708278656006, |
|
"learning_rate": 1.5372670807453416e-05, |
|
"loss": 0.5786, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.6467065868263473, |
|
"grad_norm": 1.8129183053970337, |
|
"learning_rate": 1.5039929015084294e-05, |
|
"loss": 0.5716, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.6766467065868262, |
|
"grad_norm": 1.8226035833358765, |
|
"learning_rate": 1.4707187222715174e-05, |
|
"loss": 0.5739, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.7065868263473054, |
|
"grad_norm": 2.042602062225342, |
|
"learning_rate": 1.4374445430346052e-05, |
|
"loss": 0.5664, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.7365269461077846, |
|
"grad_norm": 1.8656301498413086, |
|
"learning_rate": 1.404170363797693e-05, |
|
"loss": 0.5656, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.7664670658682635, |
|
"grad_norm": 1.8996257781982422, |
|
"learning_rate": 1.3708961845607808e-05, |
|
"loss": 0.5791, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.7964071856287425, |
|
"grad_norm": 1.7967721223831177, |
|
"learning_rate": 1.3376220053238688e-05, |
|
"loss": 0.5551, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.8263473053892216, |
|
"grad_norm": 1.7963491678237915, |
|
"learning_rate": 1.3043478260869566e-05, |
|
"loss": 0.5686, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.8562874251497006, |
|
"grad_norm": 2.0248234272003174, |
|
"learning_rate": 1.2710736468500444e-05, |
|
"loss": 0.5518, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.8862275449101795, |
|
"grad_norm": 1.84022855758667, |
|
"learning_rate": 1.2377994676131322e-05, |
|
"loss": 0.5521, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.9161676646706587, |
|
"grad_norm": 2.160158157348633, |
|
"learning_rate": 1.20452528837622e-05, |
|
"loss": 0.5525, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.9461077844311379, |
|
"grad_norm": 1.9900472164154053, |
|
"learning_rate": 1.171251109139308e-05, |
|
"loss": 0.5574, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.9760479041916168, |
|
"grad_norm": 1.9723472595214844, |
|
"learning_rate": 1.1379769299023958e-05, |
|
"loss": 0.5785, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.0059880239520957, |
|
"grad_norm": 2.0681216716766357, |
|
"learning_rate": 1.1047027506654836e-05, |
|
"loss": 0.5553, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 2.035928143712575, |
|
"grad_norm": 2.1955301761627197, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 0.5657, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.065868263473054, |
|
"grad_norm": 2.136237859725952, |
|
"learning_rate": 1.0381543921916594e-05, |
|
"loss": 0.5415, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 2.095808383233533, |
|
"grad_norm": 2.073801279067993, |
|
"learning_rate": 1.0048802129547472e-05, |
|
"loss": 0.5335, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.125748502994012, |
|
"grad_norm": 2.140120506286621, |
|
"learning_rate": 9.71606033717835e-06, |
|
"loss": 0.5623, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 2.155688622754491, |
|
"grad_norm": 2.025439977645874, |
|
"learning_rate": 9.383318544809228e-06, |
|
"loss": 0.5427, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.18562874251497, |
|
"grad_norm": 2.0939273834228516, |
|
"learning_rate": 9.050576752440106e-06, |
|
"loss": 0.5542, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 2.215568862275449, |
|
"grad_norm": 2.1501097679138184, |
|
"learning_rate": 8.717834960070984e-06, |
|
"loss": 0.5559, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.245508982035928, |
|
"grad_norm": 2.0543227195739746, |
|
"learning_rate": 8.385093167701862e-06, |
|
"loss": 0.542, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 2.2754491017964074, |
|
"grad_norm": 1.9872088432312012, |
|
"learning_rate": 8.052351375332742e-06, |
|
"loss": 0.5421, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.305389221556886, |
|
"grad_norm": 2.2318456172943115, |
|
"learning_rate": 7.71960958296362e-06, |
|
"loss": 0.5575, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 2.3353293413173652, |
|
"grad_norm": 2.1007492542266846, |
|
"learning_rate": 7.386867790594499e-06, |
|
"loss": 0.5585, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.3652694610778444, |
|
"grad_norm": 2.1291024684906006, |
|
"learning_rate": 7.054125998225377e-06, |
|
"loss": 0.5606, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 2.3952095808383236, |
|
"grad_norm": 2.263563394546509, |
|
"learning_rate": 6.721384205856256e-06, |
|
"loss": 0.5585, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.4251497005988023, |
|
"grad_norm": 2.1472697257995605, |
|
"learning_rate": 6.388642413487134e-06, |
|
"loss": 0.5377, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.4550898203592815, |
|
"grad_norm": 2.075249195098877, |
|
"learning_rate": 6.055900621118012e-06, |
|
"loss": 0.5413, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.4850299401197606, |
|
"grad_norm": 2.2139124870300293, |
|
"learning_rate": 5.7231588287488905e-06, |
|
"loss": 0.5468, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.5149700598802394, |
|
"grad_norm": 2.2758278846740723, |
|
"learning_rate": 5.390417036379769e-06, |
|
"loss": 0.5619, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.5449101796407185, |
|
"grad_norm": 2.212797164916992, |
|
"learning_rate": 5.057675244010648e-06, |
|
"loss": 0.544, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.5748502994011977, |
|
"grad_norm": 2.078122615814209, |
|
"learning_rate": 4.724933451641526e-06, |
|
"loss": 0.5263, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.6047904191616764, |
|
"grad_norm": 3.65337872505188, |
|
"learning_rate": 4.3921916592724045e-06, |
|
"loss": 0.551, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.6347305389221556, |
|
"grad_norm": 2.2584967613220215, |
|
"learning_rate": 4.059449866903283e-06, |
|
"loss": 0.5445, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.6646706586826348, |
|
"grad_norm": 2.1008565425872803, |
|
"learning_rate": 3.7267080745341615e-06, |
|
"loss": 0.5451, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.694610778443114, |
|
"grad_norm": 2.453005790710449, |
|
"learning_rate": 3.3939662821650396e-06, |
|
"loss": 0.5573, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.724550898203593, |
|
"grad_norm": 2.185372829437256, |
|
"learning_rate": 3.0612244897959185e-06, |
|
"loss": 0.5399, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.754491017964072, |
|
"grad_norm": 2.158651351928711, |
|
"learning_rate": 2.7284826974267966e-06, |
|
"loss": 0.5482, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.784431137724551, |
|
"grad_norm": 2.1352522373199463, |
|
"learning_rate": 2.3957409050576756e-06, |
|
"loss": 0.5453, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.81437125748503, |
|
"grad_norm": 2.063870906829834, |
|
"learning_rate": 2.0629991126885537e-06, |
|
"loss": 0.5452, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.844311377245509, |
|
"grad_norm": 2.159843921661377, |
|
"learning_rate": 1.7302573203194322e-06, |
|
"loss": 0.5374, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.874251497005988, |
|
"grad_norm": 2.268249273300171, |
|
"learning_rate": 1.3975155279503105e-06, |
|
"loss": 0.5608, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.904191616766467, |
|
"grad_norm": 1.9996740818023682, |
|
"learning_rate": 1.078083407275954e-06, |
|
"loss": 0.5355, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.934131736526946, |
|
"grad_norm": 2.2193121910095215, |
|
"learning_rate": 7.453416149068323e-07, |
|
"loss": 0.5425, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.964071856287425, |
|
"grad_norm": 2.098750591278076, |
|
"learning_rate": 4.125998225377108e-07, |
|
"loss": 0.5527, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.9940119760479043, |
|
"grad_norm": 2.186506986618042, |
|
"learning_rate": 7.985803016858918e-08, |
|
"loss": 0.5567, |
|
"step": 2500 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 2505, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 2.1895496620572672e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|