{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2505,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029940119760479042,
"grad_norm": 1.6695125102996826,
"learning_rate": 2.98804780876494e-06,
"loss": 1.1663,
"step": 25
},
{
"epoch": 0.059880239520958084,
"grad_norm": 1.2466001510620117,
"learning_rate": 5.97609561752988e-06,
"loss": 1.0872,
"step": 50
},
{
"epoch": 0.08982035928143713,
"grad_norm": 1.3031201362609863,
"learning_rate": 8.964143426294822e-06,
"loss": 1.0409,
"step": 75
},
{
"epoch": 0.11976047904191617,
"grad_norm": 1.2915183305740356,
"learning_rate": 1.195219123505976e-05,
"loss": 0.9647,
"step": 100
},
{
"epoch": 0.1497005988023952,
"grad_norm": 1.2883236408233643,
"learning_rate": 1.4940239043824702e-05,
"loss": 0.8845,
"step": 125
},
{
"epoch": 0.17964071856287425,
"grad_norm": 1.2346601486206055,
"learning_rate": 1.7928286852589643e-05,
"loss": 0.7993,
"step": 150
},
{
"epoch": 0.20958083832335328,
"grad_norm": 1.6535353660583496,
"learning_rate": 2.0916334661354585e-05,
"loss": 0.7358,
"step": 175
},
{
"epoch": 0.23952095808383234,
"grad_norm": 1.7455912828445435,
"learning_rate": 2.390438247011952e-05,
"loss": 0.7181,
"step": 200
},
{
"epoch": 0.2694610778443114,
"grad_norm": 1.9179027080535889,
"learning_rate": 2.6892430278884462e-05,
"loss": 0.6732,
"step": 225
},
{
"epoch": 0.2994011976047904,
"grad_norm": 1.792811393737793,
"learning_rate": 2.9880478087649403e-05,
"loss": 0.6535,
"step": 250
},
{
"epoch": 0.32934131736526945,
"grad_norm": 1.7675724029541016,
"learning_rate": 2.9680567879325643e-05,
"loss": 0.6498,
"step": 275
},
{
"epoch": 0.3592814371257485,
"grad_norm": 1.713584303855896,
"learning_rate": 2.9347826086956523e-05,
"loss": 0.6196,
"step": 300
},
{
"epoch": 0.38922155688622756,
"grad_norm": 1.8329033851623535,
"learning_rate": 2.90150842945874e-05,
"loss": 0.6241,
"step": 325
},
{
"epoch": 0.41916167664670656,
"grad_norm": 1.9062732458114624,
"learning_rate": 2.868234250221828e-05,
"loss": 0.6262,
"step": 350
},
{
"epoch": 0.4491017964071856,
"grad_norm": 2.1072585582733154,
"learning_rate": 2.834960070984916e-05,
"loss": 0.6337,
"step": 375
},
{
"epoch": 0.47904191616766467,
"grad_norm": 1.9632734060287476,
"learning_rate": 2.8016858917480035e-05,
"loss": 0.6225,
"step": 400
},
{
"epoch": 0.5089820359281437,
"grad_norm": 1.9905979633331299,
"learning_rate": 2.7684117125110915e-05,
"loss": 0.6079,
"step": 425
},
{
"epoch": 0.5389221556886228,
"grad_norm": 1.9922906160354614,
"learning_rate": 2.735137533274179e-05,
"loss": 0.615,
"step": 450
},
{
"epoch": 0.5688622754491018,
"grad_norm": 1.898781418800354,
"learning_rate": 2.701863354037267e-05,
"loss": 0.612,
"step": 475
},
{
"epoch": 0.5988023952095808,
"grad_norm": 2.038163423538208,
"learning_rate": 2.668589174800355e-05,
"loss": 0.6307,
"step": 500
},
{
"epoch": 0.6287425149700598,
"grad_norm": 1.7456048727035522,
"learning_rate": 2.635314995563443e-05,
"loss": 0.6021,
"step": 525
},
{
"epoch": 0.6586826347305389,
"grad_norm": 1.9026494026184082,
"learning_rate": 2.6020408163265307e-05,
"loss": 0.6123,
"step": 550
},
{
"epoch": 0.688622754491018,
"grad_norm": 1.8563848733901978,
"learning_rate": 2.5687666370896187e-05,
"loss": 0.6272,
"step": 575
},
{
"epoch": 0.718562874251497,
"grad_norm": 1.7864805459976196,
"learning_rate": 2.5354924578527063e-05,
"loss": 0.6147,
"step": 600
},
{
"epoch": 0.7485029940119761,
"grad_norm": 1.581784963607788,
"learning_rate": 2.5022182786157943e-05,
"loss": 0.6003,
"step": 625
},
{
"epoch": 0.7784431137724551,
"grad_norm": 1.7364259958267212,
"learning_rate": 2.4689440993788823e-05,
"loss": 0.5972,
"step": 650
},
{
"epoch": 0.8083832335329342,
"grad_norm": 1.889833927154541,
"learning_rate": 2.43566992014197e-05,
"loss": 0.6206,
"step": 675
},
{
"epoch": 0.8383233532934131,
"grad_norm": 1.6451694965362549,
"learning_rate": 2.402395740905058e-05,
"loss": 0.6084,
"step": 700
},
{
"epoch": 0.8682634730538922,
"grad_norm": 1.9670981168746948,
"learning_rate": 2.3691215616681455e-05,
"loss": 0.58,
"step": 725
},
{
"epoch": 0.8982035928143712,
"grad_norm": 1.8126428127288818,
"learning_rate": 2.3358473824312335e-05,
"loss": 0.6039,
"step": 750
},
{
"epoch": 0.9281437125748503,
"grad_norm": 1.6846369504928589,
"learning_rate": 2.302573203194321e-05,
"loss": 0.6032,
"step": 775
},
{
"epoch": 0.9580838323353293,
"grad_norm": 1.777023434638977,
"learning_rate": 2.269299023957409e-05,
"loss": 0.5884,
"step": 800
},
{
"epoch": 0.9880239520958084,
"grad_norm": 1.7603051662445068,
"learning_rate": 2.2360248447204967e-05,
"loss": 0.6098,
"step": 825
},
{
"epoch": 1.0179640718562875,
"grad_norm": 1.8388431072235107,
"learning_rate": 2.2027506654835847e-05,
"loss": 0.5802,
"step": 850
},
{
"epoch": 1.0479041916167664,
"grad_norm": 1.9411472082138062,
"learning_rate": 2.1694764862466724e-05,
"loss": 0.596,
"step": 875
},
{
"epoch": 1.0778443113772456,
"grad_norm": 1.8858859539031982,
"learning_rate": 2.1362023070097603e-05,
"loss": 0.5776,
"step": 900
},
{
"epoch": 1.1077844311377245,
"grad_norm": 1.9276236295700073,
"learning_rate": 2.1029281277728483e-05,
"loss": 0.5673,
"step": 925
},
{
"epoch": 1.1377245508982037,
"grad_norm": 2.1130690574645996,
"learning_rate": 2.0696539485359363e-05,
"loss": 0.5859,
"step": 950
},
{
"epoch": 1.1676646706586826,
"grad_norm": 2.248739004135132,
"learning_rate": 2.0363797692990243e-05,
"loss": 0.5856,
"step": 975
},
{
"epoch": 1.1976047904191618,
"grad_norm": 2.193042516708374,
"learning_rate": 2.003105590062112e-05,
"loss": 0.5803,
"step": 1000
},
{
"epoch": 1.2275449101796407,
"grad_norm": 1.9028452634811401,
"learning_rate": 1.9698314108252e-05,
"loss": 0.5721,
"step": 1025
},
{
"epoch": 1.2574850299401197,
"grad_norm": 2.1685469150543213,
"learning_rate": 1.9365572315882875e-05,
"loss": 0.5854,
"step": 1050
},
{
"epoch": 1.2874251497005988,
"grad_norm": 1.913358449935913,
"learning_rate": 1.9032830523513755e-05,
"loss": 0.5739,
"step": 1075
},
{
"epoch": 1.3173652694610778,
"grad_norm": 2.101280450820923,
"learning_rate": 1.870008873114463e-05,
"loss": 0.5757,
"step": 1100
},
{
"epoch": 1.347305389221557,
"grad_norm": 1.8946870565414429,
"learning_rate": 1.836734693877551e-05,
"loss": 0.5652,
"step": 1125
},
{
"epoch": 1.377245508982036,
"grad_norm": 1.9929522275924683,
"learning_rate": 1.8034605146406388e-05,
"loss": 0.5717,
"step": 1150
},
{
"epoch": 1.407185628742515,
"grad_norm": 1.9487309455871582,
"learning_rate": 1.7701863354037267e-05,
"loss": 0.5633,
"step": 1175
},
{
"epoch": 1.437125748502994,
"grad_norm": 2.029733896255493,
"learning_rate": 1.7369121561668147e-05,
"loss": 0.566,
"step": 1200
},
{
"epoch": 1.467065868263473,
"grad_norm": 2.0024046897888184,
"learning_rate": 1.7036379769299024e-05,
"loss": 0.5707,
"step": 1225
},
{
"epoch": 1.4970059880239521,
"grad_norm": 2.075111150741577,
"learning_rate": 1.6703637976929903e-05,
"loss": 0.5707,
"step": 1250
},
{
"epoch": 1.5269461077844313,
"grad_norm": 2.050729751586914,
"learning_rate": 1.637089618456078e-05,
"loss": 0.5769,
"step": 1275
},
{
"epoch": 1.55688622754491,
"grad_norm": 1.8547407388687134,
"learning_rate": 1.603815439219166e-05,
"loss": 0.5821,
"step": 1300
},
{
"epoch": 1.5868263473053892,
"grad_norm": 1.965280294418335,
"learning_rate": 1.5705412599822536e-05,
"loss": 0.5611,
"step": 1325
},
{
"epoch": 1.6167664670658684,
"grad_norm": 2.0741708278656006,
"learning_rate": 1.5372670807453416e-05,
"loss": 0.5786,
"step": 1350
},
{
"epoch": 1.6467065868263473,
"grad_norm": 1.8129183053970337,
"learning_rate": 1.5039929015084294e-05,
"loss": 0.5716,
"step": 1375
},
{
"epoch": 1.6766467065868262,
"grad_norm": 1.8226035833358765,
"learning_rate": 1.4707187222715174e-05,
"loss": 0.5739,
"step": 1400
},
{
"epoch": 1.7065868263473054,
"grad_norm": 2.042602062225342,
"learning_rate": 1.4374445430346052e-05,
"loss": 0.5664,
"step": 1425
},
{
"epoch": 1.7365269461077846,
"grad_norm": 1.8656301498413086,
"learning_rate": 1.404170363797693e-05,
"loss": 0.5656,
"step": 1450
},
{
"epoch": 1.7664670658682635,
"grad_norm": 1.8996257781982422,
"learning_rate": 1.3708961845607808e-05,
"loss": 0.5791,
"step": 1475
},
{
"epoch": 1.7964071856287425,
"grad_norm": 1.7967721223831177,
"learning_rate": 1.3376220053238688e-05,
"loss": 0.5551,
"step": 1500
},
{
"epoch": 1.8263473053892216,
"grad_norm": 1.7963491678237915,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.5686,
"step": 1525
},
{
"epoch": 1.8562874251497006,
"grad_norm": 2.0248234272003174,
"learning_rate": 1.2710736468500444e-05,
"loss": 0.5518,
"step": 1550
},
{
"epoch": 1.8862275449101795,
"grad_norm": 1.84022855758667,
"learning_rate": 1.2377994676131322e-05,
"loss": 0.5521,
"step": 1575
},
{
"epoch": 1.9161676646706587,
"grad_norm": 2.160158157348633,
"learning_rate": 1.20452528837622e-05,
"loss": 0.5525,
"step": 1600
},
{
"epoch": 1.9461077844311379,
"grad_norm": 1.9900472164154053,
"learning_rate": 1.171251109139308e-05,
"loss": 0.5574,
"step": 1625
},
{
"epoch": 1.9760479041916168,
"grad_norm": 1.9723472595214844,
"learning_rate": 1.1379769299023958e-05,
"loss": 0.5785,
"step": 1650
},
{
"epoch": 2.0059880239520957,
"grad_norm": 2.0681216716766357,
"learning_rate": 1.1047027506654836e-05,
"loss": 0.5553,
"step": 1675
},
{
"epoch": 2.035928143712575,
"grad_norm": 2.1955301761627197,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.5657,
"step": 1700
},
{
"epoch": 2.065868263473054,
"grad_norm": 2.136237859725952,
"learning_rate": 1.0381543921916594e-05,
"loss": 0.5415,
"step": 1725
},
{
"epoch": 2.095808383233533,
"grad_norm": 2.073801279067993,
"learning_rate": 1.0048802129547472e-05,
"loss": 0.5335,
"step": 1750
},
{
"epoch": 2.125748502994012,
"grad_norm": 2.140120506286621,
"learning_rate": 9.71606033717835e-06,
"loss": 0.5623,
"step": 1775
},
{
"epoch": 2.155688622754491,
"grad_norm": 2.025439977645874,
"learning_rate": 9.383318544809228e-06,
"loss": 0.5427,
"step": 1800
},
{
"epoch": 2.18562874251497,
"grad_norm": 2.0939273834228516,
"learning_rate": 9.050576752440106e-06,
"loss": 0.5542,
"step": 1825
},
{
"epoch": 2.215568862275449,
"grad_norm": 2.1501097679138184,
"learning_rate": 8.717834960070984e-06,
"loss": 0.5559,
"step": 1850
},
{
"epoch": 2.245508982035928,
"grad_norm": 2.0543227195739746,
"learning_rate": 8.385093167701862e-06,
"loss": 0.542,
"step": 1875
},
{
"epoch": 2.2754491017964074,
"grad_norm": 1.9872088432312012,
"learning_rate": 8.052351375332742e-06,
"loss": 0.5421,
"step": 1900
},
{
"epoch": 2.305389221556886,
"grad_norm": 2.2318456172943115,
"learning_rate": 7.71960958296362e-06,
"loss": 0.5575,
"step": 1925
},
{
"epoch": 2.3353293413173652,
"grad_norm": 2.1007492542266846,
"learning_rate": 7.386867790594499e-06,
"loss": 0.5585,
"step": 1950
},
{
"epoch": 2.3652694610778444,
"grad_norm": 2.1291024684906006,
"learning_rate": 7.054125998225377e-06,
"loss": 0.5606,
"step": 1975
},
{
"epoch": 2.3952095808383236,
"grad_norm": 2.263563394546509,
"learning_rate": 6.721384205856256e-06,
"loss": 0.5585,
"step": 2000
},
{
"epoch": 2.4251497005988023,
"grad_norm": 2.1472697257995605,
"learning_rate": 6.388642413487134e-06,
"loss": 0.5377,
"step": 2025
},
{
"epoch": 2.4550898203592815,
"grad_norm": 2.075249195098877,
"learning_rate": 6.055900621118012e-06,
"loss": 0.5413,
"step": 2050
},
{
"epoch": 2.4850299401197606,
"grad_norm": 2.2139124870300293,
"learning_rate": 5.7231588287488905e-06,
"loss": 0.5468,
"step": 2075
},
{
"epoch": 2.5149700598802394,
"grad_norm": 2.2758278846740723,
"learning_rate": 5.390417036379769e-06,
"loss": 0.5619,
"step": 2100
},
{
"epoch": 2.5449101796407185,
"grad_norm": 2.212797164916992,
"learning_rate": 5.057675244010648e-06,
"loss": 0.544,
"step": 2125
},
{
"epoch": 2.5748502994011977,
"grad_norm": 2.078122615814209,
"learning_rate": 4.724933451641526e-06,
"loss": 0.5263,
"step": 2150
},
{
"epoch": 2.6047904191616764,
"grad_norm": 3.65337872505188,
"learning_rate": 4.3921916592724045e-06,
"loss": 0.551,
"step": 2175
},
{
"epoch": 2.6347305389221556,
"grad_norm": 2.2584967613220215,
"learning_rate": 4.059449866903283e-06,
"loss": 0.5445,
"step": 2200
},
{
"epoch": 2.6646706586826348,
"grad_norm": 2.1008565425872803,
"learning_rate": 3.7267080745341615e-06,
"loss": 0.5451,
"step": 2225
},
{
"epoch": 2.694610778443114,
"grad_norm": 2.453005790710449,
"learning_rate": 3.3939662821650396e-06,
"loss": 0.5573,
"step": 2250
},
{
"epoch": 2.724550898203593,
"grad_norm": 2.185372829437256,
"learning_rate": 3.0612244897959185e-06,
"loss": 0.5399,
"step": 2275
},
{
"epoch": 2.754491017964072,
"grad_norm": 2.158651351928711,
"learning_rate": 2.7284826974267966e-06,
"loss": 0.5482,
"step": 2300
},
{
"epoch": 2.784431137724551,
"grad_norm": 2.1352522373199463,
"learning_rate": 2.3957409050576756e-06,
"loss": 0.5453,
"step": 2325
},
{
"epoch": 2.81437125748503,
"grad_norm": 2.063870906829834,
"learning_rate": 2.0629991126885537e-06,
"loss": 0.5452,
"step": 2350
},
{
"epoch": 2.844311377245509,
"grad_norm": 2.159843921661377,
"learning_rate": 1.7302573203194322e-06,
"loss": 0.5374,
"step": 2375
},
{
"epoch": 2.874251497005988,
"grad_norm": 2.268249273300171,
"learning_rate": 1.3975155279503105e-06,
"loss": 0.5608,
"step": 2400
},
{
"epoch": 2.904191616766467,
"grad_norm": 1.9996740818023682,
"learning_rate": 1.078083407275954e-06,
"loss": 0.5355,
"step": 2425
},
{
"epoch": 2.934131736526946,
"grad_norm": 2.2193121910095215,
"learning_rate": 7.453416149068323e-07,
"loss": 0.5425,
"step": 2450
},
{
"epoch": 2.964071856287425,
"grad_norm": 2.098750591278076,
"learning_rate": 4.125998225377108e-07,
"loss": 0.5527,
"step": 2475
},
{
"epoch": 2.9940119760479043,
"grad_norm": 2.186506986618042,
"learning_rate": 7.985803016858918e-08,
"loss": 0.5567,
"step": 2500
}
],
"logging_steps": 25,
"max_steps": 2505,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 2.1895496620572672e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}