“Sara
adding model files
45522b4
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.6,
"eval_steps": 50,
"global_step": 2800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1,
"grad_norm": 10.53576374053955,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.6618,
"step": 50
},
{
"epoch": 0.1,
"eval_loss": 0.7731789350509644,
"eval_runtime": 2.2494,
"eval_samples_per_second": 69.353,
"eval_steps_per_second": 3.557,
"step": 50
},
{
"epoch": 0.2,
"grad_norm": 5.800010681152344,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7629,
"step": 100
},
{
"epoch": 0.2,
"eval_loss": 0.6901325583457947,
"eval_runtime": 2.2539,
"eval_samples_per_second": 69.213,
"eval_steps_per_second": 3.549,
"step": 100
},
{
"epoch": 0.3,
"grad_norm": 4.960265636444092,
"learning_rate": 6e-06,
"loss": 0.7256,
"step": 150
},
{
"epoch": 0.3,
"eval_loss": 0.6716309785842896,
"eval_runtime": 2.2526,
"eval_samples_per_second": 69.254,
"eval_steps_per_second": 3.551,
"step": 150
},
{
"epoch": 0.4,
"grad_norm": 5.574848651885986,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7243,
"step": 200
},
{
"epoch": 0.4,
"eval_loss": 0.6644517779350281,
"eval_runtime": 2.2546,
"eval_samples_per_second": 69.193,
"eval_steps_per_second": 3.548,
"step": 200
},
{
"epoch": 0.5,
"grad_norm": 3.0581891536712646,
"learning_rate": 1e-05,
"loss": 0.6918,
"step": 250
},
{
"epoch": 0.5,
"eval_loss": 0.6718080043792725,
"eval_runtime": 2.255,
"eval_samples_per_second": 69.18,
"eval_steps_per_second": 3.548,
"step": 250
},
{
"epoch": 0.6,
"grad_norm": 3.797400712966919,
"learning_rate": 1.2e-05,
"loss": 0.7433,
"step": 300
},
{
"epoch": 0.6,
"eval_loss": 0.67710280418396,
"eval_runtime": 2.2558,
"eval_samples_per_second": 69.155,
"eval_steps_per_second": 3.546,
"step": 300
},
{
"epoch": 0.7,
"grad_norm": 8.121636390686035,
"learning_rate": 1.4e-05,
"loss": 0.7523,
"step": 350
},
{
"epoch": 0.7,
"eval_loss": 0.680716335773468,
"eval_runtime": 2.2562,
"eval_samples_per_second": 69.144,
"eval_steps_per_second": 3.546,
"step": 350
},
{
"epoch": 0.8,
"grad_norm": 2.615454912185669,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.7322,
"step": 400
},
{
"epoch": 0.8,
"eval_loss": 0.6906686425209045,
"eval_runtime": 2.2633,
"eval_samples_per_second": 68.926,
"eval_steps_per_second": 3.535,
"step": 400
},
{
"epoch": 0.9,
"grad_norm": 2.9651033878326416,
"learning_rate": 1.8e-05,
"loss": 0.7497,
"step": 450
},
{
"epoch": 0.9,
"eval_loss": 0.6827173233032227,
"eval_runtime": 2.5909,
"eval_samples_per_second": 60.21,
"eval_steps_per_second": 3.088,
"step": 450
},
{
"epoch": 1.0,
"grad_norm": 3.7542426586151123,
"learning_rate": 2e-05,
"loss": 0.7622,
"step": 500
},
{
"epoch": 1.0,
"eval_loss": 0.6903170347213745,
"eval_runtime": 2.4721,
"eval_samples_per_second": 63.105,
"eval_steps_per_second": 3.236,
"step": 500
},
{
"epoch": 1.1,
"grad_norm": 2.384434938430786,
"learning_rate": 1.999390827019096e-05,
"loss": 0.484,
"step": 550
},
{
"epoch": 1.1,
"eval_loss": 0.7237842679023743,
"eval_runtime": 2.9344,
"eval_samples_per_second": 53.162,
"eval_steps_per_second": 2.726,
"step": 550
},
{
"epoch": 1.2,
"grad_norm": 3.1198794841766357,
"learning_rate": 1.9975640502598243e-05,
"loss": 0.5145,
"step": 600
},
{
"epoch": 1.2,
"eval_loss": 0.7352678179740906,
"eval_runtime": 3.8017,
"eval_samples_per_second": 41.034,
"eval_steps_per_second": 2.104,
"step": 600
},
{
"epoch": 1.3,
"grad_norm": 6.234444618225098,
"learning_rate": 1.9945218953682736e-05,
"loss": 0.5093,
"step": 650
},
{
"epoch": 1.3,
"eval_loss": 0.7311124801635742,
"eval_runtime": 2.2672,
"eval_samples_per_second": 68.808,
"eval_steps_per_second": 3.529,
"step": 650
},
{
"epoch": 1.4,
"grad_norm": 2.112931489944458,
"learning_rate": 1.9902680687415704e-05,
"loss": 0.5248,
"step": 700
},
{
"epoch": 1.4,
"eval_loss": 0.734488844871521,
"eval_runtime": 2.2746,
"eval_samples_per_second": 68.582,
"eval_steps_per_second": 3.517,
"step": 700
},
{
"epoch": 1.5,
"grad_norm": 3.4556541442871094,
"learning_rate": 1.9848077530122083e-05,
"loss": 0.5107,
"step": 750
},
{
"epoch": 1.5,
"eval_loss": 0.723623514175415,
"eval_runtime": 2.256,
"eval_samples_per_second": 69.148,
"eval_steps_per_second": 3.546,
"step": 750
},
{
"epoch": 1.6,
"grad_norm": 3.025707960128784,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.5171,
"step": 800
},
{
"epoch": 1.6,
"eval_loss": 0.7228586077690125,
"eval_runtime": 2.2603,
"eval_samples_per_second": 69.016,
"eval_steps_per_second": 3.539,
"step": 800
},
{
"epoch": 1.7,
"grad_norm": 2.2873287200927734,
"learning_rate": 1.9702957262759964e-05,
"loss": 0.5391,
"step": 850
},
{
"epoch": 1.7,
"eval_loss": 0.7198938727378845,
"eval_runtime": 2.4311,
"eval_samples_per_second": 64.168,
"eval_steps_per_second": 3.291,
"step": 850
},
{
"epoch": 1.8,
"grad_norm": 3.1473968029022217,
"learning_rate": 1.961261695938319e-05,
"loss": 0.5244,
"step": 900
},
{
"epoch": 1.8,
"eval_loss": 0.7222604751586914,
"eval_runtime": 2.6131,
"eval_samples_per_second": 59.699,
"eval_steps_per_second": 3.061,
"step": 900
},
{
"epoch": 1.9,
"grad_norm": 2.5658185482025146,
"learning_rate": 1.9510565162951538e-05,
"loss": 0.5435,
"step": 950
},
{
"epoch": 1.9,
"eval_loss": 0.7172784209251404,
"eval_runtime": 3.0626,
"eval_samples_per_second": 50.937,
"eval_steps_per_second": 2.612,
"step": 950
},
{
"epoch": 2.0,
"grad_norm": 3.090545415878296,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.5197,
"step": 1000
},
{
"epoch": 2.0,
"eval_loss": 0.7204703092575073,
"eval_runtime": 3.4963,
"eval_samples_per_second": 44.619,
"eval_steps_per_second": 2.288,
"step": 1000
},
{
"epoch": 2.1,
"grad_norm": 1.921531081199646,
"learning_rate": 1.9271838545667876e-05,
"loss": 0.2538,
"step": 1050
},
{
"epoch": 2.1,
"eval_loss": 0.791098952293396,
"eval_runtime": 2.2604,
"eval_samples_per_second": 69.014,
"eval_steps_per_second": 3.539,
"step": 1050
},
{
"epoch": 2.2,
"grad_norm": 1.807320475578308,
"learning_rate": 1.913545457642601e-05,
"loss": 0.2521,
"step": 1100
},
{
"epoch": 2.2,
"eval_loss": 0.8204991221427917,
"eval_runtime": 2.2623,
"eval_samples_per_second": 68.956,
"eval_steps_per_second": 3.536,
"step": 1100
},
{
"epoch": 2.3,
"grad_norm": 2.746616840362549,
"learning_rate": 1.8987940462991673e-05,
"loss": 0.2687,
"step": 1150
},
{
"epoch": 2.3,
"eval_loss": 0.8025296330451965,
"eval_runtime": 2.2565,
"eval_samples_per_second": 69.132,
"eval_steps_per_second": 3.545,
"step": 1150
},
{
"epoch": 2.4,
"grad_norm": 2.3170738220214844,
"learning_rate": 1.8829475928589272e-05,
"loss": 0.2689,
"step": 1200
},
{
"epoch": 2.4,
"eval_loss": 0.8150458931922913,
"eval_runtime": 2.2607,
"eval_samples_per_second": 69.005,
"eval_steps_per_second": 3.539,
"step": 1200
},
{
"epoch": 2.5,
"grad_norm": 1.9649097919464111,
"learning_rate": 1.866025403784439e-05,
"loss": 0.2772,
"step": 1250
},
{
"epoch": 2.5,
"eval_loss": 0.7988224625587463,
"eval_runtime": 2.5979,
"eval_samples_per_second": 60.048,
"eval_steps_per_second": 3.079,
"step": 1250
},
{
"epoch": 2.6,
"grad_norm": 2.264338970184326,
"learning_rate": 1.848048096156426e-05,
"loss": 0.2788,
"step": 1300
},
{
"epoch": 2.6,
"eval_loss": 0.8175423741340637,
"eval_runtime": 3.4025,
"eval_samples_per_second": 45.849,
"eval_steps_per_second": 2.351,
"step": 1300
},
{
"epoch": 2.7,
"grad_norm": 2.027390241622925,
"learning_rate": 1.8290375725550417e-05,
"loss": 0.2742,
"step": 1350
},
{
"epoch": 2.7,
"eval_loss": 0.8078347444534302,
"eval_runtime": 2.7124,
"eval_samples_per_second": 57.513,
"eval_steps_per_second": 2.949,
"step": 1350
},
{
"epoch": 2.8,
"grad_norm": 1.8391352891921997,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.2749,
"step": 1400
},
{
"epoch": 2.8,
"eval_loss": 0.804284393787384,
"eval_runtime": 2.9467,
"eval_samples_per_second": 52.94,
"eval_steps_per_second": 2.715,
"step": 1400
},
{
"epoch": 2.9,
"grad_norm": 1.9982004165649414,
"learning_rate": 1.788010753606722e-05,
"loss": 0.2717,
"step": 1450
},
{
"epoch": 2.9,
"eval_loss": 0.7994141578674316,
"eval_runtime": 2.2711,
"eval_samples_per_second": 68.688,
"eval_steps_per_second": 3.522,
"step": 1450
},
{
"epoch": 3.0,
"grad_norm": 1.782399296760559,
"learning_rate": 1.766044443118978e-05,
"loss": 0.2715,
"step": 1500
},
{
"epoch": 3.0,
"eval_loss": 0.804834246635437,
"eval_runtime": 2.2867,
"eval_samples_per_second": 68.222,
"eval_steps_per_second": 3.499,
"step": 1500
},
{
"epoch": 3.1,
"grad_norm": 1.8651448488235474,
"learning_rate": 1.7431448254773943e-05,
"loss": 0.1627,
"step": 1550
},
{
"epoch": 3.1,
"eval_loss": 0.859173595905304,
"eval_runtime": 2.2588,
"eval_samples_per_second": 69.062,
"eval_steps_per_second": 3.542,
"step": 1550
},
{
"epoch": 3.2,
"grad_norm": 1.4768388271331787,
"learning_rate": 1.7193398003386514e-05,
"loss": 0.1651,
"step": 1600
},
{
"epoch": 3.2,
"eval_loss": 0.868316650390625,
"eval_runtime": 2.259,
"eval_samples_per_second": 69.058,
"eval_steps_per_second": 3.541,
"step": 1600
},
{
"epoch": 3.3,
"grad_norm": 1.4704113006591797,
"learning_rate": 1.6946583704589973e-05,
"loss": 0.1702,
"step": 1650
},
{
"epoch": 3.3,
"eval_loss": 0.872775137424469,
"eval_runtime": 2.8294,
"eval_samples_per_second": 55.136,
"eval_steps_per_second": 2.827,
"step": 1650
},
{
"epoch": 3.4,
"grad_norm": 1.082715630531311,
"learning_rate": 1.6691306063588583e-05,
"loss": 0.1734,
"step": 1700
},
{
"epoch": 3.4,
"eval_loss": 0.8728486895561218,
"eval_runtime": 3.3787,
"eval_samples_per_second": 46.171,
"eval_steps_per_second": 2.368,
"step": 1700
},
{
"epoch": 3.5,
"grad_norm": 2.210588216781616,
"learning_rate": 1.6427876096865394e-05,
"loss": 0.1752,
"step": 1750
},
{
"epoch": 3.5,
"eval_loss": 0.8705567717552185,
"eval_runtime": 3.1278,
"eval_samples_per_second": 49.875,
"eval_steps_per_second": 2.558,
"step": 1750
},
{
"epoch": 3.6,
"grad_norm": 1.4183433055877686,
"learning_rate": 1.6156614753256583e-05,
"loss": 0.1706,
"step": 1800
},
{
"epoch": 3.6,
"eval_loss": 0.8853814601898193,
"eval_runtime": 3.6433,
"eval_samples_per_second": 42.818,
"eval_steps_per_second": 2.196,
"step": 1800
},
{
"epoch": 3.7,
"grad_norm": 1.4250963926315308,
"learning_rate": 1.5877852522924733e-05,
"loss": 0.1784,
"step": 1850
},
{
"epoch": 3.7,
"eval_loss": 0.884819507598877,
"eval_runtime": 2.2666,
"eval_samples_per_second": 68.827,
"eval_steps_per_second": 3.53,
"step": 1850
},
{
"epoch": 3.8,
"grad_norm": 1.252785563468933,
"learning_rate": 1.5591929034707468e-05,
"loss": 0.1729,
"step": 1900
},
{
"epoch": 3.8,
"eval_loss": 0.8708668351173401,
"eval_runtime": 2.2648,
"eval_samples_per_second": 68.88,
"eval_steps_per_second": 3.532,
"step": 1900
},
{
"epoch": 3.9,
"grad_norm": 1.4024217128753662,
"learning_rate": 1.529919264233205e-05,
"loss": 0.174,
"step": 1950
},
{
"epoch": 3.9,
"eval_loss": 0.8670658469200134,
"eval_runtime": 2.2608,
"eval_samples_per_second": 69.003,
"eval_steps_per_second": 3.539,
"step": 1950
},
{
"epoch": 4.0,
"grad_norm": 1.6221123933792114,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.174,
"step": 2000
},
{
"epoch": 4.0,
"eval_loss": 0.8709214925765991,
"eval_runtime": 2.2598,
"eval_samples_per_second": 69.033,
"eval_steps_per_second": 3.54,
"step": 2000
},
{
"epoch": 4.1,
"grad_norm": 1.5479576587677002,
"learning_rate": 1.469471562785891e-05,
"loss": 0.1167,
"step": 2050
},
{
"epoch": 4.1,
"eval_loss": 0.9011654853820801,
"eval_runtime": 2.738,
"eval_samples_per_second": 56.976,
"eval_steps_per_second": 2.922,
"step": 2050
},
{
"epoch": 4.2,
"grad_norm": 1.3002970218658447,
"learning_rate": 1.4383711467890776e-05,
"loss": 0.1186,
"step": 2100
},
{
"epoch": 4.2,
"eval_loss": 0.9147914052009583,
"eval_runtime": 3.018,
"eval_samples_per_second": 51.69,
"eval_steps_per_second": 2.651,
"step": 2100
},
{
"epoch": 4.3,
"grad_norm": 1.7996995449066162,
"learning_rate": 1.4067366430758004e-05,
"loss": 0.1153,
"step": 2150
},
{
"epoch": 4.3,
"eval_loss": 0.9160046577453613,
"eval_runtime": 3.6692,
"eval_samples_per_second": 42.516,
"eval_steps_per_second": 2.18,
"step": 2150
},
{
"epoch": 4.4,
"grad_norm": 1.1670547723770142,
"learning_rate": 1.3746065934159123e-05,
"loss": 0.1214,
"step": 2200
},
{
"epoch": 4.4,
"eval_loss": 0.9355931282043457,
"eval_runtime": 2.337,
"eval_samples_per_second": 66.753,
"eval_steps_per_second": 3.423,
"step": 2200
},
{
"epoch": 4.5,
"grad_norm": 1.1401852369308472,
"learning_rate": 1.342020143325669e-05,
"loss": 0.1193,
"step": 2250
},
{
"epoch": 4.5,
"eval_loss": 0.9175124764442444,
"eval_runtime": 2.2626,
"eval_samples_per_second": 68.947,
"eval_steps_per_second": 3.536,
"step": 2250
},
{
"epoch": 4.6,
"grad_norm": 0.8389841914176941,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.1186,
"step": 2300
},
{
"epoch": 4.6,
"eval_loss": 0.9386661052703857,
"eval_runtime": 2.2532,
"eval_samples_per_second": 69.235,
"eval_steps_per_second": 3.55,
"step": 2300
},
{
"epoch": 4.7,
"grad_norm": 1.2419942617416382,
"learning_rate": 1.2756373558169992e-05,
"loss": 0.1187,
"step": 2350
},
{
"epoch": 4.7,
"eval_loss": 0.9336636662483215,
"eval_runtime": 2.2535,
"eval_samples_per_second": 69.225,
"eval_steps_per_second": 3.55,
"step": 2350
},
{
"epoch": 4.8,
"grad_norm": 1.0060522556304932,
"learning_rate": 1.2419218955996677e-05,
"loss": 0.1245,
"step": 2400
},
{
"epoch": 4.8,
"eval_loss": 0.9188296794891357,
"eval_runtime": 2.2614,
"eval_samples_per_second": 68.983,
"eval_steps_per_second": 3.538,
"step": 2400
},
{
"epoch": 4.9,
"grad_norm": 0.7993331551551819,
"learning_rate": 1.2079116908177592e-05,
"loss": 0.1222,
"step": 2450
},
{
"epoch": 4.9,
"eval_loss": 0.9250988364219666,
"eval_runtime": 2.4444,
"eval_samples_per_second": 63.82,
"eval_steps_per_second": 3.273,
"step": 2450
},
{
"epoch": 5.0,
"grad_norm": 1.1892589330673218,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.1186,
"step": 2500
},
{
"epoch": 5.0,
"eval_loss": 0.9481778144836426,
"eval_runtime": 3.3935,
"eval_samples_per_second": 45.97,
"eval_steps_per_second": 2.357,
"step": 2500
},
{
"epoch": 5.1,
"grad_norm": 0.7223986983299255,
"learning_rate": 1.1391731009600655e-05,
"loss": 0.0726,
"step": 2550
},
{
"epoch": 5.1,
"eval_loss": 0.974181056022644,
"eval_runtime": 2.9499,
"eval_samples_per_second": 52.883,
"eval_steps_per_second": 2.712,
"step": 2550
},
{
"epoch": 5.2,
"grad_norm": 0.7545835971832275,
"learning_rate": 1.1045284632676535e-05,
"loss": 0.0717,
"step": 2600
},
{
"epoch": 5.2,
"eval_loss": 0.9890027046203613,
"eval_runtime": 2.7635,
"eval_samples_per_second": 56.449,
"eval_steps_per_second": 2.895,
"step": 2600
},
{
"epoch": 5.3,
"grad_norm": 1.2251814603805542,
"learning_rate": 1.0697564737441254e-05,
"loss": 0.072,
"step": 2650
},
{
"epoch": 5.3,
"eval_loss": 0.9911813735961914,
"eval_runtime": 2.2537,
"eval_samples_per_second": 69.22,
"eval_steps_per_second": 3.55,
"step": 2650
},
{
"epoch": 5.4,
"grad_norm": 0.45753681659698486,
"learning_rate": 1.0348994967025012e-05,
"loss": 0.0718,
"step": 2700
},
{
"epoch": 5.4,
"eval_loss": 0.9854485988616943,
"eval_runtime": 2.2539,
"eval_samples_per_second": 69.212,
"eval_steps_per_second": 3.549,
"step": 2700
},
{
"epoch": 5.5,
"grad_norm": 1.0563805103302002,
"learning_rate": 1e-05,
"loss": 0.072,
"step": 2750
},
{
"epoch": 5.5,
"eval_loss": 0.9962345957756042,
"eval_runtime": 2.2507,
"eval_samples_per_second": 69.313,
"eval_steps_per_second": 3.555,
"step": 2750
},
{
"epoch": 5.6,
"grad_norm": 1.6450284719467163,
"learning_rate": 9.651005032974994e-06,
"loss": 0.0699,
"step": 2800
},
{
"epoch": 5.6,
"eval_loss": 0.9950909614562988,
"eval_runtime": 2.2532,
"eval_samples_per_second": 69.235,
"eval_steps_per_second": 3.551,
"step": 2800
}
],
"logging_steps": 50,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 400,
"total_flos": 1.3524716052545536e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}