Training in progress, step 100, checkpoint (commit b241499, verified)
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.26490066225165565,
"eval_steps": 25,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026490066225165563,
"grad_norm": 10.047337532043457,
"learning_rate": 2e-05,
"loss": 8.8879,
"step": 1
},
{
"epoch": 0.0026490066225165563,
"eval_loss": 2.289355754852295,
"eval_runtime": 6.4257,
"eval_samples_per_second": 24.744,
"eval_steps_per_second": 12.45,
"step": 1
},
{
"epoch": 0.005298013245033113,
"grad_norm": 8.795169830322266,
"learning_rate": 4e-05,
"loss": 10.083,
"step": 2
},
{
"epoch": 0.007947019867549669,
"grad_norm": 9.06167221069336,
"learning_rate": 6e-05,
"loss": 9.4119,
"step": 3
},
{
"epoch": 0.010596026490066225,
"grad_norm": 8.713191032409668,
"learning_rate": 8e-05,
"loss": 9.5243,
"step": 4
},
{
"epoch": 0.013245033112582781,
"grad_norm": 10.839781761169434,
"learning_rate": 0.0001,
"loss": 9.3628,
"step": 5
},
{
"epoch": 0.015894039735099338,
"grad_norm": 9.976048469543457,
"learning_rate": 0.00012,
"loss": 8.335,
"step": 6
},
{
"epoch": 0.018543046357615896,
"grad_norm": 9.373313903808594,
"learning_rate": 0.00014,
"loss": 8.1384,
"step": 7
},
{
"epoch": 0.02119205298013245,
"grad_norm": 14.212782859802246,
"learning_rate": 0.00016,
"loss": 8.1568,
"step": 8
},
{
"epoch": 0.02384105960264901,
"grad_norm": 11.925253868103027,
"learning_rate": 0.00018,
"loss": 8.3016,
"step": 9
},
{
"epoch": 0.026490066225165563,
"grad_norm": 11.655914306640625,
"learning_rate": 0.0002,
"loss": 7.6722,
"step": 10
},
{
"epoch": 0.02913907284768212,
"grad_norm": 10.125999450683594,
"learning_rate": 0.0001999390827019096,
"loss": 7.6015,
"step": 11
},
{
"epoch": 0.031788079470198675,
"grad_norm": 10.365988731384277,
"learning_rate": 0.00019975640502598244,
"loss": 6.6575,
"step": 12
},
{
"epoch": 0.03443708609271523,
"grad_norm": 9.90079116821289,
"learning_rate": 0.00019945218953682734,
"loss": 7.5634,
"step": 13
},
{
"epoch": 0.03708609271523179,
"grad_norm": 8.313066482543945,
"learning_rate": 0.00019902680687415705,
"loss": 7.8489,
"step": 14
},
{
"epoch": 0.039735099337748346,
"grad_norm": 9.736310005187988,
"learning_rate": 0.00019848077530122083,
"loss": 8.3458,
"step": 15
},
{
"epoch": 0.0423841059602649,
"grad_norm": 9.37343978881836,
"learning_rate": 0.00019781476007338058,
"loss": 7.0014,
"step": 16
},
{
"epoch": 0.045033112582781455,
"grad_norm": 8.530896186828613,
"learning_rate": 0.00019702957262759965,
"loss": 6.7211,
"step": 17
},
{
"epoch": 0.04768211920529802,
"grad_norm": 8.414729118347168,
"learning_rate": 0.0001961261695938319,
"loss": 6.8012,
"step": 18
},
{
"epoch": 0.05033112582781457,
"grad_norm": 25.417264938354492,
"learning_rate": 0.00019510565162951537,
"loss": 7.5455,
"step": 19
},
{
"epoch": 0.052980132450331126,
"grad_norm": 6.27047872543335,
"learning_rate": 0.00019396926207859084,
"loss": 6.9449,
"step": 20
},
{
"epoch": 0.05562913907284768,
"grad_norm": 6.81449556350708,
"learning_rate": 0.00019271838545667876,
"loss": 6.7678,
"step": 21
},
{
"epoch": 0.05827814569536424,
"grad_norm": 7.924989700317383,
"learning_rate": 0.0001913545457642601,
"loss": 7.2337,
"step": 22
},
{
"epoch": 0.060927152317880796,
"grad_norm": 6.0346293449401855,
"learning_rate": 0.0001898794046299167,
"loss": 7.1267,
"step": 23
},
{
"epoch": 0.06357615894039735,
"grad_norm": 7.905987739562988,
"learning_rate": 0.00018829475928589271,
"loss": 7.5959,
"step": 24
},
{
"epoch": 0.06622516556291391,
"grad_norm": 6.775877475738525,
"learning_rate": 0.00018660254037844388,
"loss": 7.3511,
"step": 25
},
{
"epoch": 0.06622516556291391,
"eval_loss": 1.7976596355438232,
"eval_runtime": 6.4278,
"eval_samples_per_second": 24.736,
"eval_steps_per_second": 12.446,
"step": 25
},
{
"epoch": 0.06887417218543046,
"grad_norm": 9.095159530639648,
"learning_rate": 0.0001848048096156426,
"loss": 7.3748,
"step": 26
},
{
"epoch": 0.07152317880794702,
"grad_norm": 8.567487716674805,
"learning_rate": 0.00018290375725550417,
"loss": 6.9435,
"step": 27
},
{
"epoch": 0.07417218543046358,
"grad_norm": 8.406890869140625,
"learning_rate": 0.00018090169943749476,
"loss": 7.2,
"step": 28
},
{
"epoch": 0.07682119205298013,
"grad_norm": 7.356227397918701,
"learning_rate": 0.00017880107536067218,
"loss": 8.3478,
"step": 29
},
{
"epoch": 0.07947019867549669,
"grad_norm": 7.5263776779174805,
"learning_rate": 0.0001766044443118978,
"loss": 7.1226,
"step": 30
},
{
"epoch": 0.08211920529801324,
"grad_norm": 7.3591766357421875,
"learning_rate": 0.00017431448254773944,
"loss": 6.8925,
"step": 31
},
{
"epoch": 0.0847682119205298,
"grad_norm": 6.77475643157959,
"learning_rate": 0.0001719339800338651,
"loss": 6.9482,
"step": 32
},
{
"epoch": 0.08741721854304636,
"grad_norm": 7.577714443206787,
"learning_rate": 0.00016946583704589973,
"loss": 7.6245,
"step": 33
},
{
"epoch": 0.09006622516556291,
"grad_norm": 8.364958763122559,
"learning_rate": 0.00016691306063588583,
"loss": 6.0576,
"step": 34
},
{
"epoch": 0.09271523178807947,
"grad_norm": 6.96998929977417,
"learning_rate": 0.00016427876096865394,
"loss": 7.0662,
"step": 35
},
{
"epoch": 0.09536423841059603,
"grad_norm": 7.5627593994140625,
"learning_rate": 0.0001615661475325658,
"loss": 8.0993,
"step": 36
},
{
"epoch": 0.09801324503311258,
"grad_norm": 7.6041107177734375,
"learning_rate": 0.00015877852522924732,
"loss": 7.5794,
"step": 37
},
{
"epoch": 0.10066225165562914,
"grad_norm": 6.256470680236816,
"learning_rate": 0.0001559192903470747,
"loss": 7.9102,
"step": 38
},
{
"epoch": 0.10331125827814569,
"grad_norm": 6.799134731292725,
"learning_rate": 0.0001529919264233205,
"loss": 6.9581,
"step": 39
},
{
"epoch": 0.10596026490066225,
"grad_norm": 6.711977481842041,
"learning_rate": 0.00015000000000000001,
"loss": 7.503,
"step": 40
},
{
"epoch": 0.10860927152317881,
"grad_norm": 6.807905197143555,
"learning_rate": 0.00014694715627858908,
"loss": 8.0716,
"step": 41
},
{
"epoch": 0.11125827814569536,
"grad_norm": 9.389381408691406,
"learning_rate": 0.00014383711467890774,
"loss": 6.842,
"step": 42
},
{
"epoch": 0.11390728476821192,
"grad_norm": 6.579544544219971,
"learning_rate": 0.00014067366430758004,
"loss": 6.9505,
"step": 43
},
{
"epoch": 0.11655629139072848,
"grad_norm": 6.2934184074401855,
"learning_rate": 0.00013746065934159123,
"loss": 6.7775,
"step": 44
},
{
"epoch": 0.11920529801324503,
"grad_norm": 6.887556076049805,
"learning_rate": 0.00013420201433256689,
"loss": 7.4255,
"step": 45
},
{
"epoch": 0.12185430463576159,
"grad_norm": 6.750312805175781,
"learning_rate": 0.00013090169943749476,
"loss": 6.7287,
"step": 46
},
{
"epoch": 0.12450331125827814,
"grad_norm": 5.679995059967041,
"learning_rate": 0.0001275637355816999,
"loss": 6.5704,
"step": 47
},
{
"epoch": 0.1271523178807947,
"grad_norm": 8.375008583068848,
"learning_rate": 0.00012419218955996676,
"loss": 6.5227,
"step": 48
},
{
"epoch": 0.12980132450331125,
"grad_norm": 6.405532360076904,
"learning_rate": 0.00012079116908177593,
"loss": 7.0324,
"step": 49
},
{
"epoch": 0.13245033112582782,
"grad_norm": 8.176584243774414,
"learning_rate": 0.00011736481776669306,
"loss": 6.9371,
"step": 50
},
{
"epoch": 0.13245033112582782,
"eval_loss": 1.7544549703598022,
"eval_runtime": 6.4152,
"eval_samples_per_second": 24.785,
"eval_steps_per_second": 12.47,
"step": 50
},
{
"epoch": 0.13509933774834437,
"grad_norm": 8.039058685302734,
"learning_rate": 0.00011391731009600654,
"loss": 6.7803,
"step": 51
},
{
"epoch": 0.13774834437086092,
"grad_norm": 5.868615627288818,
"learning_rate": 0.00011045284632676536,
"loss": 7.0988,
"step": 52
},
{
"epoch": 0.1403973509933775,
"grad_norm": 6.132235527038574,
"learning_rate": 0.00010697564737441252,
"loss": 6.5921,
"step": 53
},
{
"epoch": 0.14304635761589404,
"grad_norm": 6.561174392700195,
"learning_rate": 0.00010348994967025012,
"loss": 6.8622,
"step": 54
},
{
"epoch": 0.1456953642384106,
"grad_norm": 6.470300674438477,
"learning_rate": 0.0001,
"loss": 7.5364,
"step": 55
},
{
"epoch": 0.14834437086092717,
"grad_norm": 7.035355567932129,
"learning_rate": 9.651005032974994e-05,
"loss": 6.9205,
"step": 56
},
{
"epoch": 0.1509933774834437,
"grad_norm": 6.995288848876953,
"learning_rate": 9.302435262558747e-05,
"loss": 6.7877,
"step": 57
},
{
"epoch": 0.15364238410596026,
"grad_norm": 7.111328601837158,
"learning_rate": 8.954715367323468e-05,
"loss": 6.6399,
"step": 58
},
{
"epoch": 0.1562913907284768,
"grad_norm": 6.084195137023926,
"learning_rate": 8.608268990399349e-05,
"loss": 7.2281,
"step": 59
},
{
"epoch": 0.15894039735099338,
"grad_norm": 7.517582893371582,
"learning_rate": 8.263518223330697e-05,
"loss": 6.7618,
"step": 60
},
{
"epoch": 0.16158940397350993,
"grad_norm": 7.0816874504089355,
"learning_rate": 7.920883091822408e-05,
"loss": 7.3287,
"step": 61
},
{
"epoch": 0.16423841059602648,
"grad_norm": 6.235696792602539,
"learning_rate": 7.580781044003324e-05,
"loss": 6.5588,
"step": 62
},
{
"epoch": 0.16688741721854305,
"grad_norm": 9.056792259216309,
"learning_rate": 7.243626441830009e-05,
"loss": 7.5109,
"step": 63
},
{
"epoch": 0.1695364238410596,
"grad_norm": 7.157319068908691,
"learning_rate": 6.909830056250527e-05,
"loss": 6.6009,
"step": 64
},
{
"epoch": 0.17218543046357615,
"grad_norm": 6.171271324157715,
"learning_rate": 6.579798566743314e-05,
"loss": 7.2154,
"step": 65
},
{
"epoch": 0.17483443708609273,
"grad_norm": 6.188497543334961,
"learning_rate": 6.25393406584088e-05,
"loss": 7.397,
"step": 66
},
{
"epoch": 0.17748344370860927,
"grad_norm": 5.943967819213867,
"learning_rate": 5.9326335692419995e-05,
"loss": 7.0184,
"step": 67
},
{
"epoch": 0.18013245033112582,
"grad_norm": 7.671113967895508,
"learning_rate": 5.616288532109225e-05,
"loss": 6.6367,
"step": 68
},
{
"epoch": 0.1827814569536424,
"grad_norm": 6.130111217498779,
"learning_rate": 5.305284372141095e-05,
"loss": 6.6626,
"step": 69
},
{
"epoch": 0.18543046357615894,
"grad_norm": 9.644905090332031,
"learning_rate": 5.000000000000002e-05,
"loss": 7.2466,
"step": 70
},
{
"epoch": 0.1880794701986755,
"grad_norm": 6.909838676452637,
"learning_rate": 4.700807357667952e-05,
"loss": 7.1044,
"step": 71
},
{
"epoch": 0.19072847682119207,
"grad_norm": 6.281887531280518,
"learning_rate": 4.4080709652925336e-05,
"loss": 6.2072,
"step": 72
},
{
"epoch": 0.19337748344370861,
"grad_norm": 6.749195575714111,
"learning_rate": 4.12214747707527e-05,
"loss": 6.725,
"step": 73
},
{
"epoch": 0.19602649006622516,
"grad_norm": 5.841734886169434,
"learning_rate": 3.843385246743417e-05,
"loss": 6.9915,
"step": 74
},
{
"epoch": 0.1986754966887417,
"grad_norm": 5.667914390563965,
"learning_rate": 3.5721239031346066e-05,
"loss": 5.9325,
"step": 75
},
{
"epoch": 0.1986754966887417,
"eval_loss": 1.7390562295913696,
"eval_runtime": 6.4222,
"eval_samples_per_second": 24.758,
"eval_steps_per_second": 12.457,
"step": 75
},
{
"epoch": 0.20132450331125828,
"grad_norm": 6.836699962615967,
"learning_rate": 3.308693936411421e-05,
"loss": 6.6397,
"step": 76
},
{
"epoch": 0.20397350993377483,
"grad_norm": 7.016422748565674,
"learning_rate": 3.053416295410026e-05,
"loss": 6.9548,
"step": 77
},
{
"epoch": 0.20662251655629138,
"grad_norm": 5.971282482147217,
"learning_rate": 2.8066019966134904e-05,
"loss": 6.9251,
"step": 78
},
{
"epoch": 0.20927152317880796,
"grad_norm": 5.713533878326416,
"learning_rate": 2.5685517452260567e-05,
"loss": 6.7799,
"step": 79
},
{
"epoch": 0.2119205298013245,
"grad_norm": 6.973901748657227,
"learning_rate": 2.339555568810221e-05,
"loss": 6.9072,
"step": 80
},
{
"epoch": 0.21456953642384105,
"grad_norm": 6.7183685302734375,
"learning_rate": 2.119892463932781e-05,
"loss": 6.9971,
"step": 81
},
{
"epoch": 0.21721854304635763,
"grad_norm": 6.681792259216309,
"learning_rate": 1.9098300562505266e-05,
"loss": 7.233,
"step": 82
},
{
"epoch": 0.21986754966887417,
"grad_norm": 8.250860214233398,
"learning_rate": 1.7096242744495837e-05,
"loss": 7.0648,
"step": 83
},
{
"epoch": 0.22251655629139072,
"grad_norm": 6.514826774597168,
"learning_rate": 1.5195190384357404e-05,
"loss": 6.2877,
"step": 84
},
{
"epoch": 0.2251655629139073,
"grad_norm": 6.5112786293029785,
"learning_rate": 1.339745962155613e-05,
"loss": 6.7192,
"step": 85
},
{
"epoch": 0.22781456953642384,
"grad_norm": 6.814388751983643,
"learning_rate": 1.1705240714107302e-05,
"loss": 7.2217,
"step": 86
},
{
"epoch": 0.2304635761589404,
"grad_norm": 5.921008586883545,
"learning_rate": 1.0120595370083318e-05,
"loss": 7.1328,
"step": 87
},
{
"epoch": 0.23311258278145697,
"grad_norm": 5.6203932762146,
"learning_rate": 8.645454235739903e-06,
"loss": 5.9535,
"step": 88
},
{
"epoch": 0.23576158940397351,
"grad_norm": 6.021057605743408,
"learning_rate": 7.281614543321269e-06,
"loss": 6.8638,
"step": 89
},
{
"epoch": 0.23841059602649006,
"grad_norm": 7.241350173950195,
"learning_rate": 6.030737921409169e-06,
"loss": 7.3687,
"step": 90
},
{
"epoch": 0.2410596026490066,
"grad_norm": 6.832858562469482,
"learning_rate": 4.8943483704846475e-06,
"loss": 6.4625,
"step": 91
},
{
"epoch": 0.24370860927152319,
"grad_norm": 6.067267894744873,
"learning_rate": 3.873830406168111e-06,
"loss": 6.7384,
"step": 92
},
{
"epoch": 0.24635761589403973,
"grad_norm": 7.329842567443848,
"learning_rate": 2.970427372400353e-06,
"loss": 6.5468,
"step": 93
},
{
"epoch": 0.24900662251655628,
"grad_norm": 5.869202136993408,
"learning_rate": 2.1852399266194314e-06,
"loss": 7.0172,
"step": 94
},
{
"epoch": 0.25165562913907286,
"grad_norm": 5.81821870803833,
"learning_rate": 1.5192246987791981e-06,
"loss": 6.985,
"step": 95
},
{
"epoch": 0.2543046357615894,
"grad_norm": 7.167514801025391,
"learning_rate": 9.731931258429638e-07,
"loss": 7.0374,
"step": 96
},
{
"epoch": 0.25695364238410595,
"grad_norm": 6.343252182006836,
"learning_rate": 5.478104631726711e-07,
"loss": 5.98,
"step": 97
},
{
"epoch": 0.2596026490066225,
"grad_norm": 6.643914222717285,
"learning_rate": 2.4359497401758024e-07,
"loss": 6.9213,
"step": 98
},
{
"epoch": 0.26225165562913905,
"grad_norm": 5.431641101837158,
"learning_rate": 6.09172980904238e-08,
"loss": 7.2081,
"step": 99
},
{
"epoch": 0.26490066225165565,
"grad_norm": 6.084027290344238,
"learning_rate": 0.0,
"loss": 6.7836,
"step": 100
},
{
"epoch": 0.26490066225165565,
"eval_loss": 1.736009955406189,
"eval_runtime": 6.4349,
"eval_samples_per_second": 24.709,
"eval_steps_per_second": 12.432,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.002425230445773e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
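For reference, the `learning_rate` values in `log_history` are consistent with a linear warmup to a peak of 2e-4 over the first 10 optimizer steps, followed by cosine decay to zero over the remaining 90 steps (the shape produced by a warmup-plus-cosine scheduler such as transformers' `get_cosine_schedule_with_warmup`). Below is a minimal sketch that reproduces the logged values under those assumed hyperparameters; the peak LR, warmup length, and total steps are inferred from the log above, not stated explicitly in this file.

```python
import math

# Assumed hyperparameters, inferred from the log above (not stated
# explicitly in this file): peak LR 2e-4, 10 warmup steps, 100 total steps.
PEAK_LR = 2e-4
WARMUP_STEPS = 10
TOTAL_STEPS = 100

def scheduled_lr(step: int) -> float:
    """Learning rate at a given step: linear warmup, then cosine decay to 0."""
    if step <= WARMUP_STEPS:
        return PEAK_LR * step / WARMUP_STEPS
    progress = (step - WARMUP_STEPS) / (TOTAL_STEPS - WARMUP_STEPS)
    return PEAK_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

# Spot-check against values taken from log_history above.
for step, logged in [
    (1, 2e-05),
    (10, 0.0002),
    (11, 0.0001999390827019096),
    (55, 0.0001),
    (99, 6.09172980904238e-08),
    (100, 0.0),
]:
    assert math.isclose(scheduled_lr(step), logged, rel_tol=1e-9, abs_tol=1e-12), step
print("logged learning rates match a warmup + cosine schedule")
```

Since `global_step` equals `max_steps` and `should_training_stop` is true, this checkpoint marks the end of the run; an intermediate checkpoint saved in the same format could be resumed with transformers' `Trainer` via `trainer.train(resume_from_checkpoint=...)`.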