{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9953917050691246,
"eval_steps": 500,
"global_step": 758,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026333113890717576,
"grad_norm": 31.375,
"learning_rate": 3.947368421052631e-06,
"loss": 2.687,
"step": 1
},
{
"epoch": 0.013166556945358789,
"grad_norm": 11.375,
"learning_rate": 1.9736842105263155e-05,
"loss": 2.3265,
"step": 5
},
{
"epoch": 0.026333113890717578,
"grad_norm": 3.5625,
"learning_rate": 3.947368421052631e-05,
"loss": 1.834,
"step": 10
},
{
"epoch": 0.03949967083607637,
"grad_norm": 2.828125,
"learning_rate": 5.921052631578947e-05,
"loss": 1.6157,
"step": 15
},
{
"epoch": 0.052666227781435156,
"grad_norm": 7.34375,
"learning_rate": 7.894736842105262e-05,
"loss": 1.5546,
"step": 20
},
{
"epoch": 0.06583278472679395,
"grad_norm": 1.6796875,
"learning_rate": 9.868421052631579e-05,
"loss": 1.561,
"step": 25
},
{
"epoch": 0.07899934167215274,
"grad_norm": 1.875,
"learning_rate": 0.00011842105263157894,
"loss": 1.4433,
"step": 30
},
{
"epoch": 0.09216589861751152,
"grad_norm": 2.0,
"learning_rate": 0.0001381578947368421,
"loss": 1.5053,
"step": 35
},
{
"epoch": 0.10533245556287031,
"grad_norm": 4.9375,
"learning_rate": 0.00015789473684210524,
"loss": 1.5204,
"step": 40
},
{
"epoch": 0.1184990125082291,
"grad_norm": 2.03125,
"learning_rate": 0.00017763157894736838,
"loss": 1.5645,
"step": 45
},
{
"epoch": 0.1316655694535879,
"grad_norm": 2.140625,
"learning_rate": 0.00019736842105263157,
"loss": 1.5742,
"step": 50
},
{
"epoch": 0.1448321263989467,
"grad_norm": 1.9765625,
"learning_rate": 0.00021710526315789472,
"loss": 1.6198,
"step": 55
},
{
"epoch": 0.15799868334430547,
"grad_norm": 2.125,
"learning_rate": 0.00023684210526315788,
"loss": 1.6436,
"step": 60
},
{
"epoch": 0.17116524028966426,
"grad_norm": 2.125,
"learning_rate": 0.00025657894736842105,
"loss": 1.6867,
"step": 65
},
{
"epoch": 0.18433179723502305,
"grad_norm": 2.359375,
"learning_rate": 0.0002763157894736842,
"loss": 1.7356,
"step": 70
},
{
"epoch": 0.19749835418038184,
"grad_norm": 3.859375,
"learning_rate": 0.00029605263157894733,
"loss": 1.7819,
"step": 75
},
{
"epoch": 0.21066491112574062,
"grad_norm": 5.40625,
"learning_rate": 0.0002999745375637391,
"loss": 1.9272,
"step": 80
},
{
"epoch": 0.2238314680710994,
"grad_norm": 5.15625,
"learning_rate": 0.00029987111123173417,
"loss": 2.0363,
"step": 85
},
{
"epoch": 0.2369980250164582,
"grad_norm": 3.109375,
"learning_rate": 0.00029968818442293417,
"loss": 1.8288,
"step": 90
},
{
"epoch": 0.250164581961817,
"grad_norm": 2.84375,
"learning_rate": 0.00029942585417250744,
"loss": 1.8436,
"step": 95
},
{
"epoch": 0.2633311389071758,
"grad_norm": 2.40625,
"learning_rate": 0.00029908425963589115,
"loss": 1.7724,
"step": 100
},
{
"epoch": 0.2764976958525346,
"grad_norm": 1.875,
"learning_rate": 0.00029866358201497474,
"loss": 1.7534,
"step": 105
},
{
"epoch": 0.2896642527978934,
"grad_norm": 1.9765625,
"learning_rate": 0.0002981640444619799,
"loss": 1.7532,
"step": 110
},
{
"epoch": 0.30283080974325216,
"grad_norm": 2.796875,
"learning_rate": 0.00029758591196108743,
"loss": 1.7545,
"step": 115
},
{
"epoch": 0.31599736668861095,
"grad_norm": 2.015625,
"learning_rate": 0.00029692949118787415,
"loss": 1.8269,
"step": 120
},
{
"epoch": 0.32916392363396973,
"grad_norm": 2.125,
"learning_rate": 0.0002961951303466338,
"loss": 1.7823,
"step": 125
},
{
"epoch": 0.3423304805793285,
"grad_norm": 2.71875,
"learning_rate": 0.0002953832189856691,
"loss": 1.7371,
"step": 130
},
{
"epoch": 0.3554970375246873,
"grad_norm": 1.9921875,
"learning_rate": 0.00029449418779065257,
"loss": 1.7607,
"step": 135
},
{
"epoch": 0.3686635944700461,
"grad_norm": 2.109375,
"learning_rate": 0.00029352850835616504,
"loss": 1.7956,
"step": 140
},
{
"epoch": 0.3818301514154049,
"grad_norm": 2.0625,
"learning_rate": 0.00029248669293553437,
"loss": 1.7176,
"step": 145
},
{
"epoch": 0.39499670836076367,
"grad_norm": 1.9453125,
"learning_rate": 0.0002913692941691059,
"loss": 1.843,
"step": 150
},
{
"epoch": 0.40816326530612246,
"grad_norm": 1.828125,
"learning_rate": 0.0002901769047910895,
"loss": 1.7918,
"step": 155
},
{
"epoch": 0.42132982225148125,
"grad_norm": 1.5625,
"learning_rate": 0.0002889101573151384,
"loss": 1.7714,
"step": 160
},
{
"epoch": 0.43449637919684003,
"grad_norm": 2.671875,
"learning_rate": 0.00028756972369882667,
"loss": 1.8033,
"step": 165
},
{
"epoch": 0.4476629361421988,
"grad_norm": 1.6015625,
"learning_rate": 0.0002861563149872031,
"loss": 1.8409,
"step": 170
},
{
"epoch": 0.4608294930875576,
"grad_norm": 1.8203125,
"learning_rate": 0.0002846706809356112,
"loss": 1.8259,
"step": 175
},
{
"epoch": 0.4739960500329164,
"grad_norm": 1.8125,
"learning_rate": 0.0002831136096119747,
"loss": 1.7612,
"step": 180
},
{
"epoch": 0.4871626069782752,
"grad_norm": 1.796875,
"learning_rate": 0.0002814859269787596,
"loss": 1.7649,
"step": 185
},
{
"epoch": 0.500329163923634,
"grad_norm": 1.8359375,
"learning_rate": 0.0002797884964548353,
"loss": 1.7443,
"step": 190
},
{
"epoch": 0.5134957208689928,
"grad_norm": 1.5703125,
"learning_rate": 0.0002780222184574662,
"loss": 1.7219,
"step": 195
},
{
"epoch": 0.5266622778143516,
"grad_norm": 1.6328125,
"learning_rate": 0.0002761880299246772,
"loss": 1.7409,
"step": 200
},
{
"epoch": 0.5398288347597103,
"grad_norm": 1.59375,
"learning_rate": 0.00027428690381824637,
"loss": 1.7043,
"step": 205
},
{
"epoch": 0.5529953917050692,
"grad_norm": 1.6171875,
"learning_rate": 0.00027231984860758907,
"loss": 1.6709,
"step": 210
},
{
"epoch": 0.5661619486504279,
"grad_norm": 2.0,
"learning_rate": 0.000270287907734806,
"loss": 1.7417,
"step": 215
},
{
"epoch": 0.5793285055957867,
"grad_norm": 1.65625,
"learning_rate": 0.0002681921590611799,
"loss": 1.66,
"step": 220
},
{
"epoch": 0.5924950625411455,
"grad_norm": 1.8671875,
"learning_rate": 0.0002660337142954145,
"loss": 1.732,
"step": 225
},
{
"epoch": 0.6056616194865043,
"grad_norm": 1.5234375,
"learning_rate": 0.0002638137184039186,
"loss": 1.6964,
"step": 230
},
{
"epoch": 0.618828176431863,
"grad_norm": 1.625,
"learning_rate": 0.00026153334900344853,
"loss": 1.648,
"step": 235
},
{
"epoch": 0.6319947333772219,
"grad_norm": 1.375,
"learning_rate": 0.0002591938157364303,
"loss": 1.6197,
"step": 240
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.7109375,
"learning_rate": 0.00025679635962929455,
"loss": 1.701,
"step": 245
},
{
"epoch": 0.6583278472679395,
"grad_norm": 1.640625,
"learning_rate": 0.00025434225243416234,
"loss": 1.7649,
"step": 250
},
{
"epoch": 0.6714944042132982,
"grad_norm": 1.6328125,
"learning_rate": 0.0002518327959542333,
"loss": 1.712,
"step": 255
},
{
"epoch": 0.684660961158657,
"grad_norm": 1.796875,
"learning_rate": 0.0002492693213532321,
"loss": 1.6628,
"step": 260
},
{
"epoch": 0.6978275181040158,
"grad_norm": 2.015625,
"learning_rate": 0.0002466531884492808,
"loss": 1.6714,
"step": 265
},
{
"epoch": 0.7109940750493746,
"grad_norm": 1.9921875,
"learning_rate": 0.0002439857849935712,
"loss": 1.6833,
"step": 270
},
{
"epoch": 0.7241606319947334,
"grad_norm": 1.703125,
"learning_rate": 0.00024126852593421967,
"loss": 1.7174,
"step": 275
},
{
"epoch": 0.7373271889400922,
"grad_norm": 1.6015625,
"learning_rate": 0.0002385028526656952,
"loss": 1.6437,
"step": 280
},
{
"epoch": 0.7504937458854509,
"grad_norm": 1.7109375,
"learning_rate": 0.00023569023226421883,
"loss": 1.6515,
"step": 285
},
{
"epoch": 0.7636603028308098,
"grad_norm": 1.6015625,
"learning_rate": 0.0002328321567095398,
"loss": 1.6352,
"step": 290
},
{
"epoch": 0.7768268597761685,
"grad_norm": 1.625,
"learning_rate": 0.00022993014209350167,
"loss": 1.6205,
"step": 295
},
{
"epoch": 0.7899934167215273,
"grad_norm": 1.546875,
"learning_rate": 0.00022698572781581757,
"loss": 1.6508,
"step": 300
},
{
"epoch": 0.8031599736668861,
"grad_norm": 1.453125,
"learning_rate": 0.0002240004757674819,
"loss": 1.5989,
"step": 305
},
{
"epoch": 0.8163265306122449,
"grad_norm": 1.8046875,
"learning_rate": 0.00022097596950225134,
"loss": 1.6176,
"step": 310
},
{
"epoch": 0.8294930875576036,
"grad_norm": 1.3671875,
"learning_rate": 0.00021791381339663423,
"loss": 1.6204,
"step": 315
},
{
"epoch": 0.8426596445029625,
"grad_norm": 1.5390625,
"learning_rate": 0.00021481563179883502,
"loss": 1.5592,
"step": 320
},
{
"epoch": 0.8558262014483212,
"grad_norm": 1.3125,
"learning_rate": 0.00021168306816710393,
"loss": 1.5973,
"step": 325
},
{
"epoch": 0.8689927583936801,
"grad_norm": 1.421875,
"learning_rate": 0.0002085177841979498,
"loss": 1.5367,
"step": 330
},
{
"epoch": 0.8821593153390388,
"grad_norm": 1.6796875,
"learning_rate": 0.00020532145894467828,
"loss": 1.5283,
"step": 335
},
{
"epoch": 0.8953258722843976,
"grad_norm": 1.46875,
"learning_rate": 0.000202095787926723,
"loss": 1.5374,
"step": 340
},
{
"epoch": 0.9084924292297564,
"grad_norm": 1.515625,
"learning_rate": 0.00019884248223024203,
"loss": 1.5021,
"step": 345
},
{
"epoch": 0.9216589861751152,
"grad_norm": 1.3046875,
"learning_rate": 0.00019556326760045658,
"loss": 1.5345,
"step": 350
},
{
"epoch": 0.934825543120474,
"grad_norm": 1.328125,
"learning_rate": 0.00019225988352621445,
"loss": 1.5164,
"step": 355
},
{
"epoch": 0.9479921000658328,
"grad_norm": 1.3984375,
"learning_rate": 0.0001889340823172622,
"loss": 1.4778,
"step": 360
},
{
"epoch": 0.9611586570111915,
"grad_norm": 1.2734375,
"learning_rate": 0.00018558762817471678,
"loss": 1.5624,
"step": 365
},
{
"epoch": 0.9743252139565504,
"grad_norm": 1.453125,
"learning_rate": 0.00018222229625522928,
"loss": 1.527,
"step": 370
},
{
"epoch": 0.9874917709019092,
"grad_norm": 1.609375,
"learning_rate": 0.00017883987172933707,
"loss": 1.4608,
"step": 375
},
{
"epoch": 1.0,
"grad_norm": 1.3671875,
"learning_rate": 0.0001754421488345041,
"loss": 1.4084,
"step": 380
},
{
"epoch": 1.0131665569453587,
"grad_norm": 1.4375,
"learning_rate": 0.00017203092992335137,
"loss": 1.013,
"step": 385
},
{
"epoch": 1.0263331138907177,
"grad_norm": 1.3984375,
"learning_rate": 0.0001686080245075831,
"loss": 1.0124,
"step": 390
},
{
"epoch": 1.0394996708360764,
"grad_norm": 1.53125,
"learning_rate": 0.0001651752482981148,
"loss": 1.0275,
"step": 395
},
{
"epoch": 1.0526662277814351,
"grad_norm": 1.2265625,
"learning_rate": 0.00016173442224191309,
"loss": 0.9538,
"step": 400
},
{
"epoch": 1.0658327847267939,
"grad_norm": 1.2109375,
"learning_rate": 0.00015828737155605804,
"loss": 0.9683,
"step": 405
},
{
"epoch": 1.0789993416721528,
"grad_norm": 1.21875,
"learning_rate": 0.0001548359247595405,
"loss": 1.0414,
"step": 410
},
{
"epoch": 1.0921658986175116,
"grad_norm": 1.2578125,
"learning_rate": 0.00015138191270330773,
"loss": 0.9749,
"step": 415
},
{
"epoch": 1.1053324555628703,
"grad_norm": 1.4921875,
"learning_rate": 0.00014792716759907186,
"loss": 0.9802,
"step": 420
},
{
"epoch": 1.118499012508229,
"grad_norm": 1.34375,
"learning_rate": 0.00014447352204739712,
"loss": 0.9399,
"step": 425
},
{
"epoch": 1.131665569453588,
"grad_norm": 1.21875,
"learning_rate": 0.00014102280806558006,
"loss": 1.0111,
"step": 430
},
{
"epoch": 1.1448321263989467,
"grad_norm": 1.2890625,
"learning_rate": 0.00013757685611583983,
"loss": 0.9483,
"step": 435
},
{
"epoch": 1.1579986833443054,
"grad_norm": 1.15625,
"learning_rate": 0.00013413749413433273,
"loss": 0.9546,
"step": 440
},
{
"epoch": 1.1711652402896642,
"grad_norm": 1.2734375,
"learning_rate": 0.0001307065465615073,
"loss": 0.9294,
"step": 445
},
{
"epoch": 1.1843317972350231,
"grad_norm": 1.2265625,
"learning_rate": 0.00012728583337431353,
"loss": 0.9498,
"step": 450
},
{
"epoch": 1.1974983541803819,
"grad_norm": 1.296875,
"learning_rate": 0.0001238771691207795,
"loss": 0.942,
"step": 455
},
{
"epoch": 1.2106649111257406,
"grad_norm": 1.4375,
"learning_rate": 0.00012048236195746822,
"loss": 0.9069,
"step": 460
},
{
"epoch": 1.2238314680710993,
"grad_norm": 1.5078125,
"learning_rate": 0.00011710321269032502,
"loss": 0.9452,
"step": 465
},
{
"epoch": 1.2369980250164583,
"grad_norm": 1.3984375,
"learning_rate": 0.00011374151381942327,
"loss": 0.9533,
"step": 470
},
{
"epoch": 1.250164581961817,
"grad_norm": 1.375,
"learning_rate": 0.00011039904858811712,
"loss": 0.9229,
"step": 475
},
{
"epoch": 1.2633311389071757,
"grad_norm": 1.1015625,
"learning_rate": 0.00010707759003710384,
"loss": 0.8528,
"step": 480
},
{
"epoch": 1.2764976958525347,
"grad_norm": 1.328125,
"learning_rate": 0.00010377890006389856,
"loss": 0.8836,
"step": 485
},
{
"epoch": 1.2896642527978934,
"grad_norm": 1.3203125,
"learning_rate": 0.00010050472848821968,
"loss": 0.9177,
"step": 490
},
{
"epoch": 1.3028308097432522,
"grad_norm": 1.296875,
"learning_rate": 9.725681212378167e-05,
"loss": 0.8867,
"step": 495
},
{
"epoch": 1.315997366688611,
"grad_norm": 1.2421875,
"learning_rate": 9.403687385698632e-05,
"loss": 0.9074,
"step": 500
},
{
"epoch": 1.3291639236339696,
"grad_norm": 1.1796875,
"learning_rate": 9.084662173300223e-05,
"loss": 0.8652,
"step": 505
},
{
"epoch": 1.3423304805793286,
"grad_norm": 1.1796875,
"learning_rate": 8.768774804971705e-05,
"loss": 0.8758,
"step": 510
},
{
"epoch": 1.3554970375246873,
"grad_norm": 3.953125,
"learning_rate": 8.456192846004275e-05,
"loss": 0.8357,
"step": 515
},
{
"epoch": 1.368663594470046,
"grad_norm": 1.0703125,
"learning_rate": 8.147082108305058e-05,
"loss": 0.8258,
"step": 520
},
{
"epoch": 1.381830151415405,
"grad_norm": 1.1015625,
"learning_rate": 7.84160656244067e-05,
"loss": 0.906,
"step": 525
},
{
"epoch": 1.3949967083607637,
"grad_norm": 1.1484375,
"learning_rate": 7.539928250657594e-05,
"loss": 0.809,
"step": 530
},
{
"epoch": 1.4081632653061225,
"grad_norm": 1.15625,
"learning_rate": 7.242207200925383e-05,
"loss": 0.7685,
"step": 535
},
{
"epoch": 1.4213298222514812,
"grad_norm": 1.1171875,
"learning_rate": 6.948601342048397e-05,
"loss": 0.8473,
"step": 540
},
{
"epoch": 1.43449637919684,
"grad_norm": 1.1015625,
"learning_rate": 6.65926641989106e-05,
"loss": 0.8022,
"step": 545
},
{
"epoch": 1.4476629361421989,
"grad_norm": 1.15625,
"learning_rate": 6.374355914761062e-05,
"loss": 0.7762,
"step": 550
},
{
"epoch": 1.4608294930875576,
"grad_norm": 1.109375,
"learning_rate": 6.094020959994336e-05,
"loss": 0.862,
"step": 555
},
{
"epoch": 1.4739960500329163,
"grad_norm": 1.0703125,
"learning_rate": 5.818410261785056e-05,
"loss": 0.793,
"step": 560
},
{
"epoch": 1.4871626069782753,
"grad_norm": 1.0625,
"learning_rate": 5.5476700203030643e-05,
"loss": 0.7979,
"step": 565
},
{
"epoch": 1.500329163923634,
"grad_norm": 1.28125,
"learning_rate": 5.281943852140697e-05,
"loss": 0.8223,
"step": 570
},
{
"epoch": 1.5134957208689928,
"grad_norm": 1.0078125,
"learning_rate": 5.021372714130087e-05,
"loss": 0.84,
"step": 575
},
{
"epoch": 1.5266622778143515,
"grad_norm": 1.21875,
"learning_rate": 4.766094828571313e-05,
"loss": 0.7897,
"step": 580
},
{
"epoch": 1.5398288347597102,
"grad_norm": 1.0859375,
"learning_rate": 4.516245609911161e-05,
"loss": 0.7917,
"step": 585
},
{
"epoch": 1.5529953917050692,
"grad_norm": 1.1015625,
"learning_rate": 4.271957592911325e-05,
"loss": 0.7691,
"step": 590
},
{
"epoch": 1.566161948650428,
"grad_norm": 1.1875,
"learning_rate": 4.033360362344117e-05,
"loss": 0.8063,
"step": 595
},
{
"epoch": 1.5793285055957869,
"grad_norm": 1.1640625,
"learning_rate": 3.800580484253105e-05,
"loss": 0.7744,
"step": 600
},
{
"epoch": 1.5924950625411456,
"grad_norm": 1.1015625,
"learning_rate": 3.5737414388149785e-05,
"loss": 0.7701,
"step": 605
},
{
"epoch": 1.6056616194865043,
"grad_norm": 1.046875,
"learning_rate": 3.352963554838402e-05,
"loss": 0.7414,
"step": 610
},
{
"epoch": 1.618828176431863,
"grad_norm": 1.2109375,
"learning_rate": 3.138363945934523e-05,
"loss": 0.7739,
"step": 615
},
{
"epoch": 1.6319947333772218,
"grad_norm": 1.0859375,
"learning_rate": 2.9300564483929852e-05,
"loss": 0.794,
"step": 620
},
{
"epoch": 1.6451612903225805,
"grad_norm": 1.3203125,
"learning_rate": 2.728151560796454e-05,
"loss": 0.8121,
"step": 625
},
{
"epoch": 1.6583278472679395,
"grad_norm": 1.0546875,
"learning_rate": 2.5327563854056714e-05,
"loss": 0.7925,
"step": 630
},
{
"epoch": 1.6714944042132982,
"grad_norm": 1.2109375,
"learning_rate": 2.3439745713460624e-05,
"loss": 0.8124,
"step": 635
},
{
"epoch": 1.6846609611586572,
"grad_norm": 1.109375,
"learning_rate": 2.1619062596261583e-05,
"loss": 0.7899,
"step": 640
},
{
"epoch": 1.6978275181040159,
"grad_norm": 1.046875,
"learning_rate": 1.9866480300168885e-05,
"loss": 0.7489,
"step": 645
},
{
"epoch": 1.7109940750493746,
"grad_norm": 1.03125,
"learning_rate": 1.8182928498199634e-05,
"loss": 0.7739,
"step": 650
},
{
"epoch": 1.7241606319947334,
"grad_norm": 0.99609375,
"learning_rate": 1.6569300245525457e-05,
"loss": 0.7311,
"step": 655
},
{
"epoch": 1.737327188940092,
"grad_norm": 1.15625,
"learning_rate": 1.5026451505743408e-05,
"loss": 0.7321,
"step": 660
},
{
"epoch": 1.7504937458854508,
"grad_norm": 1.1171875,
"learning_rate": 1.3555200696822232e-05,
"loss": 0.7963,
"step": 665
},
{
"epoch": 1.7636603028308098,
"grad_norm": 1.1171875,
"learning_rate": 1.215632825696541e-05,
"loss": 0.7587,
"step": 670
},
{
"epoch": 1.7768268597761685,
"grad_norm": 1.171875,
"learning_rate": 1.0830576230620492e-05,
"loss": 0.7989,
"step": 675
},
{
"epoch": 1.7899934167215275,
"grad_norm": 1.078125,
"learning_rate": 9.578647874855095e-06,
"loss": 0.8169,
"step": 680
},
{
"epoch": 1.8031599736668862,
"grad_norm": 1.09375,
"learning_rate": 8.401207286307881e-06,
"loss": 0.7674,
"step": 685
},
{
"epoch": 1.816326530612245,
"grad_norm": 1.0234375,
"learning_rate": 7.2988790489124424e-06,
"loss": 0.8234,
"step": 690
},
{
"epoch": 1.8294930875576036,
"grad_norm": 1.15625,
"learning_rate": 6.272247902581201e-06,
"loss": 0.7603,
"step": 695
},
{
"epoch": 1.8426596445029624,
"grad_norm": 1.015625,
"learning_rate": 5.3218584330249e-06,
"loss": 0.795,
"step": 700
},
{
"epoch": 1.8558262014483211,
"grad_norm": 1.0546875,
"learning_rate": 4.448214782872134e-06,
"loss": 0.759,
"step": 705
},
{
"epoch": 1.86899275839368,
"grad_norm": 1.125,
"learning_rate": 3.6517803842424474e-06,
"loss": 0.7344,
"step": 710
},
{
"epoch": 1.8821593153390388,
"grad_norm": 1.0546875,
"learning_rate": 2.932977712914586e-06,
"loss": 0.7102,
"step": 715
},
{
"epoch": 1.8953258722843978,
"grad_norm": 1.046875,
"learning_rate": 2.292188064220374e-06,
"loss": 0.7783,
"step": 720
},
{
"epoch": 1.9084924292297565,
"grad_norm": 1.125,
"learning_rate": 1.7297513507832927e-06,
"loss": 0.7961,
"step": 725
},
{
"epoch": 1.9216589861751152,
"grad_norm": 44.75,
"learning_rate": 1.2459659222086304e-06,
"loss": 0.7633,
"step": 730
},
{
"epoch": 1.934825543120474,
"grad_norm": 1.0859375,
"learning_rate": 8.410884068213941e-07,
"loss": 0.7727,
"step": 735
},
{
"epoch": 1.9479921000658327,
"grad_norm": 1.09375,
"learning_rate": 5.153335755354038e-07,
"loss": 0.7779,
"step": 740
},
{
"epoch": 1.9611586570111914,
"grad_norm": 1.0234375,
"learning_rate": 2.688742279261913e-07,
"loss": 0.7058,
"step": 745
},
{
"epoch": 1.9743252139565504,
"grad_norm": 2.1875,
"learning_rate": 1.0184110056790651e-07,
"loss": 0.8194,
"step": 750
},
{
"epoch": 1.9874917709019093,
"grad_norm": 1.0546875,
"learning_rate": 1.432279768290856e-08,
"loss": 0.7634,
"step": 755
},
{
"epoch": 1.9953917050691246,
"step": 758,
"total_flos": 1.449790274661253e+17,
"train_loss": 1.2643018703032924,
"train_runtime": 2142.5178,
"train_samples_per_second": 11.339,
"train_steps_per_second": 0.354
}
],
"logging_steps": 5,
"max_steps": 758,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.449790274661253e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}