{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6157872455066774,
"eval_steps": 200,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003078936227533387,
"grad_norm": 0.567386619418904,
"learning_rate": 6.153846153846154e-07,
"loss": 1.3715,
"step": 1
},
{
"epoch": 0.0015394681137666935,
"grad_norm": 0.5288856739094128,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.3257,
"step": 5
},
{
"epoch": 0.003078936227533387,
"grad_norm": 0.514193298068005,
"learning_rate": 6.153846153846155e-06,
"loss": 1.3509,
"step": 10
},
{
"epoch": 0.004618404341300081,
"grad_norm": 0.5439189819234317,
"learning_rate": 9.230769230769232e-06,
"loss": 1.3301,
"step": 15
},
{
"epoch": 0.006157872455066774,
"grad_norm": 0.40780463269778033,
"learning_rate": 1.230769230769231e-05,
"loss": 1.3432,
"step": 20
},
{
"epoch": 0.007697340568833468,
"grad_norm": 0.21241926749939638,
"learning_rate": 1.5384615384615387e-05,
"loss": 1.2848,
"step": 25
},
{
"epoch": 0.009236808682600161,
"grad_norm": 0.22229732202041577,
"learning_rate": 1.8461538461538465e-05,
"loss": 1.2157,
"step": 30
},
{
"epoch": 0.010776276796366856,
"grad_norm": 0.22180484192931016,
"learning_rate": 2.1538461538461542e-05,
"loss": 1.2437,
"step": 35
},
{
"epoch": 0.012315744910133548,
"grad_norm": 0.26148958772269565,
"learning_rate": 2.461538461538462e-05,
"loss": 1.1774,
"step": 40
},
{
"epoch": 0.013855213023900243,
"grad_norm": 0.1381120764969843,
"learning_rate": 2.7692307692307694e-05,
"loss": 1.202,
"step": 45
},
{
"epoch": 0.015394681137666935,
"grad_norm": 0.11244566836575712,
"learning_rate": 3.0769230769230774e-05,
"loss": 1.147,
"step": 50
},
{
"epoch": 0.01693414925143363,
"grad_norm": 0.11021185664583819,
"learning_rate": 3.384615384615385e-05,
"loss": 1.1643,
"step": 55
},
{
"epoch": 0.018473617365200323,
"grad_norm": 0.09699197885691936,
"learning_rate": 3.692307692307693e-05,
"loss": 1.1375,
"step": 60
},
{
"epoch": 0.020013085478967015,
"grad_norm": 0.08432503700919332,
"learning_rate": 4e-05,
"loss": 1.157,
"step": 65
},
{
"epoch": 0.02155255359273371,
"grad_norm": 0.08841813904259435,
"learning_rate": 4.3076923076923084e-05,
"loss": 1.1193,
"step": 70
},
{
"epoch": 0.023092021706500404,
"grad_norm": 0.09260059592301743,
"learning_rate": 4.615384615384616e-05,
"loss": 1.1143,
"step": 75
},
{
"epoch": 0.024631489820267097,
"grad_norm": 0.11247544113589666,
"learning_rate": 4.923076923076924e-05,
"loss": 1.1422,
"step": 80
},
{
"epoch": 0.026170957934033793,
"grad_norm": 0.10305516893175011,
"learning_rate": 5.230769230769231e-05,
"loss": 1.1301,
"step": 85
},
{
"epoch": 0.027710426047800486,
"grad_norm": 0.10094078438604197,
"learning_rate": 5.538461538461539e-05,
"loss": 1.07,
"step": 90
},
{
"epoch": 0.02924989416156718,
"grad_norm": 0.09878535861697932,
"learning_rate": 5.846153846153847e-05,
"loss": 1.1062,
"step": 95
},
{
"epoch": 0.03078936227533387,
"grad_norm": 0.0880914200143609,
"learning_rate": 6.153846153846155e-05,
"loss": 1.0792,
"step": 100
},
{
"epoch": 0.03232883038910057,
"grad_norm": 0.09739295977826301,
"learning_rate": 6.461538461538462e-05,
"loss": 1.0886,
"step": 105
},
{
"epoch": 0.03386829850286726,
"grad_norm": 0.12352134648547326,
"learning_rate": 6.76923076923077e-05,
"loss": 1.0846,
"step": 110
},
{
"epoch": 0.03540776661663395,
"grad_norm": 0.09422623480770391,
"learning_rate": 7.076923076923078e-05,
"loss": 1.0892,
"step": 115
},
{
"epoch": 0.036947234730400645,
"grad_norm": 0.11191294618136459,
"learning_rate": 7.384615384615386e-05,
"loss": 1.0709,
"step": 120
},
{
"epoch": 0.03848670284416734,
"grad_norm": 0.10765812088608237,
"learning_rate": 7.692307692307693e-05,
"loss": 1.084,
"step": 125
},
{
"epoch": 0.04002617095793403,
"grad_norm": 0.0922143630736113,
"learning_rate": 8e-05,
"loss": 1.1205,
"step": 130
},
{
"epoch": 0.04156563907170073,
"grad_norm": 0.12839655709773654,
"learning_rate": 8.307692307692309e-05,
"loss": 1.0738,
"step": 135
},
{
"epoch": 0.04310510718546742,
"grad_norm": 0.11121867345439279,
"learning_rate": 8.615384615384617e-05,
"loss": 1.0821,
"step": 140
},
{
"epoch": 0.044644575299234115,
"grad_norm": 0.0928933843755614,
"learning_rate": 8.923076923076924e-05,
"loss": 1.0604,
"step": 145
},
{
"epoch": 0.04618404341300081,
"grad_norm": 0.09586858247024442,
"learning_rate": 9.230769230769232e-05,
"loss": 1.0644,
"step": 150
},
{
"epoch": 0.0477235115267675,
"grad_norm": 0.10507553798424621,
"learning_rate": 9.53846153846154e-05,
"loss": 1.0617,
"step": 155
},
{
"epoch": 0.049262979640534194,
"grad_norm": 0.11036562639685599,
"learning_rate": 9.846153846153848e-05,
"loss": 1.0732,
"step": 160
},
{
"epoch": 0.050802447754300886,
"grad_norm": 0.11309124416557308,
"learning_rate": 0.00010153846153846153,
"loss": 1.0557,
"step": 165
},
{
"epoch": 0.052341915868067586,
"grad_norm": 0.15677878221994676,
"learning_rate": 0.00010461538461538463,
"loss": 1.0706,
"step": 170
},
{
"epoch": 0.05388138398183428,
"grad_norm": 0.11661519431178777,
"learning_rate": 0.0001076923076923077,
"loss": 1.098,
"step": 175
},
{
"epoch": 0.05542085209560097,
"grad_norm": 0.12109397309650635,
"learning_rate": 0.00011076923076923077,
"loss": 1.0311,
"step": 180
},
{
"epoch": 0.056960320209367664,
"grad_norm": 0.11093871974267784,
"learning_rate": 0.00011384615384615384,
"loss": 1.0707,
"step": 185
},
{
"epoch": 0.05849978832313436,
"grad_norm": 0.11357908911113367,
"learning_rate": 0.00011692307692307694,
"loss": 1.0845,
"step": 190
},
{
"epoch": 0.06003925643690105,
"grad_norm": 0.12394460027791813,
"learning_rate": 0.00012,
"loss": 1.0495,
"step": 195
},
{
"epoch": 0.06157872455066774,
"grad_norm": 0.11194042323004598,
"learning_rate": 0.0001230769230769231,
"loss": 1.0614,
"step": 200
},
{
"epoch": 0.06157872455066774,
"eval_loss": 1.0632128715515137,
"eval_runtime": 3818.978,
"eval_samples_per_second": 6.051,
"eval_steps_per_second": 0.378,
"step": 200
},
{
"epoch": 0.06311819266443444,
"grad_norm": 0.1033181877601895,
"learning_rate": 0.00012615384615384615,
"loss": 1.0433,
"step": 205
},
{
"epoch": 0.06465766077820113,
"grad_norm": 0.12063646484450094,
"learning_rate": 0.00012923076923076923,
"loss": 1.0714,
"step": 210
},
{
"epoch": 0.06619712889196783,
"grad_norm": 0.11588361936604014,
"learning_rate": 0.0001323076923076923,
"loss": 1.057,
"step": 215
},
{
"epoch": 0.06773659700573452,
"grad_norm": 0.11616868785038624,
"learning_rate": 0.0001353846153846154,
"loss": 1.0553,
"step": 220
},
{
"epoch": 0.06927606511950121,
"grad_norm": 0.09562740234655313,
"learning_rate": 0.00013846153846153847,
"loss": 1.0257,
"step": 225
},
{
"epoch": 0.0708155332332679,
"grad_norm": 0.09393900612118493,
"learning_rate": 0.00014153846153846156,
"loss": 1.0445,
"step": 230
},
{
"epoch": 0.0723550013470346,
"grad_norm": 0.10841095433316549,
"learning_rate": 0.0001446153846153846,
"loss": 1.0103,
"step": 235
},
{
"epoch": 0.07389446946080129,
"grad_norm": 0.10223643461806235,
"learning_rate": 0.00014769230769230772,
"loss": 1.0717,
"step": 240
},
{
"epoch": 0.07543393757456798,
"grad_norm": 0.11204179743813408,
"learning_rate": 0.00015076923076923077,
"loss": 1.0724,
"step": 245
},
{
"epoch": 0.07697340568833468,
"grad_norm": 0.09200458344532117,
"learning_rate": 0.00015384615384615385,
"loss": 1.0636,
"step": 250
},
{
"epoch": 0.07851287380210137,
"grad_norm": 0.11400449644867688,
"learning_rate": 0.00015692307692307693,
"loss": 0.9971,
"step": 255
},
{
"epoch": 0.08005234191586806,
"grad_norm": 0.11612263416102313,
"learning_rate": 0.00016,
"loss": 1.0694,
"step": 260
},
{
"epoch": 0.08159181002963477,
"grad_norm": 0.09082765987050115,
"learning_rate": 0.0001630769230769231,
"loss": 1.0467,
"step": 265
},
{
"epoch": 0.08313127814340146,
"grad_norm": 0.091051842252486,
"learning_rate": 0.00016615384615384617,
"loss": 1.0909,
"step": 270
},
{
"epoch": 0.08467074625716815,
"grad_norm": 0.09996524510558816,
"learning_rate": 0.00016923076923076923,
"loss": 1.038,
"step": 275
},
{
"epoch": 0.08621021437093485,
"grad_norm": 0.08559129966794987,
"learning_rate": 0.00017230769230769234,
"loss": 1.055,
"step": 280
},
{
"epoch": 0.08774968248470154,
"grad_norm": 0.09417658234399054,
"learning_rate": 0.0001753846153846154,
"loss": 1.0628,
"step": 285
},
{
"epoch": 0.08928915059846823,
"grad_norm": 0.09367818742833312,
"learning_rate": 0.00017846153846153847,
"loss": 1.0698,
"step": 290
},
{
"epoch": 0.09082861871223492,
"grad_norm": 0.091480252033389,
"learning_rate": 0.00018153846153846155,
"loss": 1.065,
"step": 295
},
{
"epoch": 0.09236808682600162,
"grad_norm": 0.0938796736597296,
"learning_rate": 0.00018461538461538463,
"loss": 1.0532,
"step": 300
},
{
"epoch": 0.09390755493976831,
"grad_norm": 0.09576286670666531,
"learning_rate": 0.0001876923076923077,
"loss": 1.0496,
"step": 305
},
{
"epoch": 0.095447023053535,
"grad_norm": 0.09248254454681183,
"learning_rate": 0.0001907692307692308,
"loss": 1.0636,
"step": 310
},
{
"epoch": 0.0969864911673017,
"grad_norm": 0.08693274686688061,
"learning_rate": 0.00019384615384615385,
"loss": 1.0838,
"step": 315
},
{
"epoch": 0.09852595928106839,
"grad_norm": 0.07975001335214404,
"learning_rate": 0.00019692307692307696,
"loss": 1.037,
"step": 320
},
{
"epoch": 0.10006542739483508,
"grad_norm": 0.09900399369206245,
"learning_rate": 0.0002,
"loss": 1.0476,
"step": 325
},
{
"epoch": 0.10160489550860177,
"grad_norm": 0.0883635278339313,
"learning_rate": 0.00019999855506507185,
"loss": 1.0765,
"step": 330
},
{
"epoch": 0.10314436362236847,
"grad_norm": 0.08877039836306448,
"learning_rate": 0.00019999422030204418,
"loss": 1.059,
"step": 335
},
{
"epoch": 0.10468383173613517,
"grad_norm": 0.07996176059015918,
"learning_rate": 0.00019998699583618593,
"loss": 1.0418,
"step": 340
},
{
"epoch": 0.10622329984990186,
"grad_norm": 0.0870003860102271,
"learning_rate": 0.00019997688187627482,
"loss": 1.0557,
"step": 345
},
{
"epoch": 0.10776276796366856,
"grad_norm": 0.09383600372135154,
"learning_rate": 0.0001999638787145911,
"loss": 1.0735,
"step": 350
},
{
"epoch": 0.10930223607743525,
"grad_norm": 0.09891188283344632,
"learning_rate": 0.0001999479867269092,
"loss": 1.0584,
"step": 355
},
{
"epoch": 0.11084170419120194,
"grad_norm": 0.12102740620306172,
"learning_rate": 0.00019992920637248697,
"loss": 1.0489,
"step": 360
},
{
"epoch": 0.11238117230496864,
"grad_norm": 0.08137547822372986,
"learning_rate": 0.00019990753819405213,
"loss": 1.0277,
"step": 365
},
{
"epoch": 0.11392064041873533,
"grad_norm": 0.08421354630258387,
"learning_rate": 0.00019988298281778684,
"loss": 1.062,
"step": 370
},
{
"epoch": 0.11546010853250202,
"grad_norm": 0.08071712029036497,
"learning_rate": 0.00019985554095330955,
"loss": 1.0215,
"step": 375
},
{
"epoch": 0.11699957664626871,
"grad_norm": 0.0928078752872806,
"learning_rate": 0.0001998252133936544,
"loss": 1.053,
"step": 380
},
{
"epoch": 0.1185390447600354,
"grad_norm": 0.08562676553948985,
"learning_rate": 0.00019979200101524845,
"loss": 1.0701,
"step": 385
},
{
"epoch": 0.1200785128738021,
"grad_norm": 0.08403649770517699,
"learning_rate": 0.00019975590477788613,
"loss": 1.0514,
"step": 390
},
{
"epoch": 0.12161798098756879,
"grad_norm": 0.08183724425629081,
"learning_rate": 0.0001997169257247018,
"loss": 1.0095,
"step": 395
},
{
"epoch": 0.12315744910133548,
"grad_norm": 0.08039658119705519,
"learning_rate": 0.00019967506498213931,
"loss": 1.0689,
"step": 400
},
{
"epoch": 0.12315744910133548,
"eval_loss": 1.0475565195083618,
"eval_runtime": 3802.3739,
"eval_samples_per_second": 6.078,
"eval_steps_per_second": 0.38,
"step": 400
},
{
"epoch": 0.12469691721510218,
"grad_norm": 0.08379276105126061,
"learning_rate": 0.00019963032375991966,
"loss": 1.0782,
"step": 405
},
{
"epoch": 0.12623638532886888,
"grad_norm": 0.07908083743105887,
"learning_rate": 0.00019958270335100595,
"loss": 1.0235,
"step": 410
},
{
"epoch": 0.12777585344263556,
"grad_norm": 0.08493671539476158,
"learning_rate": 0.00019953220513156602,
"loss": 1.0907,
"step": 415
},
{
"epoch": 0.12931532155640227,
"grad_norm": 0.08337456151040325,
"learning_rate": 0.0001994788305609327,
"loss": 1.0326,
"step": 420
},
{
"epoch": 0.13085478967016895,
"grad_norm": 0.08547228164895211,
"learning_rate": 0.00019942258118156163,
"loss": 1.0442,
"step": 425
},
{
"epoch": 0.13239425778393565,
"grad_norm": 0.0750158452168834,
"learning_rate": 0.00019936345861898663,
"loss": 1.0684,
"step": 430
},
{
"epoch": 0.13393372589770233,
"grad_norm": 0.07579154319260911,
"learning_rate": 0.0001993014645817728,
"loss": 1.0547,
"step": 435
},
{
"epoch": 0.13547319401146904,
"grad_norm": 0.07323307416728322,
"learning_rate": 0.00019923660086146723,
"loss": 1.0491,
"step": 440
},
{
"epoch": 0.13701266212523572,
"grad_norm": 0.08937561723880813,
"learning_rate": 0.0001991688693325469,
"loss": 1.0484,
"step": 445
},
{
"epoch": 0.13855213023900242,
"grad_norm": 0.08089791643667314,
"learning_rate": 0.00019909827195236493,
"loss": 1.0422,
"step": 450
},
{
"epoch": 0.14009159835276913,
"grad_norm": 0.07004371269562998,
"learning_rate": 0.00019902481076109372,
"loss": 1.0779,
"step": 455
},
{
"epoch": 0.1416310664665358,
"grad_norm": 0.07219709532645062,
"learning_rate": 0.00019894848788166604,
"loss": 1.0578,
"step": 460
},
{
"epoch": 0.14317053458030252,
"grad_norm": 0.08319824295039546,
"learning_rate": 0.00019886930551971387,
"loss": 1.0776,
"step": 465
},
{
"epoch": 0.1447100026940692,
"grad_norm": 0.08461977605731112,
"learning_rate": 0.0001987872659635043,
"loss": 1.0398,
"step": 470
},
{
"epoch": 0.1462494708078359,
"grad_norm": 0.07913296391073374,
"learning_rate": 0.00019870237158387384,
"loss": 1.0496,
"step": 475
},
{
"epoch": 0.14778893892160258,
"grad_norm": 0.07884194841699313,
"learning_rate": 0.00019861462483415952,
"loss": 1.0401,
"step": 480
},
{
"epoch": 0.1493284070353693,
"grad_norm": 0.08715608209779739,
"learning_rate": 0.0001985240282501282,
"loss": 1.0458,
"step": 485
},
{
"epoch": 0.15086787514913597,
"grad_norm": 0.07255994722623947,
"learning_rate": 0.0001984305844499033,
"loss": 1.023,
"step": 490
},
{
"epoch": 0.15240734326290267,
"grad_norm": 0.0769318406138446,
"learning_rate": 0.00019833429613388902,
"loss": 1.0476,
"step": 495
},
{
"epoch": 0.15394681137666935,
"grad_norm": 0.07401383185768683,
"learning_rate": 0.0001982351660846924,
"loss": 1.0415,
"step": 500
},
{
"epoch": 0.15548627949043606,
"grad_norm": 0.07649017525208568,
"learning_rate": 0.00019813319716704278,
"loss": 1.0427,
"step": 505
},
{
"epoch": 0.15702574760420274,
"grad_norm": 0.07708900519770054,
"learning_rate": 0.00019802839232770921,
"loss": 1.0855,
"step": 510
},
{
"epoch": 0.15856521571796944,
"grad_norm": 0.07439539455159229,
"learning_rate": 0.00019792075459541518,
"loss": 1.0224,
"step": 515
},
{
"epoch": 0.16010468383173612,
"grad_norm": 0.08182572188820655,
"learning_rate": 0.00019781028708075102,
"loss": 1.0302,
"step": 520
},
{
"epoch": 0.16164415194550283,
"grad_norm": 0.08161041915734439,
"learning_rate": 0.00019769699297608417,
"loss": 1.0245,
"step": 525
},
{
"epoch": 0.16318362005926954,
"grad_norm": 0.08888339173595923,
"learning_rate": 0.00019758087555546682,
"loss": 1.0747,
"step": 530
},
{
"epoch": 0.16472308817303621,
"grad_norm": 0.07764918511303621,
"learning_rate": 0.0001974619381745413,
"loss": 1.0696,
"step": 535
},
{
"epoch": 0.16626255628680292,
"grad_norm": 0.07999928337629646,
"learning_rate": 0.00019734018427044307,
"loss": 1.0351,
"step": 540
},
{
"epoch": 0.1678020244005696,
"grad_norm": 0.07656288262788609,
"learning_rate": 0.0001972156173617016,
"loss": 1.0399,
"step": 545
},
{
"epoch": 0.1693414925143363,
"grad_norm": 0.07902096762825829,
"learning_rate": 0.00019708824104813837,
"loss": 1.0397,
"step": 550
},
{
"epoch": 0.17088096062810298,
"grad_norm": 0.08100109284928467,
"learning_rate": 0.00019695805901076308,
"loss": 1.0661,
"step": 555
},
{
"epoch": 0.1724204287418697,
"grad_norm": 0.07617725821540045,
"learning_rate": 0.00019682507501166718,
"loss": 1.088,
"step": 560
},
{
"epoch": 0.17395989685563637,
"grad_norm": 0.0781772918354948,
"learning_rate": 0.00019668929289391523,
"loss": 1.0325,
"step": 565
},
{
"epoch": 0.17549936496940308,
"grad_norm": 0.07602400296386462,
"learning_rate": 0.00019655071658143366,
"loss": 1.0493,
"step": 570
},
{
"epoch": 0.17703883308316976,
"grad_norm": 0.07965301834511823,
"learning_rate": 0.00019640935007889755,
"loss": 1.0759,
"step": 575
},
{
"epoch": 0.17857830119693646,
"grad_norm": 0.07753220082647658,
"learning_rate": 0.0001962651974716149,
"loss": 1.0601,
"step": 580
},
{
"epoch": 0.18011776931070314,
"grad_norm": 0.07648415793765183,
"learning_rate": 0.0001961182629254084,
"loss": 1.0151,
"step": 585
},
{
"epoch": 0.18165723742446985,
"grad_norm": 0.08029992977054808,
"learning_rate": 0.00019596855068649522,
"loss": 1.0499,
"step": 590
},
{
"epoch": 0.18319670553823653,
"grad_norm": 0.08092320232434004,
"learning_rate": 0.00019581606508136426,
"loss": 1.0631,
"step": 595
},
{
"epoch": 0.18473617365200323,
"grad_norm": 0.0748434186581261,
"learning_rate": 0.00019566081051665098,
"loss": 1.0053,
"step": 600
},
{
"epoch": 0.18473617365200323,
"eval_loss": 1.0412589311599731,
"eval_runtime": 3798.478,
"eval_samples_per_second": 6.084,
"eval_steps_per_second": 0.38,
"step": 600
},
{
"epoch": 0.18627564176576994,
"grad_norm": 0.07474772922675897,
"learning_rate": 0.00019550279147901036,
"loss": 1.0811,
"step": 605
},
{
"epoch": 0.18781510987953662,
"grad_norm": 0.0773035541962382,
"learning_rate": 0.00019534201253498682,
"loss": 1.0359,
"step": 610
},
{
"epoch": 0.18935457799330332,
"grad_norm": 0.08385957791440671,
"learning_rate": 0.0001951784783308827,
"loss": 0.9995,
"step": 615
},
{
"epoch": 0.19089404610707,
"grad_norm": 0.07841850327107855,
"learning_rate": 0.0001950121935926236,
"loss": 1.0417,
"step": 620
},
{
"epoch": 0.1924335142208367,
"grad_norm": 0.11766060709484247,
"learning_rate": 0.00019484316312562205,
"loss": 1.0227,
"step": 625
},
{
"epoch": 0.1939729823346034,
"grad_norm": 0.07110694761741441,
"learning_rate": 0.00019467139181463862,
"loss": 1.0652,
"step": 630
},
{
"epoch": 0.1955124504483701,
"grad_norm": 0.07470835721419704,
"learning_rate": 0.00019449688462364056,
"loss": 1.0299,
"step": 635
},
{
"epoch": 0.19705191856213677,
"grad_norm": 0.08102828325973369,
"learning_rate": 0.00019431964659565867,
"loss": 1.0488,
"step": 640
},
{
"epoch": 0.19859138667590348,
"grad_norm": 0.07603967351010721,
"learning_rate": 0.0001941396828526412,
"loss": 1.0459,
"step": 645
},
{
"epoch": 0.20013085478967016,
"grad_norm": 0.08273841396400562,
"learning_rate": 0.00019395699859530623,
"loss": 1.0194,
"step": 650
},
{
"epoch": 0.20167032290343687,
"grad_norm": 0.08236332870987446,
"learning_rate": 0.00019377159910299093,
"loss": 1.0307,
"step": 655
},
{
"epoch": 0.20320979101720354,
"grad_norm": 0.06867994321607887,
"learning_rate": 0.00019358348973349943,
"loss": 1.0098,
"step": 660
},
{
"epoch": 0.20474925913097025,
"grad_norm": 0.08144032256455716,
"learning_rate": 0.00019339267592294763,
"loss": 1.048,
"step": 665
},
{
"epoch": 0.20628872724473693,
"grad_norm": 0.07918355281230142,
"learning_rate": 0.00019319916318560635,
"loss": 1.0227,
"step": 670
},
{
"epoch": 0.20782819535850364,
"grad_norm": 0.07792694280227995,
"learning_rate": 0.00019300295711374187,
"loss": 1.039,
"step": 675
},
{
"epoch": 0.20936766347227034,
"grad_norm": 0.07841373321559497,
"learning_rate": 0.00019280406337745428,
"loss": 1.0185,
"step": 680
},
{
"epoch": 0.21090713158603702,
"grad_norm": 0.12957233088012476,
"learning_rate": 0.00019260248772451377,
"loss": 1.0496,
"step": 685
},
{
"epoch": 0.21244659969980373,
"grad_norm": 0.07758713390528212,
"learning_rate": 0.0001923982359801943,
"loss": 1.0425,
"step": 690
},
{
"epoch": 0.2139860678135704,
"grad_norm": 0.08451147321948667,
"learning_rate": 0.00019219131404710552,
"loss": 1.0749,
"step": 695
},
{
"epoch": 0.21552553592733711,
"grad_norm": 0.08218923027527074,
"learning_rate": 0.00019198172790502196,
"loss": 1.0244,
"step": 700
},
{
"epoch": 0.2170650040411038,
"grad_norm": 0.07475377041516394,
"learning_rate": 0.0001917694836107104,
"loss": 1.0367,
"step": 705
},
{
"epoch": 0.2186044721548705,
"grad_norm": 0.06989397102142611,
"learning_rate": 0.00019155458729775467,
"loss": 1.049,
"step": 710
},
{
"epoch": 0.22014394026863718,
"grad_norm": 0.07403450910939992,
"learning_rate": 0.0001913370451763786,
"loss": 1.0135,
"step": 715
},
{
"epoch": 0.22168340838240388,
"grad_norm": 0.06736867483748331,
"learning_rate": 0.00019111686353326631,
"loss": 1.0213,
"step": 720
},
{
"epoch": 0.22322287649617056,
"grad_norm": 0.07406189263799307,
"learning_rate": 0.00019089404873138082,
"loss": 1.0521,
"step": 725
},
{
"epoch": 0.22476234460993727,
"grad_norm": 0.07355438357203191,
"learning_rate": 0.00019066860720977986,
"loss": 1.0483,
"step": 730
},
{
"epoch": 0.22630181272370395,
"grad_norm": 0.07568463711454308,
"learning_rate": 0.00019044054548343002,
"loss": 1.0289,
"step": 735
},
{
"epoch": 0.22784128083747066,
"grad_norm": 0.07229067305689793,
"learning_rate": 0.0001902098701430184,
"loss": 1.0694,
"step": 740
},
{
"epoch": 0.22938074895123733,
"grad_norm": 0.07530804590739208,
"learning_rate": 0.00018997658785476214,
"loss": 1.0651,
"step": 745
},
{
"epoch": 0.23092021706500404,
"grad_norm": 0.07259570093477205,
"learning_rate": 0.00018974070536021572,
"loss": 1.0685,
"step": 750
},
{
"epoch": 0.23245968517877075,
"grad_norm": 0.06991198063848746,
"learning_rate": 0.00018950222947607625,
"loss": 1.0524,
"step": 755
},
{
"epoch": 0.23399915329253743,
"grad_norm": 0.07071964916232602,
"learning_rate": 0.0001892611670939865,
"loss": 0.9967,
"step": 760
},
{
"epoch": 0.23553862140630413,
"grad_norm": 0.08069984398117862,
"learning_rate": 0.00018901752518033548,
"loss": 1.0503,
"step": 765
},
{
"epoch": 0.2370780895200708,
"grad_norm": 0.0719126875966159,
"learning_rate": 0.0001887713107760575,
"loss": 1.0497,
"step": 770
},
{
"epoch": 0.23861755763383752,
"grad_norm": 0.07933165083127114,
"learning_rate": 0.00018852253099642833,
"loss": 1.0163,
"step": 775
},
{
"epoch": 0.2401570257476042,
"grad_norm": 0.07529876789807866,
"learning_rate": 0.0001882711930308599,
"loss": 1.0503,
"step": 780
},
{
"epoch": 0.2416964938613709,
"grad_norm": 0.074705285570636,
"learning_rate": 0.00018801730414269225,
"loss": 1.0424,
"step": 785
},
{
"epoch": 0.24323596197513758,
"grad_norm": 0.07414239254048278,
"learning_rate": 0.0001877608716689839,
"loss": 1.0655,
"step": 790
},
{
"epoch": 0.2447754300889043,
"grad_norm": 0.07941506265986978,
"learning_rate": 0.00018750190302029956,
"loss": 1.0193,
"step": 795
},
{
"epoch": 0.24631489820267097,
"grad_norm": 0.08230667165269098,
"learning_rate": 0.00018724040568049612,
"loss": 1.0446,
"step": 800
},
{
"epoch": 0.24631489820267097,
"eval_loss": 1.0366028547286987,
"eval_runtime": 3798.2715,
"eval_samples_per_second": 6.084,
"eval_steps_per_second": 0.38,
"step": 800
},
{
"epoch": 0.24785436631643767,
"grad_norm": 0.08052061406166201,
"learning_rate": 0.00018697638720650646,
"loss": 1.0329,
"step": 805
},
{
"epoch": 0.24939383443020435,
"grad_norm": 0.07060612206330524,
"learning_rate": 0.00018670985522812084,
"loss": 1.0123,
"step": 810
},
{
"epoch": 0.25093330254397106,
"grad_norm": 0.07261155032686553,
"learning_rate": 0.0001864408174477665,
"loss": 1.0394,
"step": 815
},
{
"epoch": 0.25247277065773777,
"grad_norm": 0.07296759582556935,
"learning_rate": 0.00018616928164028523,
"loss": 1.0021,
"step": 820
},
{
"epoch": 0.2540122387715045,
"grad_norm": 0.06646733390910516,
"learning_rate": 0.00018589525565270844,
"loss": 1.0286,
"step": 825
},
{
"epoch": 0.2555517068852711,
"grad_norm": 0.07496596424661404,
"learning_rate": 0.0001856187474040306,
"loss": 1.0502,
"step": 830
},
{
"epoch": 0.25709117499903783,
"grad_norm": 0.08500360217319118,
"learning_rate": 0.00018533976488498016,
"loss": 1.0256,
"step": 835
},
{
"epoch": 0.25863064311280454,
"grad_norm": 0.07817756873072405,
"learning_rate": 0.0001850583161577889,
"loss": 1.0609,
"step": 840
},
{
"epoch": 0.26017011122657124,
"grad_norm": 0.07136612848707545,
"learning_rate": 0.00018477440935595873,
"loss": 1.0775,
"step": 845
},
{
"epoch": 0.2617095793403379,
"grad_norm": 0.07292608365481835,
"learning_rate": 0.00018448805268402672,
"loss": 1.058,
"step": 850
},
{
"epoch": 0.2632490474541046,
"grad_norm": 0.07716711803643432,
"learning_rate": 0.00018419925441732804,
"loss": 1.0294,
"step": 855
},
{
"epoch": 0.2647885155678713,
"grad_norm": 0.07526261921660161,
"learning_rate": 0.00018390802290175673,
"loss": 1.0467,
"step": 860
},
{
"epoch": 0.266327983681638,
"grad_norm": 0.0735157839737638,
"learning_rate": 0.00018361436655352456,
"loss": 1.0278,
"step": 865
},
{
"epoch": 0.26786745179540467,
"grad_norm": 0.07101822956411033,
"learning_rate": 0.00018331829385891783,
"loss": 1.0188,
"step": 870
},
{
"epoch": 0.26940691990917137,
"grad_norm": 0.07406443039738211,
"learning_rate": 0.00018301981337405212,
"loss": 1.0476,
"step": 875
},
{
"epoch": 0.2709463880229381,
"grad_norm": 0.07470379094242477,
"learning_rate": 0.00018271893372462497,
"loss": 1.0468,
"step": 880
},
{
"epoch": 0.2724858561367048,
"grad_norm": 0.07458412123750419,
"learning_rate": 0.00018241566360566665,
"loss": 1.0279,
"step": 885
},
{
"epoch": 0.27402532425047144,
"grad_norm": 0.08164107594170099,
"learning_rate": 0.00018211001178128892,
"loss": 1.0472,
"step": 890
},
{
"epoch": 0.27556479236423814,
"grad_norm": 0.07748097167228449,
"learning_rate": 0.00018180198708443173,
"loss": 1.0534,
"step": 895
},
{
"epoch": 0.27710426047800485,
"grad_norm": 0.07485972229218758,
"learning_rate": 0.00018149159841660795,
"loss": 1.0419,
"step": 900
},
{
"epoch": 0.27864372859177156,
"grad_norm": 0.07553124022662376,
"learning_rate": 0.00018117885474764613,
"loss": 1.0836,
"step": 905
},
{
"epoch": 0.28018319670553826,
"grad_norm": 0.07966215645919128,
"learning_rate": 0.00018086376511543126,
"loss": 1.0642,
"step": 910
},
{
"epoch": 0.2817226648193049,
"grad_norm": 0.08376456009997757,
"learning_rate": 0.00018054633862564368,
"loss": 1.0398,
"step": 915
},
{
"epoch": 0.2832621329330716,
"grad_norm": 0.075508959246266,
"learning_rate": 0.0001802265844514958,
"loss": 0.9996,
"step": 920
},
{
"epoch": 0.2848016010468383,
"grad_norm": 0.07358158800850821,
"learning_rate": 0.0001799045118334671,
"loss": 1.0542,
"step": 925
},
{
"epoch": 0.28634106916060503,
"grad_norm": 0.08094264187967125,
"learning_rate": 0.00017958013007903713,
"loss": 1.0563,
"step": 930
},
{
"epoch": 0.2878805372743717,
"grad_norm": 0.07424176124118159,
"learning_rate": 0.0001792534485624164,
"loss": 1.0405,
"step": 935
},
{
"epoch": 0.2894200053881384,
"grad_norm": 0.07418414794842867,
"learning_rate": 0.00017892447672427563,
"loss": 1.0391,
"step": 940
},
{
"epoch": 0.2909594735019051,
"grad_norm": 0.0704593007549167,
"learning_rate": 0.00017859322407147272,
"loss": 1.0543,
"step": 945
},
{
"epoch": 0.2924989416156718,
"grad_norm": 0.07058098730245323,
"learning_rate": 0.00017825970017677832,
"loss": 1.0693,
"step": 950
},
{
"epoch": 0.29403840972943845,
"grad_norm": 0.07100077379863531,
"learning_rate": 0.00017792391467859886,
"loss": 1.0157,
"step": 955
},
{
"epoch": 0.29557787784320516,
"grad_norm": 0.07264965656385536,
"learning_rate": 0.0001775858772806983,
"loss": 1.0669,
"step": 960
},
{
"epoch": 0.29711734595697187,
"grad_norm": 0.06945646756969821,
"learning_rate": 0.00017724559775191744,
"loss": 1.0282,
"step": 965
},
{
"epoch": 0.2986568140707386,
"grad_norm": 0.0800750187488917,
"learning_rate": 0.00017690308592589182,
"loss": 1.0424,
"step": 970
},
{
"epoch": 0.3001962821845053,
"grad_norm": 0.07826578070698212,
"learning_rate": 0.0001765583517007675,
"loss": 0.994,
"step": 975
},
{
"epoch": 0.30173575029827193,
"grad_norm": 0.07185624380063993,
"learning_rate": 0.00017621140503891488,
"loss": 1.0117,
"step": 980
},
{
"epoch": 0.30327521841203864,
"grad_norm": 0.07770724836361542,
"learning_rate": 0.00017586225596664102,
"loss": 1.0282,
"step": 985
},
{
"epoch": 0.30481468652580535,
"grad_norm": 0.07425549788358596,
"learning_rate": 0.00017551091457389966,
"loss": 1.0332,
"step": 990
},
{
"epoch": 0.30635415463957205,
"grad_norm": 0.07157671192234144,
"learning_rate": 0.00017515739101399983,
"loss": 1.0202,
"step": 995
},
{
"epoch": 0.3078936227533387,
"grad_norm": 0.07195148099166214,
"learning_rate": 0.00017480169550331231,
"loss": 1.0091,
"step": 1000
},
{
"epoch": 0.3078936227533387,
"eval_loss": 1.033624291419983,
"eval_runtime": 3799.3073,
"eval_samples_per_second": 6.082,
"eval_steps_per_second": 0.38,
"step": 1000
},
{
"epoch": 0.3094330908671054,
"grad_norm": 0.0709967222808181,
"learning_rate": 0.00017444383832097442,
"loss": 1.0306,
"step": 1005
},
{
"epoch": 0.3109725589808721,
"grad_norm": 0.08017250953363526,
"learning_rate": 0.00017408382980859305,
"loss": 1.0335,
"step": 1010
},
{
"epoch": 0.3125120270946388,
"grad_norm": 0.0763005407159528,
"learning_rate": 0.00017372168036994566,
"loss": 1.0155,
"step": 1015
},
{
"epoch": 0.3140514952084055,
"grad_norm": 0.068090767981409,
"learning_rate": 0.00017335740047067972,
"loss": 1.0226,
"step": 1020
},
{
"epoch": 0.3155909633221722,
"grad_norm": 0.07053765308848822,
"learning_rate": 0.0001729910006380102,
"loss": 1.0455,
"step": 1025
},
{
"epoch": 0.3171304314359389,
"grad_norm": 0.07639366775520491,
"learning_rate": 0.00017262249146041546,
"loss": 1.0737,
"step": 1030
},
{
"epoch": 0.3186698995497056,
"grad_norm": 0.07414091472835294,
"learning_rate": 0.00017225188358733107,
"loss": 1.0159,
"step": 1035
},
{
"epoch": 0.32020936766347224,
"grad_norm": 0.07840200264036183,
"learning_rate": 0.00017187918772884232,
"loss": 1.0605,
"step": 1040
},
{
"epoch": 0.32174883577723895,
"grad_norm": 0.06946548404139283,
"learning_rate": 0.00017150441465537447,
"loss": 1.0549,
"step": 1045
},
{
"epoch": 0.32328830389100566,
"grad_norm": 0.0726329779508538,
"learning_rate": 0.00017112757519738154,
"loss": 1.0294,
"step": 1050
},
{
"epoch": 0.32482777200477236,
"grad_norm": 0.07366641465053547,
"learning_rate": 0.0001707486802450335,
"loss": 1.0439,
"step": 1055
},
{
"epoch": 0.32636724011853907,
"grad_norm": 0.07461023494546891,
"learning_rate": 0.00017036774074790132,
"loss": 1.0036,
"step": 1060
},
{
"epoch": 0.3279067082323057,
"grad_norm": 0.07745841056330656,
"learning_rate": 0.00016998476771464072,
"loss": 1.039,
"step": 1065
},
{
"epoch": 0.32944617634607243,
"grad_norm": 0.07562279638819498,
"learning_rate": 0.00016959977221267392,
"loss": 1.0136,
"step": 1070
},
{
"epoch": 0.33098564445983913,
"grad_norm": 0.07269409212200949,
"learning_rate": 0.0001692127653678699,
"loss": 1.0447,
"step": 1075
},
{
"epoch": 0.33252511257360584,
"grad_norm": 0.07863977410900856,
"learning_rate": 0.00016882375836422284,
"loss": 1.032,
"step": 1080
},
{
"epoch": 0.3340645806873725,
"grad_norm": 0.08154682576838618,
"learning_rate": 0.00016843276244352885,
"loss": 1.0576,
"step": 1085
},
{
"epoch": 0.3356040488011392,
"grad_norm": 0.07324914224304953,
"learning_rate": 0.00016803978890506113,
"loss": 1.0677,
"step": 1090
},
{
"epoch": 0.3371435169149059,
"grad_norm": 0.08330706239189462,
"learning_rate": 0.00016764484910524358,
"loss": 1.0244,
"step": 1095
},
{
"epoch": 0.3386829850286726,
"grad_norm": 0.07527643648007623,
"learning_rate": 0.00016724795445732243,
"loss": 0.9977,
"step": 1100
},
{
"epoch": 0.34022245314243926,
"grad_norm": 0.07895912028160554,
"learning_rate": 0.00016684911643103642,
"loss": 1.0575,
"step": 1105
},
{
"epoch": 0.34176192125620597,
"grad_norm": 0.073939133015858,
"learning_rate": 0.0001664483465522855,
"loss": 1.0337,
"step": 1110
},
{
"epoch": 0.3433013893699727,
"grad_norm": 0.07648599682491888,
"learning_rate": 0.00016604565640279754,
"loss": 1.0462,
"step": 1115
},
{
"epoch": 0.3448408574837394,
"grad_norm": 0.07375239907970622,
"learning_rate": 0.0001656410576197938,
"loss": 1.0537,
"step": 1120
},
{
"epoch": 0.3463803255975061,
"grad_norm": 0.07218952828382294,
"learning_rate": 0.0001652345618956526,
"loss": 1.0702,
"step": 1125
},
{
"epoch": 0.34791979371127274,
"grad_norm": 0.07501734343767677,
"learning_rate": 0.00016482618097757122,
"loss": 1.045,
"step": 1130
},
{
"epoch": 0.34945926182503945,
"grad_norm": 0.07478505250167114,
"learning_rate": 0.00016441592666722684,
"loss": 1.0356,
"step": 1135
},
{
"epoch": 0.35099872993880615,
"grad_norm": 0.07035501241737965,
"learning_rate": 0.00016400381082043507,
"loss": 1.0819,
"step": 1140
},
{
"epoch": 0.35253819805257286,
"grad_norm": 0.07713003380587562,
"learning_rate": 0.00016358984534680748,
"loss": 1.0494,
"step": 1145
},
{
"epoch": 0.3540776661663395,
"grad_norm": 0.07012091124270008,
"learning_rate": 0.00016317404220940758,
"loss": 1.022,
"step": 1150
},
{
"epoch": 0.3556171342801062,
"grad_norm": 0.06697708347109951,
"learning_rate": 0.00016275641342440483,
"loss": 1.0589,
"step": 1155
},
{
"epoch": 0.3571566023938729,
"grad_norm": 0.07573896521834783,
"learning_rate": 0.0001623369710607277,
"loss": 1.0044,
"step": 1160
},
{
"epoch": 0.35869607050763963,
"grad_norm": 0.06946529088193742,
"learning_rate": 0.00016191572723971455,
"loss": 1.0652,
"step": 1165
},
{
"epoch": 0.3602355386214063,
"grad_norm": 0.0727340465476027,
"learning_rate": 0.00016149269413476353,
"loss": 1.0057,
"step": 1170
},
{
"epoch": 0.361775006735173,
"grad_norm": 0.08017843239666048,
"learning_rate": 0.00016106788397098095,
"loss": 0.9942,
"step": 1175
},
{
"epoch": 0.3633144748489397,
"grad_norm": 0.06899110462149576,
"learning_rate": 0.0001606413090248276,
"loss": 0.9958,
"step": 1180
},
{
"epoch": 0.3648539429627064,
"grad_norm": 0.07737025459856088,
"learning_rate": 0.00016021298162376428,
"loss": 1.0211,
"step": 1185
},
{
"epoch": 0.36639341107647305,
"grad_norm": 0.07807131065906221,
"learning_rate": 0.00015978291414589542,
"loss": 1.039,
"step": 1190
},
{
"epoch": 0.36793287919023976,
"grad_norm": 0.07059155235596021,
"learning_rate": 0.0001593511190196115,
"loss": 1.0513,
"step": 1195
},
{
"epoch": 0.36947234730400647,
"grad_norm": 0.07422722970665956,
"learning_rate": 0.00015891760872322963,
"loss": 1.0093,
"step": 1200
},
{
"epoch": 0.36947234730400647,
"eval_loss": 1.0309594869613647,
"eval_runtime": 3796.5579,
"eval_samples_per_second": 6.087,
"eval_steps_per_second": 0.381,
"step": 1200
},
{
"epoch": 0.37101181541777317,
"grad_norm": 0.06806084338199529,
"learning_rate": 0.00015848239578463325,
"loss": 1.0504,
"step": 1205
},
{
"epoch": 0.3725512835315399,
"grad_norm": 0.07638211255486586,
"learning_rate": 0.00015804549278090982,
"loss": 1.0145,
"step": 1210
},
{
"epoch": 0.37409075164530653,
"grad_norm": 0.07232165039483601,
"learning_rate": 0.00015760691233798757,
"loss": 1.011,
"step": 1215
},
{
"epoch": 0.37563021975907324,
"grad_norm": 0.0725477342684882,
"learning_rate": 0.00015716666713027055,
"loss": 1.0338,
"step": 1220
},
{
"epoch": 0.37716968787283994,
"grad_norm": 0.08448404468374969,
"learning_rate": 0.00015672476988027228,
"loss": 1.0388,
"step": 1225
},
{
"epoch": 0.37870915598660665,
"grad_norm": 0.08451055602238913,
"learning_rate": 0.0001562812333582482,
"loss": 1.0041,
"step": 1230
},
{
"epoch": 0.3802486241003733,
"grad_norm": 0.07357435195090126,
"learning_rate": 0.00015583607038182655,
"loss": 1.0286,
"step": 1235
},
{
"epoch": 0.38178809221414,
"grad_norm": 0.07981414373807207,
"learning_rate": 0.000155389293815638,
"loss": 1.0293,
"step": 1240
},
{
"epoch": 0.3833275603279067,
"grad_norm": 0.074241776686085,
"learning_rate": 0.00015494091657094385,
"loss": 1.033,
"step": 1245
},
{
"epoch": 0.3848670284416734,
"grad_norm": 0.07517760872068341,
"learning_rate": 0.00015449095160526292,
"loss": 1.0559,
"step": 1250
},
{
"epoch": 0.38640649655544007,
"grad_norm": 0.07476423646372729,
"learning_rate": 0.00015403941192199718,
"loss": 1.0343,
"step": 1255
},
{
"epoch": 0.3879459646692068,
"grad_norm": 0.07214431115898451,
"learning_rate": 0.0001535863105700558,
"loss": 1.0467,
"step": 1260
},
{
"epoch": 0.3894854327829735,
"grad_norm": 0.07644175139453621,
"learning_rate": 0.00015313166064347814,
"loss": 1.0188,
"step": 1265
},
{
"epoch": 0.3910249008967402,
"grad_norm": 0.06984113294468933,
"learning_rate": 0.00015267547528105538,
"loss": 1.0341,
"step": 1270
},
{
"epoch": 0.3925643690105069,
"grad_norm": 0.06690863213166448,
"learning_rate": 0.0001522177676659508,
"loss": 1.0625,
"step": 1275
},
{
"epoch": 0.39410383712427355,
"grad_norm": 0.06950059946334636,
"learning_rate": 0.00015175855102531887,
"loss": 1.0123,
"step": 1280
},
{
"epoch": 0.39564330523804025,
"grad_norm": 0.06964516306819979,
"learning_rate": 0.00015129783862992283,
"loss": 1.0201,
"step": 1285
},
{
"epoch": 0.39718277335180696,
"grad_norm": 0.080130428292661,
"learning_rate": 0.0001508356437937512,
"loss": 1.0448,
"step": 1290
},
{
"epoch": 0.39872224146557367,
"grad_norm": 0.07373600496196321,
"learning_rate": 0.00015037197987363338,
"loss": 1.0272,
"step": 1295
},
{
"epoch": 0.4002617095793403,
"grad_norm": 0.07444079143838864,
"learning_rate": 0.0001499068602688532,
"loss": 1.0625,
"step": 1300
},
{
"epoch": 0.401801177693107,
"grad_norm": 0.07495218470810172,
"learning_rate": 0.00014944029842076185,
"loss": 1.0277,
"step": 1305
},
{
"epoch": 0.40334064580687373,
"grad_norm": 0.07148138370454796,
"learning_rate": 0.0001489723078123896,
"loss": 1.0393,
"step": 1310
},
{
"epoch": 0.40488011392064044,
"grad_norm": 0.07184439296288066,
"learning_rate": 0.00014850290196805594,
"loss": 1.0413,
"step": 1315
},
{
"epoch": 0.4064195820344071,
"grad_norm": 0.07052670997188848,
"learning_rate": 0.00014803209445297887,
"loss": 1.0056,
"step": 1320
},
{
"epoch": 0.4079590501481738,
"grad_norm": 0.07344695167875763,
"learning_rate": 0.00014755989887288285,
"loss": 1.0411,
"step": 1325
},
{
"epoch": 0.4094985182619405,
"grad_norm": 0.0769706853052285,
"learning_rate": 0.00014708632887360564,
"loss": 1.0387,
"step": 1330
},
{
"epoch": 0.4110379863757072,
"grad_norm": 0.0739404718972198,
"learning_rate": 0.0001466113981407039,
"loss": 1.0452,
"step": 1335
},
{
"epoch": 0.41257745448947386,
"grad_norm": 0.08213823024505344,
"learning_rate": 0.00014613512039905765,
"loss": 1.0339,
"step": 1340
},
{
"epoch": 0.41411692260324057,
"grad_norm": 0.07328616522330499,
"learning_rate": 0.00014565750941247386,
"loss": 1.0133,
"step": 1345
},
{
"epoch": 0.4156563907170073,
"grad_norm": 0.07420350570178859,
"learning_rate": 0.0001451785789832884,
"loss": 1.0186,
"step": 1350
},
{
"epoch": 0.417195858830774,
"grad_norm": 0.07745166588825841,
"learning_rate": 0.00014469834295196743,
"loss": 1.0498,
"step": 1355
},
{
"epoch": 0.4187353269445407,
"grad_norm": 0.07355335272924506,
"learning_rate": 0.00014421681519670722,
"loss": 1.0435,
"step": 1360
},
{
"epoch": 0.42027479505830734,
"grad_norm": 0.07280563497532423,
"learning_rate": 0.0001437340096330332,
"loss": 1.0503,
"step": 1365
},
{
"epoch": 0.42181426317207404,
"grad_norm": 0.08984865578017236,
"learning_rate": 0.0001432499402133979,
"loss": 1.0373,
"step": 1370
},
{
"epoch": 0.42335373128584075,
"grad_norm": 0.07403634652711334,
"learning_rate": 0.0001427646209267775,
"loss": 1.0296,
"step": 1375
},
{
"epoch": 0.42489319939960746,
"grad_norm": 0.06941364422871579,
"learning_rate": 0.00014227806579826774,
"loss": 1.0097,
"step": 1380
},
{
"epoch": 0.4264326675133741,
"grad_norm": 0.07802992325990128,
"learning_rate": 0.00014179028888867867,
"loss": 1.0745,
"step": 1385
},
{
"epoch": 0.4279721356271408,
"grad_norm": 0.08282848213775869,
"learning_rate": 0.00014130130429412815,
"loss": 1.0273,
"step": 1390
},
{
"epoch": 0.4295116037409075,
"grad_norm": 0.08153105885218227,
"learning_rate": 0.0001408111261456346,
"loss": 1.0099,
"step": 1395
},
{
"epoch": 0.43105107185467423,
"grad_norm": 0.0676196363646441,
"learning_rate": 0.00014031976860870855,
"loss": 1.0086,
"step": 1400
},
{
"epoch": 0.43105107185467423,
"eval_loss": 1.0290647745132446,
"eval_runtime": 3812.7057,
"eval_samples_per_second": 6.061,
"eval_steps_per_second": 0.379,
"step": 1400
},
{
"epoch": 0.4325905399684409,
"grad_norm": 0.07464451118448104,
"learning_rate": 0.00013982724588294335,
"loss": 1.0198,
"step": 1405
},
{
"epoch": 0.4341300080822076,
"grad_norm": 0.07512676303409935,
"learning_rate": 0.00013933357220160476,
"loss": 1.0591,
"step": 1410
},
{
"epoch": 0.4356694761959743,
"grad_norm": 0.06938767156943278,
"learning_rate": 0.00013883876183121973,
"loss": 1.0523,
"step": 1415
},
{
"epoch": 0.437208944309741,
"grad_norm": 0.0777955845768148,
"learning_rate": 0.000138342829071164,
"loss": 1.0436,
"step": 1420
},
{
"epoch": 0.4387484124235077,
"grad_norm": 0.076768877519892,
"learning_rate": 0.00013784578825324885,
"loss": 1.0098,
"step": 1425
},
{
"epoch": 0.44028788053727436,
"grad_norm": 0.07138945295655025,
"learning_rate": 0.00013734765374130717,
"loss": 1.0262,
"step": 1430
},
{
"epoch": 0.44182734865104106,
"grad_norm": 0.07543031342117723,
"learning_rate": 0.00013684843993077788,
"loss": 1.0124,
"step": 1435
},
{
"epoch": 0.44336681676480777,
"grad_norm": 0.07202890380872994,
"learning_rate": 0.00013634816124829063,
"loss": 1.0183,
"step": 1440
},
{
"epoch": 0.4449062848785745,
"grad_norm": 0.06782846978809214,
"learning_rate": 0.0001358468321512481,
"loss": 1.0552,
"step": 1445
},
{
"epoch": 0.4464457529923411,
"grad_norm": 0.07176237407539487,
"learning_rate": 0.00013534446712740877,
"loss": 1.025,
"step": 1450
},
{
"epoch": 0.44798522110610783,
"grad_norm": 0.07601715009456655,
"learning_rate": 0.0001348410806944681,
"loss": 1.0153,
"step": 1455
},
{
"epoch": 0.44952468921987454,
"grad_norm": 0.0761359028970367,
"learning_rate": 0.00013433668739963882,
"loss": 1.0244,
"step": 1460
},
{
"epoch": 0.45106415733364125,
"grad_norm": 0.07165181931814346,
"learning_rate": 0.00013383130181923071,
"loss": 1.0237,
"step": 1465
},
{
"epoch": 0.4526036254474079,
"grad_norm": 0.07219427827224394,
"learning_rate": 0.00013332493855822936,
"loss": 1.0064,
"step": 1470
},
{
"epoch": 0.4541430935611746,
"grad_norm": 0.07315240256522645,
"learning_rate": 0.00013281761224987398,
"loss": 1.0049,
"step": 1475
},
{
"epoch": 0.4556825616749413,
"grad_norm": 0.07283831171004836,
"learning_rate": 0.00013230933755523466,
"loss": 1.028,
"step": 1480
},
{
"epoch": 0.457222029788708,
"grad_norm": 0.08277958377037488,
"learning_rate": 0.00013180012916278854,
"loss": 1.0402,
"step": 1485
},
{
"epoch": 0.45876149790247467,
"grad_norm": 0.0732834274712129,
"learning_rate": 0.00013129000178799548,
"loss": 1.0366,
"step": 1490
},
{
"epoch": 0.4603009660162414,
"grad_norm": 0.07270925231246442,
"learning_rate": 0.00013077897017287272,
"loss": 1.0006,
"step": 1495
},
{
"epoch": 0.4618404341300081,
"grad_norm": 0.07601545515982518,
"learning_rate": 0.00013026704908556888,
"loss": 1.0555,
"step": 1500
},
{
"epoch": 0.4633799022437748,
"grad_norm": 0.0775749000511019,
"learning_rate": 0.0001297542533199371,
"loss": 1.0409,
"step": 1505
},
{
"epoch": 0.4649193703575415,
"grad_norm": 0.07378648062711159,
"learning_rate": 0.00012924059769510768,
"loss": 1.0314,
"step": 1510
},
{
"epoch": 0.46645883847130815,
"grad_norm": 0.07321573368492998,
"learning_rate": 0.00012872609705505964,
"loss": 1.0502,
"step": 1515
},
{
"epoch": 0.46799830658507485,
"grad_norm": 0.07930733821420928,
"learning_rate": 0.00012821076626819196,
"loss": 1.0414,
"step": 1520
},
{
"epoch": 0.46953777469884156,
"grad_norm": 0.07511260278964532,
"learning_rate": 0.00012769462022689363,
"loss": 1.0205,
"step": 1525
},
{
"epoch": 0.47107724281260827,
"grad_norm": 0.06779778370699593,
"learning_rate": 0.0001271776738471136,
"loss": 1.0274,
"step": 1530
},
{
"epoch": 0.4726167109263749,
"grad_norm": 0.07717323700425802,
"learning_rate": 0.00012665994206792938,
"loss": 1.0589,
"step": 1535
},
{
"epoch": 0.4741561790401416,
"grad_norm": 0.06842508429114769,
"learning_rate": 0.00012614143985111565,
"loss": 1.0987,
"step": 1540
},
{
"epoch": 0.47569564715390833,
"grad_norm": 0.07643849238067586,
"learning_rate": 0.00012562218218071164,
"loss": 1.0218,
"step": 1545
},
{
"epoch": 0.47723511526767504,
"grad_norm": 0.07406016562514833,
"learning_rate": 0.0001251021840625883,
"loss": 1.0182,
"step": 1550
},
{
"epoch": 0.4787745833814417,
"grad_norm": 0.07954321360596633,
"learning_rate": 0.00012458146052401442,
"loss": 1.0283,
"step": 1555
},
{
"epoch": 0.4803140514952084,
"grad_norm": 0.07374926807557698,
"learning_rate": 0.00012406002661322264,
"loss": 1.0165,
"step": 1560
},
{
"epoch": 0.4818535196089751,
"grad_norm": 0.07376676091481264,
"learning_rate": 0.00012353789739897437,
"loss": 1.0503,
"step": 1565
},
{
"epoch": 0.4833929877227418,
"grad_norm": 0.07439474348790363,
"learning_rate": 0.00012301508797012432,
"loss": 1.0292,
"step": 1570
},
{
"epoch": 0.4849324558365085,
"grad_norm": 0.07661999249880341,
"learning_rate": 0.00012249161343518466,
"loss": 1.0111,
"step": 1575
},
{
"epoch": 0.48647192395027516,
"grad_norm": 0.07208564187421422,
"learning_rate": 0.00012196748892188816,
"loss": 1.0441,
"step": 1580
},
{
"epoch": 0.48801139206404187,
"grad_norm": 0.07849547483606649,
"learning_rate": 0.00012144272957675108,
"loss": 1.0235,
"step": 1585
},
{
"epoch": 0.4895508601778086,
"grad_norm": 0.07505211623162304,
"learning_rate": 0.00012091735056463562,
"loss": 1.0032,
"step": 1590
},
{
"epoch": 0.4910903282915753,
"grad_norm": 0.08312558481401704,
"learning_rate": 0.00012039136706831145,
"loss": 1.059,
"step": 1595
},
{
"epoch": 0.49262979640534194,
"grad_norm": 0.07454335234650318,
"learning_rate": 0.00011986479428801709,
"loss": 1.0362,
"step": 1600
},
{
"epoch": 0.49262979640534194,
"eval_loss": 1.0269535779953003,
"eval_runtime": 3800.036,
"eval_samples_per_second": 6.081,
"eval_steps_per_second": 0.38,
"step": 1600
},
{
"epoch": 0.49416926451910864,
"grad_norm": 0.07456362557265384,
"learning_rate": 0.00011933764744102058,
"loss": 1.016,
"step": 1605
},
{
"epoch": 0.49570873263287535,
"grad_norm": 0.08008662438079932,
"learning_rate": 0.00011880994176117976,
"loss": 1.0392,
"step": 1610
},
{
"epoch": 0.49724820074664206,
"grad_norm": 0.07176749751196013,
"learning_rate": 0.00011828169249850201,
"loss": 1.0392,
"step": 1615
},
{
"epoch": 0.4987876688604087,
"grad_norm": 0.07655608798136061,
"learning_rate": 0.00011775291491870351,
"loss": 1.0212,
"step": 1620
},
{
"epoch": 0.5003271369741754,
"grad_norm": 0.07786227626103659,
"learning_rate": 0.00011722362430276816,
"loss": 1.03,
"step": 1625
},
{
"epoch": 0.5018666050879421,
"grad_norm": 0.07799113973393568,
"learning_rate": 0.00011669383594650593,
"loss": 1.0589,
"step": 1630
},
{
"epoch": 0.5034060732017088,
"grad_norm": 0.06547994125184468,
"learning_rate": 0.00011616356516011083,
"loss": 1.0084,
"step": 1635
},
{
"epoch": 0.5049455413154755,
"grad_norm": 0.07784862670275924,
"learning_rate": 0.00011563282726771847,
"loss": 1.0449,
"step": 1640
},
{
"epoch": 0.5064850094292422,
"grad_norm": 0.0771399540024009,
"learning_rate": 0.0001151016376069632,
"loss": 1.0634,
"step": 1645
},
{
"epoch": 0.508024477543009,
"grad_norm": 0.07334720494239291,
"learning_rate": 0.00011457001152853493,
"loss": 1.0142,
"step": 1650
},
{
"epoch": 0.5095639456567755,
"grad_norm": 0.07439128068501075,
"learning_rate": 0.00011403796439573544,
"loss": 1.0309,
"step": 1655
},
{
"epoch": 0.5111034137705422,
"grad_norm": 0.0708288260968639,
"learning_rate": 0.00011350551158403442,
"loss": 1.0531,
"step": 1660
},
{
"epoch": 0.512642881884309,
"grad_norm": 0.06763171945470464,
"learning_rate": 0.0001129726684806252,
"loss": 1.0086,
"step": 1665
},
{
"epoch": 0.5141823499980757,
"grad_norm": 0.07768921401375369,
"learning_rate": 0.00011243945048398003,
"loss": 1.0148,
"step": 1670
},
{
"epoch": 0.5157218181118424,
"grad_norm": 0.06896327791840266,
"learning_rate": 0.000111905873003405,
"loss": 1.0261,
"step": 1675
},
{
"epoch": 0.5172612862256091,
"grad_norm": 0.07842199537412599,
"learning_rate": 0.00011137195145859494,
"loss": 0.999,
"step": 1680
},
{
"epoch": 0.5188007543393758,
"grad_norm": 0.06865343546929636,
"learning_rate": 0.00011083770127918762,
"loss": 0.9982,
"step": 1685
},
{
"epoch": 0.5203402224531425,
"grad_norm": 0.08103281697737574,
"learning_rate": 0.00011030313790431788,
"loss": 1.042,
"step": 1690
},
{
"epoch": 0.5218796905669091,
"grad_norm": 0.07974961051333619,
"learning_rate": 0.00010976827678217161,
"loss": 1.0039,
"step": 1695
},
{
"epoch": 0.5234191586806758,
"grad_norm": 0.06625737159764002,
"learning_rate": 0.00010923313336953913,
"loss": 1.0115,
"step": 1700
},
{
"epoch": 0.5249586267944425,
"grad_norm": 0.07000163587644152,
"learning_rate": 0.00010869772313136861,
"loss": 1.0223,
"step": 1705
},
{
"epoch": 0.5264980949082092,
"grad_norm": 0.06839426065828255,
"learning_rate": 0.00010816206154031916,
"loss": 1.0088,
"step": 1710
},
{
"epoch": 0.5280375630219759,
"grad_norm": 0.07949491269796151,
"learning_rate": 0.00010762616407631356,
"loss": 1.071,
"step": 1715
},
{
"epoch": 0.5295770311357426,
"grad_norm": 0.07557511886462906,
"learning_rate": 0.00010709004622609116,
"loss": 1.0676,
"step": 1720
},
{
"epoch": 0.5311164992495093,
"grad_norm": 0.08195884945191133,
"learning_rate": 0.00010655372348276006,
"loss": 1.0198,
"step": 1725
},
{
"epoch": 0.532655967363276,
"grad_norm": 0.0781242359342465,
"learning_rate": 0.00010601721134534959,
"loss": 1.0314,
"step": 1730
},
{
"epoch": 0.5341954354770427,
"grad_norm": 0.07628144377233338,
"learning_rate": 0.00010548052531836223,
"loss": 1.0299,
"step": 1735
},
{
"epoch": 0.5357349035908093,
"grad_norm": 0.07983920659956817,
"learning_rate": 0.00010494368091132576,
"loss": 1.0317,
"step": 1740
},
{
"epoch": 0.537274371704576,
"grad_norm": 0.07418551402340584,
"learning_rate": 0.00010440669363834483,
"loss": 1.0129,
"step": 1745
},
{
"epoch": 0.5388138398183427,
"grad_norm": 0.07002417164492032,
"learning_rate": 0.00010386957901765277,
"loss": 1.0278,
"step": 1750
},
{
"epoch": 0.5403533079321095,
"grad_norm": 0.0707377171946109,
"learning_rate": 0.00010333235257116313,
"loss": 0.9727,
"step": 1755
},
{
"epoch": 0.5418927760458762,
"grad_norm": 0.0737915692626489,
"learning_rate": 0.00010279502982402103,
"loss": 1.0433,
"step": 1760
},
{
"epoch": 0.5434322441596429,
"grad_norm": 0.07512990163556856,
"learning_rate": 0.00010225762630415457,
"loss": 1.0111,
"step": 1765
},
{
"epoch": 0.5449717122734096,
"grad_norm": 0.0753662165245646,
"learning_rate": 0.00010172015754182607,
"loss": 1.037,
"step": 1770
},
{
"epoch": 0.5465111803871763,
"grad_norm": 0.1349186814580228,
"learning_rate": 0.00010118263906918331,
"loss": 1.0381,
"step": 1775
},
{
"epoch": 0.5480506485009429,
"grad_norm": 0.07557478098317172,
"learning_rate": 0.00010064508641981054,
"loss": 0.9955,
"step": 1780
},
{
"epoch": 0.5495901166147096,
"grad_norm": 0.07668998832423247,
"learning_rate": 0.0001001075151282798,
"loss": 1.051,
"step": 1785
},
{
"epoch": 0.5511295847284763,
"grad_norm": 0.07585620860956059,
"learning_rate": 9.956994072970179e-05,
"loss": 1.0272,
"step": 1790
},
{
"epoch": 0.552669052842243,
"grad_norm": 0.07008056728318604,
"learning_rate": 9.903237875927698e-05,
"loss": 1.0653,
"step": 1795
},
{
"epoch": 0.5542085209560097,
"grad_norm": 0.07151388140558161,
"learning_rate": 9.849484475184672e-05,
"loss": 1.0155,
"step": 1800
},
{
"epoch": 0.5542085209560097,
"eval_loss": 1.0255825519561768,
"eval_runtime": 3798.3842,
"eval_samples_per_second": 6.084,
"eval_steps_per_second": 0.38,
"step": 1800
},
{
"epoch": 0.5557479890697764,
"grad_norm": 0.08081651371496165,
"learning_rate": 9.795735424144428e-05,
"loss": 1.0102,
"step": 1805
},
{
"epoch": 0.5572874571835431,
"grad_norm": 0.11914760104172627,
"learning_rate": 9.74199227608459e-05,
"loss": 1.0316,
"step": 1810
},
{
"epoch": 0.5588269252973098,
"grad_norm": 0.07607697309485299,
"learning_rate": 9.688256584112192e-05,
"loss": 1.0158,
"step": 1815
},
{
"epoch": 0.5603663934110765,
"grad_norm": 0.07962971403841683,
"learning_rate": 9.634529901118799e-05,
"loss": 1.0243,
"step": 1820
},
{
"epoch": 0.5619058615248431,
"grad_norm": 0.0715309251551143,
"learning_rate": 9.580813779735624e-05,
"loss": 1.0354,
"step": 1825
},
{
"epoch": 0.5634453296386098,
"grad_norm": 0.0752110141233035,
"learning_rate": 9.52710977228867e-05,
"loss": 1.0291,
"step": 1830
},
{
"epoch": 0.5649847977523765,
"grad_norm": 0.07768363081585787,
"learning_rate": 9.473419430753864e-05,
"loss": 0.9735,
"step": 1835
},
{
"epoch": 0.5665242658661432,
"grad_norm": 0.07642437297948991,
"learning_rate": 9.419744306712197e-05,
"loss": 1.0035,
"step": 1840
},
{
"epoch": 0.56806373397991,
"grad_norm": 0.0725535618760288,
"learning_rate": 9.3660859513049e-05,
"loss": 1.0624,
"step": 1845
},
{
"epoch": 0.5696032020936767,
"grad_norm": 0.075838714661654,
"learning_rate": 9.312445915188609e-05,
"loss": 1.0273,
"step": 1850
},
{
"epoch": 0.5711426702074434,
"grad_norm": 0.07494133610203221,
"learning_rate": 9.258825748490558e-05,
"loss": 1.043,
"step": 1855
},
{
"epoch": 0.5726821383212101,
"grad_norm": 0.0774699889487116,
"learning_rate": 9.205227000763788e-05,
"loss": 1.0386,
"step": 1860
},
{
"epoch": 0.5742216064349767,
"grad_norm": 0.0767041878914581,
"learning_rate": 9.151651220942349e-05,
"loss": 1.0475,
"step": 1865
},
{
"epoch": 0.5757610745487434,
"grad_norm": 0.07367320031783889,
"learning_rate": 9.098099957296552e-05,
"loss": 1.0356,
"step": 1870
},
{
"epoch": 0.5773005426625101,
"grad_norm": 0.07237560796588688,
"learning_rate": 9.044574757388224e-05,
"loss": 1.0291,
"step": 1875
},
{
"epoch": 0.5788400107762768,
"grad_norm": 0.0733597412062657,
"learning_rate": 8.991077168025976e-05,
"loss": 1.0289,
"step": 1880
},
{
"epoch": 0.5803794788900435,
"grad_norm": 0.0781082222836356,
"learning_rate": 8.937608735220527e-05,
"loss": 1.0411,
"step": 1885
},
{
"epoch": 0.5819189470038102,
"grad_norm": 0.0787222190348577,
"learning_rate": 8.884171004139996e-05,
"loss": 1.0176,
"step": 1890
},
{
"epoch": 0.5834584151175769,
"grad_norm": 0.07114468470409495,
"learning_rate": 8.830765519065262e-05,
"loss": 0.9838,
"step": 1895
},
{
"epoch": 0.5849978832313436,
"grad_norm": 0.08086984212119945,
"learning_rate": 8.777393823345343e-05,
"loss": 1.0438,
"step": 1900
},
{
"epoch": 0.5865373513451103,
"grad_norm": 0.0703832911854875,
"learning_rate": 8.724057459352784e-05,
"loss": 0.9889,
"step": 1905
},
{
"epoch": 0.5880768194588769,
"grad_norm": 0.06956832590218136,
"learning_rate": 8.670757968439086e-05,
"loss": 1.0573,
"step": 1910
},
{
"epoch": 0.5896162875726436,
"grad_norm": 0.07709995404199901,
"learning_rate": 8.617496890890179e-05,
"loss": 1.0277,
"step": 1915
},
{
"epoch": 0.5911557556864103,
"grad_norm": 0.07724046895867277,
"learning_rate": 8.564275765881887e-05,
"loss": 1.0349,
"step": 1920
},
{
"epoch": 0.592695223800177,
"grad_norm": 0.0743764979962109,
"learning_rate": 8.511096131435454e-05,
"loss": 1.0117,
"step": 1925
},
{
"epoch": 0.5942346919139437,
"grad_norm": 0.07389820884013973,
"learning_rate": 8.457959524373109e-05,
"loss": 1.025,
"step": 1930
},
{
"epoch": 0.5957741600277104,
"grad_norm": 0.07448602446253362,
"learning_rate": 8.404867480273636e-05,
"loss": 1.0524,
"step": 1935
},
{
"epoch": 0.5973136281414771,
"grad_norm": 0.07016834564183058,
"learning_rate": 8.351821533428023e-05,
"loss": 1.0253,
"step": 1940
},
{
"epoch": 0.5988530962552439,
"grad_norm": 0.07324470240300807,
"learning_rate": 8.298823216795093e-05,
"loss": 1.0454,
"step": 1945
},
{
"epoch": 0.6003925643690106,
"grad_norm": 0.07179143590884257,
"learning_rate": 8.245874061957224e-05,
"loss": 1.0349,
"step": 1950
},
{
"epoch": 0.6019320324827772,
"grad_norm": 0.07332238543813964,
"learning_rate": 8.192975599076078e-05,
"loss": 1.0112,
"step": 1955
},
{
"epoch": 0.6034715005965439,
"grad_norm": 0.06832929405392676,
"learning_rate": 8.140129356848387e-05,
"loss": 1.0159,
"step": 1960
},
{
"epoch": 0.6050109687103106,
"grad_norm": 0.07163605823180465,
"learning_rate": 8.087336862461783e-05,
"loss": 1.0064,
"step": 1965
},
{
"epoch": 0.6065504368240773,
"grad_norm": 0.07827638868021947,
"learning_rate": 8.034599641550642e-05,
"loss": 1.0431,
"step": 1970
},
{
"epoch": 0.608089904937844,
"grad_norm": 0.07793183715870357,
"learning_rate": 7.981919218152016e-05,
"loss": 0.9968,
"step": 1975
},
{
"epoch": 0.6096293730516107,
"grad_norm": 0.07306876363104758,
"learning_rate": 7.929297114661581e-05,
"loss": 1.0114,
"step": 1980
},
{
"epoch": 0.6111688411653774,
"grad_norm": 0.07438965485093788,
"learning_rate": 7.876734851789643e-05,
"loss": 1.042,
"step": 1985
},
{
"epoch": 0.6127083092791441,
"grad_norm": 0.07813162760393345,
"learning_rate": 7.824233948517185e-05,
"loss": 1.0437,
"step": 1990
},
{
"epoch": 0.6142477773929107,
"grad_norm": 0.07126718370690863,
"learning_rate": 7.771795922051999e-05,
"loss": 1.0444,
"step": 1995
},
{
"epoch": 0.6157872455066774,
"grad_norm": 0.0756263745043903,
"learning_rate": 7.719422287784798e-05,
"loss": 1.0138,
"step": 2000
},
{
"epoch": 0.6157872455066774,
"eval_loss": 1.02396821975708,
"eval_runtime": 3800.6654,
"eval_samples_per_second": 6.08,
"eval_steps_per_second": 0.38,
"step": 2000
}
],
"logging_steps": 5,
"max_steps": 3247,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 2.3416021014544384e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}