{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.98793242156074,
"eval_steps": 500,
"global_step": 775,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006436041834271922,
"grad_norm": 181.50096130371094,
"learning_rate": 8.333333333333335e-09,
"loss": 8.4196,
"num_input_tokens_seen": 6848,
"step": 1
},
{
"epoch": 0.012872083668543845,
"grad_norm": 187.05642700195312,
"learning_rate": 1.666666666666667e-08,
"loss": 8.44,
"num_input_tokens_seen": 14000,
"step": 2
},
{
"epoch": 0.019308125502815767,
"grad_norm": 182.92320251464844,
"learning_rate": 2.5000000000000002e-08,
"loss": 8.3839,
"num_input_tokens_seen": 21152,
"step": 3
},
{
"epoch": 0.02574416733708769,
"grad_norm": 186.71311950683594,
"learning_rate": 3.333333333333334e-08,
"loss": 8.4024,
"num_input_tokens_seen": 28224,
"step": 4
},
{
"epoch": 0.032180209171359615,
"grad_norm": 180.32656860351562,
"learning_rate": 4.166666666666667e-08,
"loss": 8.4594,
"num_input_tokens_seen": 35360,
"step": 5
},
{
"epoch": 0.038616251005631534,
"grad_norm": 189.87557983398438,
"learning_rate": 5.0000000000000004e-08,
"loss": 8.4107,
"num_input_tokens_seen": 42192,
"step": 6
},
{
"epoch": 0.04505229283990346,
"grad_norm": 185.89984130859375,
"learning_rate": 5.833333333333334e-08,
"loss": 8.4551,
"num_input_tokens_seen": 49088,
"step": 7
},
{
"epoch": 0.05148833467417538,
"grad_norm": 188.8160400390625,
"learning_rate": 6.666666666666668e-08,
"loss": 8.4415,
"num_input_tokens_seen": 55856,
"step": 8
},
{
"epoch": 0.057924376508447305,
"grad_norm": 190.1417236328125,
"learning_rate": 7.500000000000001e-08,
"loss": 8.4965,
"num_input_tokens_seen": 63120,
"step": 9
},
{
"epoch": 0.06436041834271923,
"grad_norm": 185.3598175048828,
"learning_rate": 8.333333333333334e-08,
"loss": 8.4251,
"num_input_tokens_seen": 69968,
"step": 10
},
{
"epoch": 0.07079646017699115,
"grad_norm": 183.81944274902344,
"learning_rate": 9.166666666666668e-08,
"loss": 8.4291,
"num_input_tokens_seen": 77168,
"step": 11
},
{
"epoch": 0.07723250201126307,
"grad_norm": 196.39779663085938,
"learning_rate": 1.0000000000000001e-07,
"loss": 8.4463,
"num_input_tokens_seen": 84272,
"step": 12
},
{
"epoch": 0.083668543845535,
"grad_norm": 181.4925994873047,
"learning_rate": 1.0833333333333335e-07,
"loss": 8.5116,
"num_input_tokens_seen": 91232,
"step": 13
},
{
"epoch": 0.09010458567980692,
"grad_norm": 190.0314178466797,
"learning_rate": 1.1666666666666668e-07,
"loss": 8.4749,
"num_input_tokens_seen": 97968,
"step": 14
},
{
"epoch": 0.09654062751407884,
"grad_norm": 188.7615203857422,
"learning_rate": 1.2500000000000002e-07,
"loss": 8.3311,
"num_input_tokens_seen": 104864,
"step": 15
},
{
"epoch": 0.10297666934835076,
"grad_norm": 184.1820526123047,
"learning_rate": 1.3333333333333336e-07,
"loss": 8.3729,
"num_input_tokens_seen": 111488,
"step": 16
},
{
"epoch": 0.10941271118262269,
"grad_norm": 181.39308166503906,
"learning_rate": 1.4166666666666668e-07,
"loss": 8.4261,
"num_input_tokens_seen": 118384,
"step": 17
},
{
"epoch": 0.11584875301689461,
"grad_norm": 181.79583740234375,
"learning_rate": 1.5000000000000002e-07,
"loss": 8.3051,
"num_input_tokens_seen": 125360,
"step": 18
},
{
"epoch": 0.12228479485116653,
"grad_norm": 181.36965942382812,
"learning_rate": 1.5833333333333336e-07,
"loss": 8.2461,
"num_input_tokens_seen": 132320,
"step": 19
},
{
"epoch": 0.12872083668543846,
"grad_norm": 182.36839294433594,
"learning_rate": 1.6666666666666668e-07,
"loss": 8.2894,
"num_input_tokens_seen": 139376,
"step": 20
},
{
"epoch": 0.13515687851971037,
"grad_norm": 189.7889404296875,
"learning_rate": 1.7500000000000002e-07,
"loss": 8.2484,
"num_input_tokens_seen": 146544,
"step": 21
},
{
"epoch": 0.1415929203539823,
"grad_norm": 190.1185302734375,
"learning_rate": 1.8333333333333336e-07,
"loss": 8.3034,
"num_input_tokens_seen": 153472,
"step": 22
},
{
"epoch": 0.14802896218825423,
"grad_norm": 183.1331024169922,
"learning_rate": 1.9166666666666668e-07,
"loss": 8.054,
"num_input_tokens_seen": 159856,
"step": 23
},
{
"epoch": 0.15446500402252614,
"grad_norm": 168.13046264648438,
"learning_rate": 2.0000000000000002e-07,
"loss": 7.9583,
"num_input_tokens_seen": 166528,
"step": 24
},
{
"epoch": 0.16090104585679807,
"grad_norm": 167.57830810546875,
"learning_rate": 2.0833333333333333e-07,
"loss": 7.9626,
"num_input_tokens_seen": 173056,
"step": 25
},
{
"epoch": 0.16733708769107,
"grad_norm": 170.6557159423828,
"learning_rate": 2.166666666666667e-07,
"loss": 7.8761,
"num_input_tokens_seen": 179616,
"step": 26
},
{
"epoch": 0.1737731295253419,
"grad_norm": 179.7693328857422,
"learning_rate": 2.2500000000000002e-07,
"loss": 7.8896,
"num_input_tokens_seen": 186912,
"step": 27
},
{
"epoch": 0.18020917135961384,
"grad_norm": 180.4197998046875,
"learning_rate": 2.3333333333333336e-07,
"loss": 7.8352,
"num_input_tokens_seen": 193936,
"step": 28
},
{
"epoch": 0.18664521319388577,
"grad_norm": 164.2944793701172,
"learning_rate": 2.416666666666667e-07,
"loss": 7.691,
"num_input_tokens_seen": 200672,
"step": 29
},
{
"epoch": 0.19308125502815768,
"grad_norm": 167.71722412109375,
"learning_rate": 2.5000000000000004e-07,
"loss": 7.7851,
"num_input_tokens_seen": 207536,
"step": 30
},
{
"epoch": 0.1995172968624296,
"grad_norm": 169.2217254638672,
"learning_rate": 2.5833333333333333e-07,
"loss": 7.7249,
"num_input_tokens_seen": 214640,
"step": 31
},
{
"epoch": 0.20595333869670152,
"grad_norm": 155.74537658691406,
"learning_rate": 2.666666666666667e-07,
"loss": 6.8838,
"num_input_tokens_seen": 221744,
"step": 32
},
{
"epoch": 0.21238938053097345,
"grad_norm": 148.12120056152344,
"learning_rate": 2.75e-07,
"loss": 6.7173,
"num_input_tokens_seen": 228624,
"step": 33
},
{
"epoch": 0.21882542236524538,
"grad_norm": 150.97012329101562,
"learning_rate": 2.8333333333333336e-07,
"loss": 6.6793,
"num_input_tokens_seen": 235456,
"step": 34
},
{
"epoch": 0.2252614641995173,
"grad_norm": 149.623291015625,
"learning_rate": 2.916666666666667e-07,
"loss": 6.725,
"num_input_tokens_seen": 242768,
"step": 35
},
{
"epoch": 0.23169750603378922,
"grad_norm": 147.1656036376953,
"learning_rate": 3.0000000000000004e-07,
"loss": 6.6905,
"num_input_tokens_seen": 249552,
"step": 36
},
{
"epoch": 0.23813354786806115,
"grad_norm": 151.0162811279297,
"learning_rate": 3.083333333333334e-07,
"loss": 6.6179,
"num_input_tokens_seen": 256160,
"step": 37
},
{
"epoch": 0.24456958970233306,
"grad_norm": 150.03030395507812,
"learning_rate": 3.166666666666667e-07,
"loss": 6.501,
"num_input_tokens_seen": 262912,
"step": 38
},
{
"epoch": 0.251005631536605,
"grad_norm": 145.5784149169922,
"learning_rate": 3.25e-07,
"loss": 6.4588,
"num_input_tokens_seen": 269600,
"step": 39
},
{
"epoch": 0.2574416733708769,
"grad_norm": 143.5873565673828,
"learning_rate": 3.3333333333333335e-07,
"loss": 6.3614,
"num_input_tokens_seen": 276560,
"step": 40
},
{
"epoch": 0.26387771520514886,
"grad_norm": 144.9624481201172,
"learning_rate": 3.416666666666667e-07,
"loss": 6.2775,
"num_input_tokens_seen": 283696,
"step": 41
},
{
"epoch": 0.27031375703942073,
"grad_norm": 146.71554565429688,
"learning_rate": 3.5000000000000004e-07,
"loss": 5.9868,
"num_input_tokens_seen": 290832,
"step": 42
},
{
"epoch": 0.27674979887369267,
"grad_norm": 138.25450134277344,
"learning_rate": 3.583333333333334e-07,
"loss": 5.2286,
"num_input_tokens_seen": 298096,
"step": 43
},
{
"epoch": 0.2831858407079646,
"grad_norm": 156.28713989257812,
"learning_rate": 3.666666666666667e-07,
"loss": 4.5076,
"num_input_tokens_seen": 305120,
"step": 44
},
{
"epoch": 0.28962188254223653,
"grad_norm": 178.4820556640625,
"learning_rate": 3.75e-07,
"loss": 4.1167,
"num_input_tokens_seen": 312000,
"step": 45
},
{
"epoch": 0.29605792437650846,
"grad_norm": 317.7680358886719,
"learning_rate": 3.8333333333333335e-07,
"loss": 3.6585,
"num_input_tokens_seen": 319008,
"step": 46
},
{
"epoch": 0.3024939662107804,
"grad_norm": 282.17803955078125,
"learning_rate": 3.9166666666666675e-07,
"loss": 3.3613,
"num_input_tokens_seen": 326192,
"step": 47
},
{
"epoch": 0.3089300080450523,
"grad_norm": 257.7794494628906,
"learning_rate": 4.0000000000000003e-07,
"loss": 3.1068,
"num_input_tokens_seen": 333664,
"step": 48
},
{
"epoch": 0.3153660498793242,
"grad_norm": 255.1024169921875,
"learning_rate": 4.083333333333334e-07,
"loss": 2.9368,
"num_input_tokens_seen": 340912,
"step": 49
},
{
"epoch": 0.32180209171359614,
"grad_norm": 259.47015380859375,
"learning_rate": 4.1666666666666667e-07,
"loss": 2.3466,
"num_input_tokens_seen": 347712,
"step": 50
},
{
"epoch": 0.32823813354786807,
"grad_norm": 263.3533935546875,
"learning_rate": 4.2500000000000006e-07,
"loss": 2.0645,
"num_input_tokens_seen": 355232,
"step": 51
},
{
"epoch": 0.33467417538214,
"grad_norm": 239.1399688720703,
"learning_rate": 4.333333333333334e-07,
"loss": 1.7729,
"num_input_tokens_seen": 361968,
"step": 52
},
{
"epoch": 0.3411102172164119,
"grad_norm": 257.4410095214844,
"learning_rate": 4.416666666666667e-07,
"loss": 1.6199,
"num_input_tokens_seen": 369136,
"step": 53
},
{
"epoch": 0.3475462590506838,
"grad_norm": 169.56935119628906,
"learning_rate": 4.5000000000000003e-07,
"loss": 1.1593,
"num_input_tokens_seen": 375904,
"step": 54
},
{
"epoch": 0.35398230088495575,
"grad_norm": 95.25677490234375,
"learning_rate": 4.583333333333333e-07,
"loss": 0.7199,
"num_input_tokens_seen": 382848,
"step": 55
},
{
"epoch": 0.3604183427192277,
"grad_norm": 48.7137451171875,
"learning_rate": 4.666666666666667e-07,
"loss": 0.4394,
"num_input_tokens_seen": 389680,
"step": 56
},
{
"epoch": 0.3668543845534996,
"grad_norm": 62.34474563598633,
"learning_rate": 4.7500000000000006e-07,
"loss": 0.3806,
"num_input_tokens_seen": 396192,
"step": 57
},
{
"epoch": 0.37329042638777155,
"grad_norm": 30.711780548095703,
"learning_rate": 4.833333333333334e-07,
"loss": 0.3185,
"num_input_tokens_seen": 403104,
"step": 58
},
{
"epoch": 0.3797264682220434,
"grad_norm": 34.46913528442383,
"learning_rate": 4.916666666666667e-07,
"loss": 0.3056,
"num_input_tokens_seen": 410176,
"step": 59
},
{
"epoch": 0.38616251005631536,
"grad_norm": 25.92363166809082,
"learning_rate": 5.000000000000001e-07,
"loss": 0.2981,
"num_input_tokens_seen": 416928,
"step": 60
},
{
"epoch": 0.3925985518905873,
"grad_norm": 11.064619064331055,
"learning_rate": 5.083333333333334e-07,
"loss": 0.2473,
"num_input_tokens_seen": 424128,
"step": 61
},
{
"epoch": 0.3990345937248592,
"grad_norm": 55.367347717285156,
"learning_rate": 5.166666666666667e-07,
"loss": 0.2924,
"num_input_tokens_seen": 430864,
"step": 62
},
{
"epoch": 0.40547063555913115,
"grad_norm": 42.00873947143555,
"learning_rate": 5.250000000000001e-07,
"loss": 0.2656,
"num_input_tokens_seen": 437744,
"step": 63
},
{
"epoch": 0.41190667739340303,
"grad_norm": 13.313591003417969,
"learning_rate": 5.333333333333335e-07,
"loss": 0.2335,
"num_input_tokens_seen": 444624,
"step": 64
},
{
"epoch": 0.41834271922767496,
"grad_norm": 60.489715576171875,
"learning_rate": 5.416666666666667e-07,
"loss": 0.2647,
"num_input_tokens_seen": 451696,
"step": 65
},
{
"epoch": 0.4247787610619469,
"grad_norm": 77.01821899414062,
"learning_rate": 5.5e-07,
"loss": 0.3003,
"num_input_tokens_seen": 458784,
"step": 66
},
{
"epoch": 0.43121480289621883,
"grad_norm": 58.067596435546875,
"learning_rate": 5.583333333333333e-07,
"loss": 0.2656,
"num_input_tokens_seen": 465920,
"step": 67
},
{
"epoch": 0.43765084473049076,
"grad_norm": 12.40570068359375,
"learning_rate": 5.666666666666667e-07,
"loss": 0.2212,
"num_input_tokens_seen": 473152,
"step": 68
},
{
"epoch": 0.4440868865647627,
"grad_norm": 35.392276763916016,
"learning_rate": 5.750000000000001e-07,
"loss": 0.2532,
"num_input_tokens_seen": 480544,
"step": 69
},
{
"epoch": 0.4505229283990346,
"grad_norm": 51.42181396484375,
"learning_rate": 5.833333333333334e-07,
"loss": 0.2799,
"num_input_tokens_seen": 487552,
"step": 70
},
{
"epoch": 0.4569589702333065,
"grad_norm": 45.73934555053711,
"learning_rate": 5.916666666666667e-07,
"loss": 0.2876,
"num_input_tokens_seen": 494256,
"step": 71
},
{
"epoch": 0.46339501206757844,
"grad_norm": 20.654096603393555,
"learning_rate": 6.000000000000001e-07,
"loss": 0.2191,
"num_input_tokens_seen": 500768,
"step": 72
},
{
"epoch": 0.46983105390185037,
"grad_norm": 21.078027725219727,
"learning_rate": 6.083333333333334e-07,
"loss": 0.2344,
"num_input_tokens_seen": 507136,
"step": 73
},
{
"epoch": 0.4762670957361223,
"grad_norm": 36.7335205078125,
"learning_rate": 6.166666666666668e-07,
"loss": 0.2547,
"num_input_tokens_seen": 514208,
"step": 74
},
{
"epoch": 0.4827031375703942,
"grad_norm": 34.47271728515625,
"learning_rate": 6.25e-07,
"loss": 0.2349,
"num_input_tokens_seen": 521120,
"step": 75
},
{
"epoch": 0.4891391794046661,
"grad_norm": 5.103244781494141,
"learning_rate": 6.333333333333334e-07,
"loss": 0.2045,
"num_input_tokens_seen": 527824,
"step": 76
},
{
"epoch": 0.49557522123893805,
"grad_norm": 22.47526741027832,
"learning_rate": 6.416666666666667e-07,
"loss": 0.2262,
"num_input_tokens_seen": 534832,
"step": 77
},
{
"epoch": 0.50201126307321,
"grad_norm": 30.610803604125977,
"learning_rate": 6.5e-07,
"loss": 0.2393,
"num_input_tokens_seen": 541696,
"step": 78
},
{
"epoch": 0.5084473049074819,
"grad_norm": 10.922965049743652,
"learning_rate": 6.583333333333333e-07,
"loss": 0.2206,
"num_input_tokens_seen": 548608,
"step": 79
},
{
"epoch": 0.5148833467417538,
"grad_norm": 17.484182357788086,
"learning_rate": 6.666666666666667e-07,
"loss": 0.2029,
"num_input_tokens_seen": 555456,
"step": 80
},
{
"epoch": 0.5213193885760258,
"grad_norm": 16.49226188659668,
"learning_rate": 6.750000000000001e-07,
"loss": 0.2125,
"num_input_tokens_seen": 562768,
"step": 81
},
{
"epoch": 0.5277554304102977,
"grad_norm": 9.977084159851074,
"learning_rate": 6.833333333333334e-07,
"loss": 0.2023,
"num_input_tokens_seen": 569536,
"step": 82
},
{
"epoch": 0.5341914722445696,
"grad_norm": 17.79197120666504,
"learning_rate": 6.916666666666668e-07,
"loss": 0.2262,
"num_input_tokens_seen": 576096,
"step": 83
},
{
"epoch": 0.5406275140788415,
"grad_norm": 16.699260711669922,
"learning_rate": 7.000000000000001e-07,
"loss": 0.2003,
"num_input_tokens_seen": 583472,
"step": 84
},
{
"epoch": 0.5470635559131134,
"grad_norm": 25.02164077758789,
"learning_rate": 7.083333333333334e-07,
"loss": 0.2351,
"num_input_tokens_seen": 590304,
"step": 85
},
{
"epoch": 0.5534995977473853,
"grad_norm": 3.8612709045410156,
"learning_rate": 7.166666666666668e-07,
"loss": 0.1839,
"num_input_tokens_seen": 597152,
"step": 86
},
{
"epoch": 0.5599356395816573,
"grad_norm": 31.555482864379883,
"learning_rate": 7.25e-07,
"loss": 0.2315,
"num_input_tokens_seen": 604208,
"step": 87
},
{
"epoch": 0.5663716814159292,
"grad_norm": 54.94756317138672,
"learning_rate": 7.333333333333334e-07,
"loss": 0.2732,
"num_input_tokens_seen": 610896,
"step": 88
},
{
"epoch": 0.5728077232502011,
"grad_norm": 30.55241584777832,
"learning_rate": 7.416666666666668e-07,
"loss": 0.2405,
"num_input_tokens_seen": 618112,
"step": 89
},
{
"epoch": 0.5792437650844731,
"grad_norm": 16.687997817993164,
"learning_rate": 7.5e-07,
"loss": 0.2005,
"num_input_tokens_seen": 625040,
"step": 90
},
{
"epoch": 0.585679806918745,
"grad_norm": 10.350790977478027,
"learning_rate": 7.583333333333334e-07,
"loss": 0.2005,
"num_input_tokens_seen": 631840,
"step": 91
},
{
"epoch": 0.5921158487530169,
"grad_norm": 25.88368797302246,
"learning_rate": 7.666666666666667e-07,
"loss": 0.2115,
"num_input_tokens_seen": 638752,
"step": 92
},
{
"epoch": 0.5985518905872889,
"grad_norm": 17.11625099182129,
"learning_rate": 7.750000000000001e-07,
"loss": 0.2141,
"num_input_tokens_seen": 645968,
"step": 93
},
{
"epoch": 0.6049879324215608,
"grad_norm": 12.70864200592041,
"learning_rate": 7.833333333333335e-07,
"loss": 0.1898,
"num_input_tokens_seen": 652752,
"step": 94
},
{
"epoch": 0.6114239742558326,
"grad_norm": 3.674001455307007,
"learning_rate": 7.916666666666667e-07,
"loss": 0.2099,
"num_input_tokens_seen": 660048,
"step": 95
},
{
"epoch": 0.6178600160901045,
"grad_norm": 20.51032066345215,
"learning_rate": 8.000000000000001e-07,
"loss": 0.2014,
"num_input_tokens_seen": 666752,
"step": 96
},
{
"epoch": 0.6242960579243765,
"grad_norm": 47.562381744384766,
"learning_rate": 8.083333333333334e-07,
"loss": 0.2349,
"num_input_tokens_seen": 673856,
"step": 97
},
{
"epoch": 0.6307320997586484,
"grad_norm": 35.69169998168945,
"learning_rate": 8.166666666666668e-07,
"loss": 0.2205,
"num_input_tokens_seen": 681104,
"step": 98
},
{
"epoch": 0.6371681415929203,
"grad_norm": 10.080629348754883,
"learning_rate": 8.250000000000001e-07,
"loss": 0.199,
"num_input_tokens_seen": 688128,
"step": 99
},
{
"epoch": 0.6436041834271923,
"grad_norm": 26.242666244506836,
"learning_rate": 8.333333333333333e-07,
"loss": 0.236,
"num_input_tokens_seen": 695216,
"step": 100
},
{
"epoch": 0.6500402252614642,
"grad_norm": 22.0434627532959,
"learning_rate": 8.416666666666667e-07,
"loss": 0.2265,
"num_input_tokens_seen": 701968,
"step": 101
},
{
"epoch": 0.6564762670957361,
"grad_norm": 27.378408432006836,
"learning_rate": 8.500000000000001e-07,
"loss": 0.2443,
"num_input_tokens_seen": 708928,
"step": 102
},
{
"epoch": 0.6629123089300081,
"grad_norm": 11.929069519042969,
"learning_rate": 8.583333333333334e-07,
"loss": 0.2086,
"num_input_tokens_seen": 715952,
"step": 103
},
{
"epoch": 0.66934835076428,
"grad_norm": 6.677243232727051,
"learning_rate": 8.666666666666668e-07,
"loss": 0.1915,
"num_input_tokens_seen": 722928,
"step": 104
},
{
"epoch": 0.6757843925985519,
"grad_norm": 17.033658981323242,
"learning_rate": 8.75e-07,
"loss": 0.1967,
"num_input_tokens_seen": 730160,
"step": 105
},
{
"epoch": 0.6822204344328238,
"grad_norm": 6.806990146636963,
"learning_rate": 8.833333333333334e-07,
"loss": 0.188,
"num_input_tokens_seen": 737088,
"step": 106
},
{
"epoch": 0.6886564762670957,
"grad_norm": 4.871335506439209,
"learning_rate": 8.916666666666668e-07,
"loss": 0.1895,
"num_input_tokens_seen": 743744,
"step": 107
},
{
"epoch": 0.6950925181013676,
"grad_norm": 9.054122924804688,
"learning_rate": 9.000000000000001e-07,
"loss": 0.1667,
"num_input_tokens_seen": 750496,
"step": 108
},
{
"epoch": 0.7015285599356396,
"grad_norm": 15.78903579711914,
"learning_rate": 9.083333333333335e-07,
"loss": 0.1976,
"num_input_tokens_seen": 757792,
"step": 109
},
{
"epoch": 0.7079646017699115,
"grad_norm": 10.51429271697998,
"learning_rate": 9.166666666666666e-07,
"loss": 0.2057,
"num_input_tokens_seen": 764992,
"step": 110
},
{
"epoch": 0.7144006436041834,
"grad_norm": 24.346830368041992,
"learning_rate": 9.25e-07,
"loss": 0.2002,
"num_input_tokens_seen": 771648,
"step": 111
},
{
"epoch": 0.7208366854384554,
"grad_norm": 46.50392532348633,
"learning_rate": 9.333333333333334e-07,
"loss": 0.2173,
"num_input_tokens_seen": 778480,
"step": 112
},
{
"epoch": 0.7272727272727273,
"grad_norm": 22.505762100219727,
"learning_rate": 9.416666666666667e-07,
"loss": 0.1756,
"num_input_tokens_seen": 785328,
"step": 113
},
{
"epoch": 0.7337087691069992,
"grad_norm": 5.675211429595947,
"learning_rate": 9.500000000000001e-07,
"loss": 0.1786,
"num_input_tokens_seen": 792592,
"step": 114
},
{
"epoch": 0.7401448109412712,
"grad_norm": 14.814651489257812,
"learning_rate": 9.583333333333334e-07,
"loss": 0.1879,
"num_input_tokens_seen": 799808,
"step": 115
},
{
"epoch": 0.7465808527755431,
"grad_norm": 13.106173515319824,
"learning_rate": 9.666666666666668e-07,
"loss": 0.173,
"num_input_tokens_seen": 806896,
"step": 116
},
{
"epoch": 0.7530168946098149,
"grad_norm": 24.56918716430664,
"learning_rate": 9.750000000000002e-07,
"loss": 0.1714,
"num_input_tokens_seen": 813536,
"step": 117
},
{
"epoch": 0.7594529364440868,
"grad_norm": 27.256954193115234,
"learning_rate": 9.833333333333334e-07,
"loss": 0.2015,
"num_input_tokens_seen": 820608,
"step": 118
},
{
"epoch": 0.7658889782783588,
"grad_norm": 4.209413051605225,
"learning_rate": 9.916666666666668e-07,
"loss": 0.1847,
"num_input_tokens_seen": 827776,
"step": 119
},
{
"epoch": 0.7723250201126307,
"grad_norm": 18.684349060058594,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.1876,
"num_input_tokens_seen": 834704,
"step": 120
},
{
"epoch": 0.7787610619469026,
"grad_norm": 19.470041275024414,
"learning_rate": 1.0083333333333333e-06,
"loss": 0.1937,
"num_input_tokens_seen": 841568,
"step": 121
},
{
"epoch": 0.7851971037811746,
"grad_norm": 11.242873191833496,
"learning_rate": 1.0166666666666667e-06,
"loss": 0.1974,
"num_input_tokens_seen": 848704,
"step": 122
},
{
"epoch": 0.7916331456154465,
"grad_norm": 26.72730255126953,
"learning_rate": 1.025e-06,
"loss": 0.2099,
"num_input_tokens_seen": 855664,
"step": 123
},
{
"epoch": 0.7980691874497184,
"grad_norm": 41.4288215637207,
"learning_rate": 1.0333333333333333e-06,
"loss": 0.2239,
"num_input_tokens_seen": 862464,
"step": 124
},
{
"epoch": 0.8045052292839904,
"grad_norm": 27.283327102661133,
"learning_rate": 1.0416666666666667e-06,
"loss": 0.1953,
"num_input_tokens_seen": 869376,
"step": 125
},
{
"epoch": 0.8109412711182623,
"grad_norm": 4.882501602172852,
"learning_rate": 1.0500000000000001e-06,
"loss": 0.1906,
"num_input_tokens_seen": 876848,
"step": 126
},
{
"epoch": 0.8173773129525342,
"grad_norm": 8.478296279907227,
"learning_rate": 1.0583333333333335e-06,
"loss": 0.1852,
"num_input_tokens_seen": 883664,
"step": 127
},
{
"epoch": 0.8238133547868061,
"grad_norm": 6.773479461669922,
"learning_rate": 1.066666666666667e-06,
"loss": 0.198,
"num_input_tokens_seen": 890592,
"step": 128
},
{
"epoch": 0.830249396621078,
"grad_norm": 21.877212524414062,
"learning_rate": 1.075e-06,
"loss": 0.2105,
"num_input_tokens_seen": 898048,
"step": 129
},
{
"epoch": 0.8366854384553499,
"grad_norm": 12.123941421508789,
"learning_rate": 1.0833333333333335e-06,
"loss": 0.1899,
"num_input_tokens_seen": 905040,
"step": 130
},
{
"epoch": 0.8431214802896219,
"grad_norm": 15.84151554107666,
"learning_rate": 1.0916666666666667e-06,
"loss": 0.1742,
"num_input_tokens_seen": 912080,
"step": 131
},
{
"epoch": 0.8495575221238938,
"grad_norm": 8.174356460571289,
"learning_rate": 1.1e-06,
"loss": 0.1585,
"num_input_tokens_seen": 919424,
"step": 132
},
{
"epoch": 0.8559935639581657,
"grad_norm": 14.87348461151123,
"learning_rate": 1.1083333333333335e-06,
"loss": 0.1878,
"num_input_tokens_seen": 926608,
"step": 133
},
{
"epoch": 0.8624296057924377,
"grad_norm": 11.989315032958984,
"learning_rate": 1.1166666666666666e-06,
"loss": 0.1748,
"num_input_tokens_seen": 933712,
"step": 134
},
{
"epoch": 0.8688656476267096,
"grad_norm": 9.659666061401367,
"learning_rate": 1.125e-06,
"loss": 0.1944,
"num_input_tokens_seen": 940304,
"step": 135
},
{
"epoch": 0.8753016894609815,
"grad_norm": 20.558237075805664,
"learning_rate": 1.1333333333333334e-06,
"loss": 0.1727,
"num_input_tokens_seen": 947008,
"step": 136
},
{
"epoch": 0.8817377312952535,
"grad_norm": 8.66232967376709,
"learning_rate": 1.1416666666666668e-06,
"loss": 0.1748,
"num_input_tokens_seen": 954112,
"step": 137
},
{
"epoch": 0.8881737731295254,
"grad_norm": 16.516559600830078,
"learning_rate": 1.1500000000000002e-06,
"loss": 0.1625,
"num_input_tokens_seen": 961120,
"step": 138
},
{
"epoch": 0.8946098149637972,
"grad_norm": 6.140871047973633,
"learning_rate": 1.1583333333333334e-06,
"loss": 0.1649,
"num_input_tokens_seen": 967792,
"step": 139
},
{
"epoch": 0.9010458567980691,
"grad_norm": 11.593804359436035,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.1738,
"num_input_tokens_seen": 974496,
"step": 140
},
{
"epoch": 0.9074818986323411,
"grad_norm": 26.92620849609375,
"learning_rate": 1.175e-06,
"loss": 0.2221,
"num_input_tokens_seen": 981344,
"step": 141
},
{
"epoch": 0.913917940466613,
"grad_norm": 26.845230102539062,
"learning_rate": 1.1833333333333334e-06,
"loss": 0.1989,
"num_input_tokens_seen": 988224,
"step": 142
},
{
"epoch": 0.9203539823008849,
"grad_norm": 12.823030471801758,
"learning_rate": 1.1916666666666668e-06,
"loss": 0.1569,
"num_input_tokens_seen": 995552,
"step": 143
},
{
"epoch": 0.9267900241351569,
"grad_norm": 14.508877754211426,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.1594,
"num_input_tokens_seen": 1002224,
"step": 144
},
{
"epoch": 0.9332260659694288,
"grad_norm": 13.097854614257812,
"learning_rate": 1.2083333333333333e-06,
"loss": 0.1609,
"num_input_tokens_seen": 1009312,
"step": 145
},
{
"epoch": 0.9396621078037007,
"grad_norm": 12.183431625366211,
"learning_rate": 1.2166666666666667e-06,
"loss": 0.1649,
"num_input_tokens_seen": 1016256,
"step": 146
},
{
"epoch": 0.9460981496379727,
"grad_norm": 10.628469467163086,
"learning_rate": 1.2250000000000001e-06,
"loss": 0.1412,
"num_input_tokens_seen": 1022880,
"step": 147
},
{
"epoch": 0.9525341914722446,
"grad_norm": 11.713327407836914,
"learning_rate": 1.2333333333333335e-06,
"loss": 0.165,
"num_input_tokens_seen": 1029856,
"step": 148
},
{
"epoch": 0.9589702333065165,
"grad_norm": 10.031126976013184,
"learning_rate": 1.2416666666666667e-06,
"loss": 0.1971,
"num_input_tokens_seen": 1036928,
"step": 149
},
{
"epoch": 0.9654062751407884,
"grad_norm": 34.122074127197266,
"learning_rate": 1.25e-06,
"loss": 0.1843,
"num_input_tokens_seen": 1044000,
"step": 150
},
{
"epoch": 0.9718423169750603,
"grad_norm": 13.707520484924316,
"learning_rate": 1.2583333333333333e-06,
"loss": 0.1628,
"num_input_tokens_seen": 1050928,
"step": 151
},
{
"epoch": 0.9782783588093322,
"grad_norm": 8.588343620300293,
"learning_rate": 1.2666666666666669e-06,
"loss": 0.1878,
"num_input_tokens_seen": 1057920,
"step": 152
},
{
"epoch": 0.9847144006436042,
"grad_norm": 4.411599159240723,
"learning_rate": 1.275e-06,
"loss": 0.1153,
"num_input_tokens_seen": 1064704,
"step": 153
},
{
"epoch": 0.9911504424778761,
"grad_norm": 13.095698356628418,
"learning_rate": 1.2833333333333335e-06,
"loss": 0.1622,
"num_input_tokens_seen": 1071760,
"step": 154
},
{
"epoch": 0.997586484312148,
"grad_norm": 14.093315124511719,
"learning_rate": 1.2916666666666669e-06,
"loss": 0.1549,
"num_input_tokens_seen": 1078912,
"step": 155
},
{
"epoch": 1.00402252614642,
"grad_norm": 17.082075119018555,
"learning_rate": 1.3e-06,
"loss": 0.1729,
"num_input_tokens_seen": 1086288,
"step": 156
},
{
"epoch": 1.010458567980692,
"grad_norm": 4.992012977600098,
"learning_rate": 1.3083333333333334e-06,
"loss": 0.1198,
"num_input_tokens_seen": 1093584,
"step": 157
},
{
"epoch": 1.0168946098149638,
"grad_norm": 5.45336389541626,
"learning_rate": 1.3166666666666666e-06,
"loss": 0.1723,
"num_input_tokens_seen": 1100432,
"step": 158
},
{
"epoch": 1.0233306516492358,
"grad_norm": 7.4880757331848145,
"learning_rate": 1.3250000000000002e-06,
"loss": 0.1485,
"num_input_tokens_seen": 1107280,
"step": 159
},
{
"epoch": 1.0297666934835077,
"grad_norm": 40.28890609741211,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.1757,
"num_input_tokens_seen": 1113968,
"step": 160
},
{
"epoch": 1.0362027353177796,
"grad_norm": 39.24993896484375,
"learning_rate": 1.3416666666666666e-06,
"loss": 0.1907,
"num_input_tokens_seen": 1120752,
"step": 161
},
{
"epoch": 1.0426387771520516,
"grad_norm": 5.63855504989624,
"learning_rate": 1.3500000000000002e-06,
"loss": 0.1842,
"num_input_tokens_seen": 1127712,
"step": 162
},
{
"epoch": 1.0490748189863235,
"grad_norm": 5.1802754402160645,
"learning_rate": 1.3583333333333334e-06,
"loss": 0.1549,
"num_input_tokens_seen": 1134592,
"step": 163
},
{
"epoch": 1.0555108608205954,
"grad_norm": 4.200067043304443,
"learning_rate": 1.3666666666666668e-06,
"loss": 0.153,
"num_input_tokens_seen": 1141888,
"step": 164
},
{
"epoch": 1.0619469026548674,
"grad_norm": 6.892277240753174,
"learning_rate": 1.3750000000000002e-06,
"loss": 0.1532,
"num_input_tokens_seen": 1148688,
"step": 165
},
{
"epoch": 1.068382944489139,
"grad_norm": 11.852892875671387,
"learning_rate": 1.3833333333333336e-06,
"loss": 0.1629,
"num_input_tokens_seen": 1155552,
"step": 166
},
{
"epoch": 1.074818986323411,
"grad_norm": 8.346076011657715,
"learning_rate": 1.3916666666666668e-06,
"loss": 0.1708,
"num_input_tokens_seen": 1162624,
"step": 167
},
{
"epoch": 1.081255028157683,
"grad_norm": 7.836976528167725,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.1461,
"num_input_tokens_seen": 1169904,
"step": 168
},
{
"epoch": 1.0876910699919549,
"grad_norm": 15.59913158416748,
"learning_rate": 1.4083333333333335e-06,
"loss": 0.1402,
"num_input_tokens_seen": 1176928,
"step": 169
},
{
"epoch": 1.0941271118262268,
"grad_norm": 8.46536636352539,
"learning_rate": 1.4166666666666667e-06,
"loss": 0.143,
"num_input_tokens_seen": 1184160,
"step": 170
},
{
"epoch": 1.1005631536604987,
"grad_norm": 7.491546154022217,
"learning_rate": 1.425e-06,
"loss": 0.1454,
"num_input_tokens_seen": 1191120,
"step": 171
},
{
"epoch": 1.1069991954947707,
"grad_norm": 16.70829200744629,
"learning_rate": 1.4333333333333335e-06,
"loss": 0.1286,
"num_input_tokens_seen": 1197920,
"step": 172
},
{
"epoch": 1.1134352373290426,
"grad_norm": 16.273927688598633,
"learning_rate": 1.4416666666666667e-06,
"loss": 0.1523,
"num_input_tokens_seen": 1204576,
"step": 173
},
{
"epoch": 1.1198712791633145,
"grad_norm": 8.122928619384766,
"learning_rate": 1.45e-06,
"loss": 0.1345,
"num_input_tokens_seen": 1211344,
"step": 174
},
{
"epoch": 1.1263073209975865,
"grad_norm": 27.850522994995117,
"learning_rate": 1.4583333333333335e-06,
"loss": 0.1749,
"num_input_tokens_seen": 1218432,
"step": 175
},
{
"epoch": 1.1327433628318584,
"grad_norm": 30.498666763305664,
"learning_rate": 1.4666666666666669e-06,
"loss": 0.166,
"num_input_tokens_seen": 1225728,
"step": 176
},
{
"epoch": 1.1391794046661303,
"grad_norm": 26.916791915893555,
"learning_rate": 1.475e-06,
"loss": 0.1708,
"num_input_tokens_seen": 1232784,
"step": 177
},
{
"epoch": 1.1456154465004023,
"grad_norm": 13.593954086303711,
"learning_rate": 1.4833333333333337e-06,
"loss": 0.1363,
"num_input_tokens_seen": 1239472,
"step": 178
},
{
"epoch": 1.1520514883346742,
"grad_norm": 17.63590431213379,
"learning_rate": 1.4916666666666669e-06,
"loss": 0.1369,
"num_input_tokens_seen": 1246864,
"step": 179
},
{
"epoch": 1.1584875301689461,
"grad_norm": 12.465302467346191,
"learning_rate": 1.5e-06,
"loss": 0.1632,
"num_input_tokens_seen": 1253936,
"step": 180
},
{
"epoch": 1.164923572003218,
"grad_norm": 18.099266052246094,
"learning_rate": 1.5083333333333336e-06,
"loss": 0.1734,
"num_input_tokens_seen": 1261120,
"step": 181
},
{
"epoch": 1.17135961383749,
"grad_norm": 12.134090423583984,
"learning_rate": 1.5166666666666668e-06,
"loss": 0.135,
"num_input_tokens_seen": 1268208,
"step": 182
},
{
"epoch": 1.177795655671762,
"grad_norm": 5.747508525848389,
"learning_rate": 1.525e-06,
"loss": 0.1355,
"num_input_tokens_seen": 1275296,
"step": 183
},
{
"epoch": 1.1842316975060339,
"grad_norm": 16.193449020385742,
"learning_rate": 1.5333333333333334e-06,
"loss": 0.1324,
"num_input_tokens_seen": 1282320,
"step": 184
},
{
"epoch": 1.1906677393403058,
"grad_norm": 23.576427459716797,
"learning_rate": 1.5416666666666668e-06,
"loss": 0.1754,
"num_input_tokens_seen": 1289008,
"step": 185
},
{
"epoch": 1.1971037811745777,
"grad_norm": 4.542221546173096,
"learning_rate": 1.5500000000000002e-06,
"loss": 0.1484,
"num_input_tokens_seen": 1296208,
"step": 186
},
{
"epoch": 1.2035398230088497,
"grad_norm": 6.084584712982178,
"learning_rate": 1.5583333333333334e-06,
"loss": 0.1315,
"num_input_tokens_seen": 1303072,
"step": 187
},
{
"epoch": 1.2099758648431216,
"grad_norm": 18.8467960357666,
"learning_rate": 1.566666666666667e-06,
"loss": 0.1665,
"num_input_tokens_seen": 1310320,
"step": 188
},
{
"epoch": 1.2164119066773935,
"grad_norm": 6.79512882232666,
"learning_rate": 1.5750000000000002e-06,
"loss": 0.1406,
"num_input_tokens_seen": 1317728,
"step": 189
},
{
"epoch": 1.2228479485116655,
"grad_norm": 11.130036354064941,
"learning_rate": 1.5833333333333333e-06,
"loss": 0.1391,
"num_input_tokens_seen": 1325216,
"step": 190
},
{
"epoch": 1.2292839903459372,
"grad_norm": 17.00998306274414,
"learning_rate": 1.591666666666667e-06,
"loss": 0.1339,
"num_input_tokens_seen": 1332272,
"step": 191
},
{
"epoch": 1.235720032180209,
"grad_norm": 16.623762130737305,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.1613,
"num_input_tokens_seen": 1339008,
"step": 192
},
{
"epoch": 1.242156074014481,
"grad_norm": 15.660219192504883,
"learning_rate": 1.6083333333333333e-06,
"loss": 0.1274,
"num_input_tokens_seen": 1345664,
"step": 193
},
{
"epoch": 1.248592115848753,
"grad_norm": 21.379770278930664,
"learning_rate": 1.6166666666666667e-06,
"loss": 0.1882,
"num_input_tokens_seen": 1352720,
"step": 194
},
{
"epoch": 1.255028157683025,
"grad_norm": 8.196439743041992,
"learning_rate": 1.6250000000000001e-06,
"loss": 0.1106,
"num_input_tokens_seen": 1359616,
"step": 195
},
{
"epoch": 1.2614641995172968,
"grad_norm": 4.444194793701172,
"learning_rate": 1.6333333333333335e-06,
"loss": 0.1249,
"num_input_tokens_seen": 1366656,
"step": 196
},
{
"epoch": 1.2679002413515688,
"grad_norm": 10.585016250610352,
"learning_rate": 1.6416666666666667e-06,
"loss": 0.1499,
"num_input_tokens_seen": 1373904,
"step": 197
},
{
"epoch": 1.2743362831858407,
"grad_norm": 18.406293869018555,
"learning_rate": 1.6500000000000003e-06,
"loss": 0.1512,
"num_input_tokens_seen": 1380528,
"step": 198
},
{
"epoch": 1.2807723250201126,
"grad_norm": 5.323694229125977,
"learning_rate": 1.6583333333333335e-06,
"loss": 0.1166,
"num_input_tokens_seen": 1386912,
"step": 199
},
{
"epoch": 1.2872083668543846,
"grad_norm": 20.726289749145508,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.21,
"num_input_tokens_seen": 1393648,
"step": 200
},
{
"epoch": 1.2936444086886565,
"grad_norm": 24.05786895751953,
"learning_rate": 1.6750000000000003e-06,
"loss": 0.1915,
"num_input_tokens_seen": 1400640,
"step": 201
},
{
"epoch": 1.3000804505229284,
"grad_norm": 19.30237579345703,
"learning_rate": 1.6833333333333335e-06,
"loss": 0.1911,
"num_input_tokens_seen": 1407984,
"step": 202
},
{
"epoch": 1.3065164923572004,
"grad_norm": 6.517977714538574,
"learning_rate": 1.6916666666666666e-06,
"loss": 0.1487,
"num_input_tokens_seen": 1414672,
"step": 203
},
{
"epoch": 1.3129525341914723,
"grad_norm": 30.81540870666504,
"learning_rate": 1.7000000000000002e-06,
"loss": 0.2154,
"num_input_tokens_seen": 1421872,
"step": 204
},
{
"epoch": 1.3193885760257442,
"grad_norm": 44.00107955932617,
"learning_rate": 1.7083333333333334e-06,
"loss": 0.2909,
"num_input_tokens_seen": 1428640,
"step": 205
},
{
"epoch": 1.3258246178600162,
"grad_norm": 41.464210510253906,
"learning_rate": 1.7166666666666668e-06,
"loss": 0.271,
"num_input_tokens_seen": 1435456,
"step": 206
},
{
"epoch": 1.332260659694288,
"grad_norm": 12.14904499053955,
"learning_rate": 1.725e-06,
"loss": 0.1616,
"num_input_tokens_seen": 1442592,
"step": 207
},
{
"epoch": 1.33869670152856,
"grad_norm": 8.393083572387695,
"learning_rate": 1.7333333333333336e-06,
"loss": 0.1427,
"num_input_tokens_seen": 1449200,
"step": 208
},
{
"epoch": 1.3451327433628317,
"grad_norm": 11.04562759399414,
"learning_rate": 1.7416666666666668e-06,
"loss": 0.1602,
"num_input_tokens_seen": 1455920,
"step": 209
},
{
"epoch": 1.3515687851971037,
"grad_norm": 12.494465827941895,
"learning_rate": 1.75e-06,
"loss": 0.169,
"num_input_tokens_seen": 1462624,
"step": 210
},
{
"epoch": 1.3580048270313756,
"grad_norm": 5.395782470703125,
"learning_rate": 1.7583333333333336e-06,
"loss": 0.1285,
"num_input_tokens_seen": 1469520,
"step": 211
},
{
"epoch": 1.3644408688656475,
"grad_norm": 19.773469924926758,
"learning_rate": 1.7666666666666668e-06,
"loss": 0.1636,
"num_input_tokens_seen": 1476592,
"step": 212
},
{
"epoch": 1.3708769106999195,
"grad_norm": 28.318584442138672,
"learning_rate": 1.7750000000000002e-06,
"loss": 0.1702,
"num_input_tokens_seen": 1483632,
"step": 213
},
{
"epoch": 1.3773129525341914,
"grad_norm": 20.225502014160156,
"learning_rate": 1.7833333333333336e-06,
"loss": 0.1562,
"num_input_tokens_seen": 1490528,
"step": 214
},
{
"epoch": 1.3837489943684633,
"grad_norm": 5.386298179626465,
"learning_rate": 1.7916666666666667e-06,
"loss": 0.1537,
"num_input_tokens_seen": 1497648,
"step": 215
},
{
"epoch": 1.3901850362027353,
"grad_norm": 6.181918144226074,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.1114,
"num_input_tokens_seen": 1504800,
"step": 216
},
{
"epoch": 1.3966210780370072,
"grad_norm": 5.554294109344482,
"learning_rate": 1.8083333333333335e-06,
"loss": 0.1017,
"num_input_tokens_seen": 1512240,
"step": 217
},
{
"epoch": 1.4030571198712791,
"grad_norm": 5.2657880783081055,
"learning_rate": 1.816666666666667e-06,
"loss": 0.1184,
"num_input_tokens_seen": 1519200,
"step": 218
},
{
"epoch": 1.409493161705551,
"grad_norm": 8.627300262451172,
"learning_rate": 1.825e-06,
"loss": 0.1343,
"num_input_tokens_seen": 1526272,
"step": 219
},
{
"epoch": 1.415929203539823,
"grad_norm": 7.965896129608154,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.1271,
"num_input_tokens_seen": 1533440,
"step": 220
},
{
"epoch": 1.422365245374095,
"grad_norm": 7.089397430419922,
"learning_rate": 1.8416666666666669e-06,
"loss": 0.1383,
"num_input_tokens_seen": 1540272,
"step": 221
},
{
"epoch": 1.4288012872083669,
"grad_norm": 4.354486465454102,
"learning_rate": 1.85e-06,
"loss": 0.1558,
"num_input_tokens_seen": 1547632,
"step": 222
},
{
"epoch": 1.4352373290426388,
"grad_norm": 7.841838836669922,
"learning_rate": 1.8583333333333335e-06,
"loss": 0.1312,
"num_input_tokens_seen": 1554608,
"step": 223
},
{
"epoch": 1.4416733708769107,
"grad_norm": 6.812905311584473,
"learning_rate": 1.8666666666666669e-06,
"loss": 0.1212,
"num_input_tokens_seen": 1561472,
"step": 224
},
{
"epoch": 1.4481094127111827,
"grad_norm": 5.038280963897705,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.1342,
"num_input_tokens_seen": 1568496,
"step": 225
},
{
"epoch": 1.4545454545454546,
"grad_norm": 4.255394458770752,
"learning_rate": 1.8833333333333334e-06,
"loss": 0.096,
"num_input_tokens_seen": 1575184,
"step": 226
},
{
"epoch": 1.4609814963797265,
"grad_norm": 3.311915397644043,
"learning_rate": 1.8916666666666668e-06,
"loss": 0.0982,
"num_input_tokens_seen": 1582080,
"step": 227
},
{
"epoch": 1.4674175382139985,
"grad_norm": 4.303693771362305,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.1099,
"num_input_tokens_seen": 1588688,
"step": 228
},
{
"epoch": 1.4738535800482704,
"grad_norm": 14.854019165039062,
"learning_rate": 1.9083333333333334e-06,
"loss": 0.1265,
"num_input_tokens_seen": 1595216,
"step": 229
},
{
"epoch": 1.4802896218825423,
"grad_norm": 10.509958267211914,
"learning_rate": 1.916666666666667e-06,
"loss": 0.1066,
"num_input_tokens_seen": 1602336,
"step": 230
},
{
"epoch": 1.4867256637168142,
"grad_norm": 9.096975326538086,
"learning_rate": 1.925e-06,
"loss": 0.1593,
"num_input_tokens_seen": 1609024,
"step": 231
},
{
"epoch": 1.4931617055510862,
"grad_norm": 18.944650650024414,
"learning_rate": 1.9333333333333336e-06,
"loss": 0.1891,
"num_input_tokens_seen": 1615712,
"step": 232
},
{
"epoch": 1.4995977473853581,
"grad_norm": 6.735738754272461,
"learning_rate": 1.9416666666666666e-06,
"loss": 0.0867,
"num_input_tokens_seen": 1622608,
"step": 233
},
{
"epoch": 1.50603378921963,
"grad_norm": 12.395522117614746,
"learning_rate": 1.9500000000000004e-06,
"loss": 0.1286,
"num_input_tokens_seen": 1629520,
"step": 234
},
{
"epoch": 1.512469831053902,
"grad_norm": 13.864114761352539,
"learning_rate": 1.9583333333333334e-06,
"loss": 0.1262,
"num_input_tokens_seen": 1636320,
"step": 235
},
{
"epoch": 1.518905872888174,
"grad_norm": 4.206810474395752,
"learning_rate": 1.9666666666666668e-06,
"loss": 0.0878,
"num_input_tokens_seen": 1643216,
"step": 236
},
{
"epoch": 1.5253419147224458,
"grad_norm": 9.294787406921387,
"learning_rate": 1.975e-06,
"loss": 0.1532,
"num_input_tokens_seen": 1650256,
"step": 237
},
{
"epoch": 1.5317779565567178,
"grad_norm": 5.397519111633301,
"learning_rate": 1.9833333333333335e-06,
"loss": 0.1232,
"num_input_tokens_seen": 1657328,
"step": 238
},
{
"epoch": 1.5382139983909895,
"grad_norm": 4.74614953994751,
"learning_rate": 1.991666666666667e-06,
"loss": 0.1119,
"num_input_tokens_seen": 1664192,
"step": 239
},
{
"epoch": 1.5446500402252614,
"grad_norm": 8.80385971069336,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.1334,
"num_input_tokens_seen": 1670944,
"step": 240
},
{
"epoch": 1.5510860820595334,
"grad_norm": 12.17174243927002,
"learning_rate": 2.0083333333333337e-06,
"loss": 0.1224,
"num_input_tokens_seen": 1677792,
"step": 241
},
{
"epoch": 1.5575221238938053,
"grad_norm": 6.9399800300598145,
"learning_rate": 2.0166666666666667e-06,
"loss": 0.106,
"num_input_tokens_seen": 1684640,
"step": 242
},
{
"epoch": 1.5639581657280772,
"grad_norm": 5.804976463317871,
"learning_rate": 2.025e-06,
"loss": 0.1237,
"num_input_tokens_seen": 1691664,
"step": 243
},
{
"epoch": 1.5703942075623492,
"grad_norm": 5.245293617248535,
"learning_rate": 2.0333333333333335e-06,
"loss": 0.095,
"num_input_tokens_seen": 1698528,
"step": 244
},
{
"epoch": 1.576830249396621,
"grad_norm": 2.9305763244628906,
"learning_rate": 2.041666666666667e-06,
"loss": 0.0741,
"num_input_tokens_seen": 1705600,
"step": 245
},
{
"epoch": 1.583266291230893,
"grad_norm": 10.269381523132324,
"learning_rate": 2.05e-06,
"loss": 0.1239,
"num_input_tokens_seen": 1712704,
"step": 246
},
{
"epoch": 1.589702333065165,
"grad_norm": 4.453558921813965,
"learning_rate": 2.0583333333333337e-06,
"loss": 0.091,
"num_input_tokens_seen": 1719568,
"step": 247
},
{
"epoch": 1.5961383748994369,
"grad_norm": 16.549911499023438,
"learning_rate": 2.0666666666666666e-06,
"loss": 0.1403,
"num_input_tokens_seen": 1726480,
"step": 248
},
{
"epoch": 1.6025744167337088,
"grad_norm": 17.650426864624023,
"learning_rate": 2.075e-06,
"loss": 0.1638,
"num_input_tokens_seen": 1733936,
"step": 249
},
{
"epoch": 1.6090104585679805,
"grad_norm": 5.322378158569336,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.1343,
"num_input_tokens_seen": 1741008,
"step": 250
},
{
"epoch": 1.6154465004022525,
"grad_norm": 11.570721626281738,
"learning_rate": 2.091666666666667e-06,
"loss": 0.1558,
"num_input_tokens_seen": 1748240,
"step": 251
},
{
"epoch": 1.6218825422365244,
"grad_norm": 2.901578426361084,
"learning_rate": 2.1000000000000002e-06,
"loss": 0.0809,
"num_input_tokens_seen": 1755072,
"step": 252
},
{
"epoch": 1.6283185840707963,
"grad_norm": 8.972208023071289,
"learning_rate": 2.1083333333333336e-06,
"loss": 0.1435,
"num_input_tokens_seen": 1762048,
"step": 253
},
{
"epoch": 1.6347546259050683,
"grad_norm": 2.364783525466919,
"learning_rate": 2.116666666666667e-06,
"loss": 0.0887,
"num_input_tokens_seen": 1769200,
"step": 254
},
{
"epoch": 1.6411906677393402,
"grad_norm": 3.7692675590515137,
"learning_rate": 2.125e-06,
"loss": 0.1038,
"num_input_tokens_seen": 1776112,
"step": 255
},
{
"epoch": 1.6476267095736121,
"grad_norm": 3.0572264194488525,
"learning_rate": 2.133333333333334e-06,
"loss": 0.0889,
"num_input_tokens_seen": 1783664,
"step": 256
},
{
"epoch": 1.654062751407884,
"grad_norm": 3.8316140174865723,
"learning_rate": 2.1416666666666668e-06,
"loss": 0.0751,
"num_input_tokens_seen": 1790096,
"step": 257
},
{
"epoch": 1.660498793242156,
"grad_norm": 5.133974552154541,
"learning_rate": 2.15e-06,
"loss": 0.0921,
"num_input_tokens_seen": 1796912,
"step": 258
},
{
"epoch": 1.666934835076428,
"grad_norm": 5.002286911010742,
"learning_rate": 2.1583333333333336e-06,
"loss": 0.1102,
"num_input_tokens_seen": 1804144,
"step": 259
},
{
"epoch": 1.6733708769106999,
"grad_norm": 8.221644401550293,
"learning_rate": 2.166666666666667e-06,
"loss": 0.1036,
"num_input_tokens_seen": 1811040,
"step": 260
},
{
"epoch": 1.6798069187449718,
"grad_norm": 6.029963493347168,
"learning_rate": 2.1750000000000004e-06,
"loss": 0.1093,
"num_input_tokens_seen": 1818064,
"step": 261
},
{
"epoch": 1.6862429605792437,
"grad_norm": 6.715224742889404,
"learning_rate": 2.1833333333333333e-06,
"loss": 0.1714,
"num_input_tokens_seen": 1825056,
"step": 262
},
{
"epoch": 1.6926790024135157,
"grad_norm": 6.136181354522705,
"learning_rate": 2.191666666666667e-06,
"loss": 0.1007,
"num_input_tokens_seen": 1831968,
"step": 263
},
{
"epoch": 1.6991150442477876,
"grad_norm": 5.392821788787842,
"learning_rate": 2.2e-06,
"loss": 0.109,
"num_input_tokens_seen": 1838656,
"step": 264
},
{
"epoch": 1.7055510860820595,
"grad_norm": 3.0743072032928467,
"learning_rate": 2.2083333333333335e-06,
"loss": 0.0574,
"num_input_tokens_seen": 1845760,
"step": 265
},
{
"epoch": 1.7119871279163315,
"grad_norm": 4.986932277679443,
"learning_rate": 2.216666666666667e-06,
"loss": 0.0697,
"num_input_tokens_seen": 1852480,
"step": 266
},
{
"epoch": 1.7184231697506034,
"grad_norm": 3.588496685028076,
"learning_rate": 2.2250000000000003e-06,
"loss": 0.1188,
"num_input_tokens_seen": 1859312,
"step": 267
},
{
"epoch": 1.7248592115848753,
"grad_norm": 3.850637912750244,
"learning_rate": 2.2333333333333333e-06,
"loss": 0.0998,
"num_input_tokens_seen": 1866256,
"step": 268
},
{
"epoch": 1.7312952534191473,
"grad_norm": 10.427441596984863,
"learning_rate": 2.2416666666666667e-06,
"loss": 0.1083,
"num_input_tokens_seen": 1873104,
"step": 269
},
{
"epoch": 1.7377312952534192,
"grad_norm": 6.516834259033203,
"learning_rate": 2.25e-06,
"loss": 0.0749,
"num_input_tokens_seen": 1880192,
"step": 270
},
{
"epoch": 1.7441673370876911,
"grad_norm": 5.243050575256348,
"learning_rate": 2.2583333333333335e-06,
"loss": 0.0771,
"num_input_tokens_seen": 1887008,
"step": 271
},
{
"epoch": 1.750603378921963,
"grad_norm": 3.874545097351074,
"learning_rate": 2.266666666666667e-06,
"loss": 0.0646,
"num_input_tokens_seen": 1894096,
"step": 272
},
{
"epoch": 1.757039420756235,
"grad_norm": 4.2995476722717285,
"learning_rate": 2.2750000000000002e-06,
"loss": 0.1147,
"num_input_tokens_seen": 1901216,
"step": 273
},
{
"epoch": 1.763475462590507,
"grad_norm": 9.720036506652832,
"learning_rate": 2.2833333333333336e-06,
"loss": 0.0917,
"num_input_tokens_seen": 1908160,
"step": 274
},
{
"epoch": 1.7699115044247788,
"grad_norm": 7.985558986663818,
"learning_rate": 2.2916666666666666e-06,
"loss": 0.106,
"num_input_tokens_seen": 1915104,
"step": 275
},
{
"epoch": 1.7763475462590508,
"grad_norm": 4.0768327713012695,
"learning_rate": 2.3000000000000004e-06,
"loss": 0.0849,
"num_input_tokens_seen": 1922128,
"step": 276
},
{
"epoch": 1.7827835880933227,
"grad_norm": 5.870975017547607,
"learning_rate": 2.3083333333333334e-06,
"loss": 0.1074,
"num_input_tokens_seen": 1929200,
"step": 277
},
{
"epoch": 1.7892196299275946,
"grad_norm": 3.490455389022827,
"learning_rate": 2.316666666666667e-06,
"loss": 0.0981,
"num_input_tokens_seen": 1936144,
"step": 278
},
{
"epoch": 1.7956556717618666,
"grad_norm": 4.1171183586120605,
"learning_rate": 2.325e-06,
"loss": 0.1008,
"num_input_tokens_seen": 1943136,
"step": 279
},
{
"epoch": 1.8020917135961385,
"grad_norm": 7.664264678955078,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.1032,
"num_input_tokens_seen": 1950208,
"step": 280
},
{
"epoch": 1.8085277554304104,
"grad_norm": 4.865798473358154,
"learning_rate": 2.341666666666667e-06,
"loss": 0.0711,
"num_input_tokens_seen": 1957056,
"step": 281
},
{
"epoch": 1.8149637972646824,
"grad_norm": 2.5436036586761475,
"learning_rate": 2.35e-06,
"loss": 0.0901,
"num_input_tokens_seen": 1964176,
"step": 282
},
{
"epoch": 1.8213998390989543,
"grad_norm": 6.305140972137451,
"learning_rate": 2.3583333333333338e-06,
"loss": 0.0847,
"num_input_tokens_seen": 1970736,
"step": 283
},
{
"epoch": 1.827835880933226,
"grad_norm": 2.6688449382781982,
"learning_rate": 2.3666666666666667e-06,
"loss": 0.0752,
"num_input_tokens_seen": 1977440,
"step": 284
},
{
"epoch": 1.834271922767498,
"grad_norm": 2.5124077796936035,
"learning_rate": 2.375e-06,
"loss": 0.068,
"num_input_tokens_seen": 1984464,
"step": 285
},
{
"epoch": 1.8407079646017699,
"grad_norm": 6.168980121612549,
"learning_rate": 2.3833333333333335e-06,
"loss": 0.1088,
"num_input_tokens_seen": 1991248,
"step": 286
},
{
"epoch": 1.8471440064360418,
"grad_norm": 5.883851051330566,
"learning_rate": 2.391666666666667e-06,
"loss": 0.1017,
"num_input_tokens_seen": 1998496,
"step": 287
},
{
"epoch": 1.8535800482703138,
"grad_norm": 9.373373985290527,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.13,
"num_input_tokens_seen": 2005552,
"step": 288
},
{
"epoch": 1.8600160901045857,
"grad_norm": 9.111586570739746,
"learning_rate": 2.4083333333333337e-06,
"loss": 0.0998,
"num_input_tokens_seen": 2012272,
"step": 289
},
{
"epoch": 1.8664521319388576,
"grad_norm": 5.353252410888672,
"learning_rate": 2.4166666666666667e-06,
"loss": 0.0779,
"num_input_tokens_seen": 2019056,
"step": 290
},
{
"epoch": 1.8728881737731295,
"grad_norm": 6.586206436157227,
"learning_rate": 2.425e-06,
"loss": 0.0907,
"num_input_tokens_seen": 2025760,
"step": 291
},
{
"epoch": 1.8793242156074015,
"grad_norm": 5.485732555389404,
"learning_rate": 2.4333333333333335e-06,
"loss": 0.0911,
"num_input_tokens_seen": 2032928,
"step": 292
},
{
"epoch": 1.8857602574416734,
"grad_norm": 3.5151724815368652,
"learning_rate": 2.441666666666667e-06,
"loss": 0.0987,
"num_input_tokens_seen": 2039856,
"step": 293
},
{
"epoch": 1.8921962992759453,
"grad_norm": 3.680494546890259,
"learning_rate": 2.4500000000000003e-06,
"loss": 0.1254,
"num_input_tokens_seen": 2046896,
"step": 294
},
{
"epoch": 1.898632341110217,
"grad_norm": 3.302248001098633,
"learning_rate": 2.4583333333333332e-06,
"loss": 0.0494,
"num_input_tokens_seen": 2053600,
"step": 295
},
{
"epoch": 1.905068382944489,
"grad_norm": 3.605039119720459,
"learning_rate": 2.466666666666667e-06,
"loss": 0.1082,
"num_input_tokens_seen": 2060240,
"step": 296
},
{
"epoch": 1.911504424778761,
"grad_norm": 2.6599857807159424,
"learning_rate": 2.475e-06,
"loss": 0.0785,
"num_input_tokens_seen": 2067936,
"step": 297
},
{
"epoch": 1.9179404666130329,
"grad_norm": 7.149720191955566,
"learning_rate": 2.4833333333333334e-06,
"loss": 0.1026,
"num_input_tokens_seen": 2074656,
"step": 298
},
{
"epoch": 1.9243765084473048,
"grad_norm": 4.549108982086182,
"learning_rate": 2.491666666666667e-06,
"loss": 0.0617,
"num_input_tokens_seen": 2081568,
"step": 299
},
{
"epoch": 1.9308125502815767,
"grad_norm": 2.900601625442505,
"learning_rate": 2.5e-06,
"loss": 0.0659,
"num_input_tokens_seen": 2088368,
"step": 300
},
{
"epoch": 1.9372485921158487,
"grad_norm": 6.378200531005859,
"learning_rate": 2.5083333333333336e-06,
"loss": 0.088,
"num_input_tokens_seen": 2095728,
"step": 301
},
{
"epoch": 1.9436846339501206,
"grad_norm": 6.718885898590088,
"learning_rate": 2.5166666666666666e-06,
"loss": 0.0771,
"num_input_tokens_seen": 2103104,
"step": 302
},
{
"epoch": 1.9501206757843925,
"grad_norm": 3.587820291519165,
"learning_rate": 2.5250000000000004e-06,
"loss": 0.0642,
"num_input_tokens_seen": 2110032,
"step": 303
},
{
"epoch": 1.9565567176186645,
"grad_norm": 7.106460094451904,
"learning_rate": 2.5333333333333338e-06,
"loss": 0.0947,
"num_input_tokens_seen": 2117056,
"step": 304
},
{
"epoch": 1.9629927594529364,
"grad_norm": 3.480973243713379,
"learning_rate": 2.5416666666666668e-06,
"loss": 0.0975,
"num_input_tokens_seen": 2123552,
"step": 305
},
{
"epoch": 1.9694288012872083,
"grad_norm": 2.709892511367798,
"learning_rate": 2.55e-06,
"loss": 0.0527,
"num_input_tokens_seen": 2130128,
"step": 306
},
{
"epoch": 1.9758648431214803,
"grad_norm": 3.3756306171417236,
"learning_rate": 2.558333333333334e-06,
"loss": 0.0869,
"num_input_tokens_seen": 2137232,
"step": 307
},
{
"epoch": 1.9823008849557522,
"grad_norm": 6.785555839538574,
"learning_rate": 2.566666666666667e-06,
"loss": 0.0605,
"num_input_tokens_seen": 2143776,
"step": 308
},
{
"epoch": 1.9887369267900241,
"grad_norm": 3.4628372192382812,
"learning_rate": 2.5750000000000003e-06,
"loss": 0.0684,
"num_input_tokens_seen": 2150976,
"step": 309
},
{
"epoch": 1.995172968624296,
"grad_norm": 3.56925892829895,
"learning_rate": 2.5833333333333337e-06,
"loss": 0.0701,
"num_input_tokens_seen": 2158080,
"step": 310
},
{
"epoch": 2.001609010458568,
"grad_norm": 4.06324577331543,
"learning_rate": 2.5916666666666667e-06,
"loss": 0.0699,
"num_input_tokens_seen": 2164992,
"step": 311
},
{
"epoch": 2.00804505229284,
"grad_norm": 7.733395576477051,
"learning_rate": 2.6e-06,
"loss": 0.0949,
"num_input_tokens_seen": 2171952,
"step": 312
},
{
"epoch": 2.014481094127112,
"grad_norm": 7.6149139404296875,
"learning_rate": 2.608333333333333e-06,
"loss": 0.0911,
"num_input_tokens_seen": 2179072,
"step": 313
},
{
"epoch": 2.020917135961384,
"grad_norm": 2.538379192352295,
"learning_rate": 2.616666666666667e-06,
"loss": 0.0615,
"num_input_tokens_seen": 2185872,
"step": 314
},
{
"epoch": 2.0273531777956557,
"grad_norm": 2.5334603786468506,
"learning_rate": 2.6250000000000003e-06,
"loss": 0.0448,
"num_input_tokens_seen": 2192656,
"step": 315
},
{
"epoch": 2.0337892196299276,
"grad_norm": 4.8344340324401855,
"learning_rate": 2.6333333333333332e-06,
"loss": 0.0619,
"num_input_tokens_seen": 2199728,
"step": 316
},
{
"epoch": 2.0402252614641996,
"grad_norm": 4.393861770629883,
"learning_rate": 2.6416666666666666e-06,
"loss": 0.0475,
"num_input_tokens_seen": 2206608,
"step": 317
},
{
"epoch": 2.0466613032984715,
"grad_norm": 2.7922892570495605,
"learning_rate": 2.6500000000000005e-06,
"loss": 0.0438,
"num_input_tokens_seen": 2213856,
"step": 318
},
{
"epoch": 2.0530973451327434,
"grad_norm": 1.5408401489257812,
"learning_rate": 2.6583333333333334e-06,
"loss": 0.0245,
"num_input_tokens_seen": 2220528,
"step": 319
},
{
"epoch": 2.0595333869670154,
"grad_norm": 5.6088433265686035,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0716,
"num_input_tokens_seen": 2227616,
"step": 320
},
{
"epoch": 2.0659694288012873,
"grad_norm": 9.311470985412598,
"learning_rate": 2.6750000000000002e-06,
"loss": 0.1015,
"num_input_tokens_seen": 2234304,
"step": 321
},
{
"epoch": 2.0724054706355592,
"grad_norm": 5.244096279144287,
"learning_rate": 2.683333333333333e-06,
"loss": 0.0753,
"num_input_tokens_seen": 2241088,
"step": 322
},
{
"epoch": 2.078841512469831,
"grad_norm": 3.443998098373413,
"learning_rate": 2.691666666666667e-06,
"loss": 0.0521,
"num_input_tokens_seen": 2247632,
"step": 323
},
{
"epoch": 2.085277554304103,
"grad_norm": 2.4997072219848633,
"learning_rate": 2.7000000000000004e-06,
"loss": 0.0287,
"num_input_tokens_seen": 2254448,
"step": 324
},
{
"epoch": 2.091713596138375,
"grad_norm": 4.817678928375244,
"learning_rate": 2.7083333333333334e-06,
"loss": 0.0471,
"num_input_tokens_seen": 2261424,
"step": 325
},
{
"epoch": 2.098149637972647,
"grad_norm": 6.326369285583496,
"learning_rate": 2.7166666666666668e-06,
"loss": 0.0697,
"num_input_tokens_seen": 2268528,
"step": 326
},
{
"epoch": 2.104585679806919,
"grad_norm": 3.599905490875244,
"learning_rate": 2.7250000000000006e-06,
"loss": 0.0438,
"num_input_tokens_seen": 2275328,
"step": 327
},
{
"epoch": 2.111021721641191,
"grad_norm": 2.8037264347076416,
"learning_rate": 2.7333333333333336e-06,
"loss": 0.0475,
"num_input_tokens_seen": 2282400,
"step": 328
},
{
"epoch": 2.1174577634754628,
"grad_norm": 2.7425622940063477,
"learning_rate": 2.741666666666667e-06,
"loss": 0.0601,
"num_input_tokens_seen": 2289312,
"step": 329
},
{
"epoch": 2.1238938053097347,
"grad_norm": 2.064824342727661,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0355,
"num_input_tokens_seen": 2295824,
"step": 330
},
{
"epoch": 2.1303298471440066,
"grad_norm": 3.695521593093872,
"learning_rate": 2.7583333333333333e-06,
"loss": 0.0515,
"num_input_tokens_seen": 2303024,
"step": 331
},
{
"epoch": 2.136765888978278,
"grad_norm": 3.3290112018585205,
"learning_rate": 2.766666666666667e-06,
"loss": 0.0601,
"num_input_tokens_seen": 2309904,
"step": 332
},
{
"epoch": 2.14320193081255,
"grad_norm": 2.751953363418579,
"learning_rate": 2.7750000000000005e-06,
"loss": 0.0288,
"num_input_tokens_seen": 2316416,
"step": 333
},
{
"epoch": 2.149637972646822,
"grad_norm": 4.679827690124512,
"learning_rate": 2.7833333333333335e-06,
"loss": 0.0563,
"num_input_tokens_seen": 2323088,
"step": 334
},
{
"epoch": 2.156074014481094,
"grad_norm": 9.301896095275879,
"learning_rate": 2.791666666666667e-06,
"loss": 0.1176,
"num_input_tokens_seen": 2329968,
"step": 335
},
{
"epoch": 2.162510056315366,
"grad_norm": 6.16165828704834,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.0965,
"num_input_tokens_seen": 2336656,
"step": 336
},
{
"epoch": 2.168946098149638,
"grad_norm": 2.442518711090088,
"learning_rate": 2.8083333333333333e-06,
"loss": 0.0359,
"num_input_tokens_seen": 2343984,
"step": 337
},
{
"epoch": 2.1753821399839097,
"grad_norm": 3.537282943725586,
"learning_rate": 2.816666666666667e-06,
"loss": 0.0609,
"num_input_tokens_seen": 2350912,
"step": 338
},
{
"epoch": 2.1818181818181817,
"grad_norm": 5.1499223709106445,
"learning_rate": 2.825e-06,
"loss": 0.0768,
"num_input_tokens_seen": 2357680,
"step": 339
},
{
"epoch": 2.1882542236524536,
"grad_norm": 8.193970680236816,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0849,
"num_input_tokens_seen": 2364736,
"step": 340
},
{
"epoch": 2.1946902654867255,
"grad_norm": 2.2035670280456543,
"learning_rate": 2.841666666666667e-06,
"loss": 0.0581,
"num_input_tokens_seen": 2371568,
"step": 341
},
{
"epoch": 2.2011263073209975,
"grad_norm": 2.7924435138702393,
"learning_rate": 2.85e-06,
"loss": 0.046,
"num_input_tokens_seen": 2378384,
"step": 342
},
{
"epoch": 2.2075623491552694,
"grad_norm": 4.6174445152282715,
"learning_rate": 2.8583333333333336e-06,
"loss": 0.0674,
"num_input_tokens_seen": 2385584,
"step": 343
},
{
"epoch": 2.2139983909895413,
"grad_norm": 2.4459989070892334,
"learning_rate": 2.866666666666667e-06,
"loss": 0.0563,
"num_input_tokens_seen": 2392640,
"step": 344
},
{
"epoch": 2.2204344328238133,
"grad_norm": 2.3443846702575684,
"learning_rate": 2.875e-06,
"loss": 0.0621,
"num_input_tokens_seen": 2399936,
"step": 345
},
{
"epoch": 2.226870474658085,
"grad_norm": 2.865879774093628,
"learning_rate": 2.8833333333333334e-06,
"loss": 0.0659,
"num_input_tokens_seen": 2406928,
"step": 346
},
{
"epoch": 2.233306516492357,
"grad_norm": 4.03169059753418,
"learning_rate": 2.8916666666666672e-06,
"loss": 0.039,
"num_input_tokens_seen": 2413888,
"step": 347
},
{
"epoch": 2.239742558326629,
"grad_norm": 1.693605899810791,
"learning_rate": 2.9e-06,
"loss": 0.0239,
"num_input_tokens_seen": 2421104,
"step": 348
},
{
"epoch": 2.246178600160901,
"grad_norm": 2.7058444023132324,
"learning_rate": 2.9083333333333336e-06,
"loss": 0.0521,
"num_input_tokens_seen": 2428128,
"step": 349
},
{
"epoch": 2.252614641995173,
"grad_norm": 3.9503567218780518,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0561,
"num_input_tokens_seen": 2434880,
"step": 350
},
{
"epoch": 2.259050683829445,
"grad_norm": 4.444098472595215,
"learning_rate": 2.925e-06,
"loss": 0.0622,
"num_input_tokens_seen": 2441824,
"step": 351
},
{
"epoch": 2.265486725663717,
"grad_norm": 3.7014055252075195,
"learning_rate": 2.9333333333333338e-06,
"loss": 0.0875,
"num_input_tokens_seen": 2448688,
"step": 352
},
{
"epoch": 2.2719227674979887,
"grad_norm": 4.078037261962891,
"learning_rate": 2.941666666666667e-06,
"loss": 0.0307,
"num_input_tokens_seen": 2455488,
"step": 353
},
{
"epoch": 2.2783588093322606,
"grad_norm": 3.753711700439453,
"learning_rate": 2.95e-06,
"loss": 0.063,
"num_input_tokens_seen": 2462240,
"step": 354
},
{
"epoch": 2.2847948511665326,
"grad_norm": 2.9653706550598145,
"learning_rate": 2.9583333333333335e-06,
"loss": 0.0404,
"num_input_tokens_seen": 2469408,
"step": 355
},
{
"epoch": 2.2912308930008045,
"grad_norm": 3.8090925216674805,
"learning_rate": 2.9666666666666673e-06,
"loss": 0.0759,
"num_input_tokens_seen": 2476240,
"step": 356
},
{
"epoch": 2.2976669348350764,
"grad_norm": 2.4684033393859863,
"learning_rate": 2.9750000000000003e-06,
"loss": 0.0488,
"num_input_tokens_seen": 2482864,
"step": 357
},
{
"epoch": 2.3041029766693484,
"grad_norm": 2.0687243938446045,
"learning_rate": 2.9833333333333337e-06,
"loss": 0.0499,
"num_input_tokens_seen": 2489664,
"step": 358
},
{
"epoch": 2.3105390185036203,
"grad_norm": 3.223965883255005,
"learning_rate": 2.991666666666667e-06,
"loss": 0.0441,
"num_input_tokens_seen": 2496704,
"step": 359
},
{
"epoch": 2.3169750603378922,
"grad_norm": 2.1407270431518555,
"learning_rate": 3e-06,
"loss": 0.0485,
"num_input_tokens_seen": 2503920,
"step": 360
},
{
"epoch": 2.323411102172164,
"grad_norm": 2.632885217666626,
"learning_rate": 3.0083333333333335e-06,
"loss": 0.0674,
"num_input_tokens_seen": 2510544,
"step": 361
},
{
"epoch": 2.329847144006436,
"grad_norm": 3.258030652999878,
"learning_rate": 3.0166666666666673e-06,
"loss": 0.0689,
"num_input_tokens_seen": 2517408,
"step": 362
},
{
"epoch": 2.336283185840708,
"grad_norm": 6.024159908294678,
"learning_rate": 3.0250000000000003e-06,
"loss": 0.0618,
"num_input_tokens_seen": 2524160,
"step": 363
},
{
"epoch": 2.34271922767498,
"grad_norm": 4.7281999588012695,
"learning_rate": 3.0333333333333337e-06,
"loss": 0.0629,
"num_input_tokens_seen": 2531072,
"step": 364
},
{
"epoch": 2.349155269509252,
"grad_norm": 4.178661823272705,
"learning_rate": 3.0416666666666666e-06,
"loss": 0.0499,
"num_input_tokens_seen": 2537920,
"step": 365
},
{
"epoch": 2.355591311343524,
"grad_norm": 1.5715197324752808,
"learning_rate": 3.05e-06,
"loss": 0.0361,
"num_input_tokens_seen": 2544736,
"step": 366
},
{
"epoch": 2.3620273531777958,
"grad_norm": 2.835855722427368,
"learning_rate": 3.058333333333334e-06,
"loss": 0.0471,
"num_input_tokens_seen": 2552016,
"step": 367
},
{
"epoch": 2.3684633950120677,
"grad_norm": 2.870889902114868,
"learning_rate": 3.066666666666667e-06,
"loss": 0.0622,
"num_input_tokens_seen": 2559616,
"step": 368
},
{
"epoch": 2.3748994368463396,
"grad_norm": 1.7411049604415894,
"learning_rate": 3.075e-06,
"loss": 0.0328,
"num_input_tokens_seen": 2566240,
"step": 369
},
{
"epoch": 2.3813354786806116,
"grad_norm": 3.0499918460845947,
"learning_rate": 3.0833333333333336e-06,
"loss": 0.0437,
"num_input_tokens_seen": 2573392,
"step": 370
},
{
"epoch": 2.3877715205148835,
"grad_norm": 4.242414474487305,
"learning_rate": 3.0916666666666666e-06,
"loss": 0.0644,
"num_input_tokens_seen": 2580544,
"step": 371
},
{
"epoch": 2.3942075623491554,
"grad_norm": 2.962906837463379,
"learning_rate": 3.1000000000000004e-06,
"loss": 0.0553,
"num_input_tokens_seen": 2587344,
"step": 372
},
{
"epoch": 2.4006436041834274,
"grad_norm": 4.431301116943359,
"learning_rate": 3.1083333333333338e-06,
"loss": 0.061,
"num_input_tokens_seen": 2594560,
"step": 373
},
{
"epoch": 2.4070796460176993,
"grad_norm": 5.075587272644043,
"learning_rate": 3.1166666666666668e-06,
"loss": 0.0866,
"num_input_tokens_seen": 2601408,
"step": 374
},
{
"epoch": 2.4135156878519712,
"grad_norm": 3.877520799636841,
"learning_rate": 3.125e-06,
"loss": 0.0632,
"num_input_tokens_seen": 2608624,
"step": 375
},
{
"epoch": 2.419951729686243,
"grad_norm": 2.9902503490448,
"learning_rate": 3.133333333333334e-06,
"loss": 0.0395,
"num_input_tokens_seen": 2615456,
"step": 376
},
{
"epoch": 2.426387771520515,
"grad_norm": 3.7800397872924805,
"learning_rate": 3.141666666666667e-06,
"loss": 0.0819,
"num_input_tokens_seen": 2622672,
"step": 377
},
{
"epoch": 2.432823813354787,
"grad_norm": 2.4674911499023438,
"learning_rate": 3.1500000000000003e-06,
"loss": 0.064,
"num_input_tokens_seen": 2629952,
"step": 378
},
{
"epoch": 2.439259855189059,
"grad_norm": 5.3331146240234375,
"learning_rate": 3.1583333333333337e-06,
"loss": 0.0803,
"num_input_tokens_seen": 2637168,
"step": 379
},
{
"epoch": 2.445695897023331,
"grad_norm": 9.950706481933594,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.0798,
"num_input_tokens_seen": 2644144,
"step": 380
},
{
"epoch": 2.4521319388576024,
"grad_norm": 5.1734442710876465,
"learning_rate": 3.175e-06,
"loss": 0.0544,
"num_input_tokens_seen": 2651376,
"step": 381
},
{
"epoch": 2.4585679806918743,
"grad_norm": 2.5671188831329346,
"learning_rate": 3.183333333333334e-06,
"loss": 0.0629,
"num_input_tokens_seen": 2658336,
"step": 382
},
{
"epoch": 2.4650040225261463,
"grad_norm": 4.357182025909424,
"learning_rate": 3.191666666666667e-06,
"loss": 0.0471,
"num_input_tokens_seen": 2665360,
"step": 383
},
{
"epoch": 2.471440064360418,
"grad_norm": 4.694338321685791,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0533,
"num_input_tokens_seen": 2672704,
"step": 384
},
{
"epoch": 2.47787610619469,
"grad_norm": 2.391195774078369,
"learning_rate": 3.2083333333333337e-06,
"loss": 0.0542,
"num_input_tokens_seen": 2679872,
"step": 385
},
{
"epoch": 2.484312148028962,
"grad_norm": 3.859102249145508,
"learning_rate": 3.2166666666666666e-06,
"loss": 0.034,
"num_input_tokens_seen": 2686672,
"step": 386
},
{
"epoch": 2.490748189863234,
"grad_norm": 2.4710166454315186,
"learning_rate": 3.2250000000000005e-06,
"loss": 0.0517,
"num_input_tokens_seen": 2693520,
"step": 387
},
{
"epoch": 2.497184231697506,
"grad_norm": 3.309068202972412,
"learning_rate": 3.2333333333333334e-06,
"loss": 0.0698,
"num_input_tokens_seen": 2700432,
"step": 388
},
{
"epoch": 2.503620273531778,
"grad_norm": 4.21011209487915,
"learning_rate": 3.241666666666667e-06,
"loss": 0.0573,
"num_input_tokens_seen": 2707184,
"step": 389
},
{
"epoch": 2.51005631536605,
"grad_norm": 4.34623908996582,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0568,
"num_input_tokens_seen": 2713936,
"step": 390
},
{
"epoch": 2.5164923572003217,
"grad_norm": 3.361445188522339,
"learning_rate": 3.258333333333333e-06,
"loss": 0.0669,
"num_input_tokens_seen": 2721216,
"step": 391
},
{
"epoch": 2.5229283990345936,
"grad_norm": 2.091728925704956,
"learning_rate": 3.266666666666667e-06,
"loss": 0.027,
"num_input_tokens_seen": 2727968,
"step": 392
},
{
"epoch": 2.5293644408688656,
"grad_norm": 2.1977951526641846,
"learning_rate": 3.2750000000000004e-06,
"loss": 0.0303,
"num_input_tokens_seen": 2734816,
"step": 393
},
{
"epoch": 2.5358004827031375,
"grad_norm": 2.7409942150115967,
"learning_rate": 3.2833333333333334e-06,
"loss": 0.0392,
"num_input_tokens_seen": 2741744,
"step": 394
},
{
"epoch": 2.5422365245374094,
"grad_norm": 3.695770740509033,
"learning_rate": 3.2916666666666668e-06,
"loss": 0.0813,
"num_input_tokens_seen": 2748640,
"step": 395
},
{
"epoch": 2.5486725663716814,
"grad_norm": 3.674891471862793,
"learning_rate": 3.3000000000000006e-06,
"loss": 0.0403,
"num_input_tokens_seen": 2755888,
"step": 396
},
{
"epoch": 2.5551086082059533,
"grad_norm": 1.716131567955017,
"learning_rate": 3.3083333333333336e-06,
"loss": 0.0222,
"num_input_tokens_seen": 2762464,
"step": 397
},
{
"epoch": 2.5615446500402252,
"grad_norm": 2.5081095695495605,
"learning_rate": 3.316666666666667e-06,
"loss": 0.0611,
"num_input_tokens_seen": 2769712,
"step": 398
},
{
"epoch": 2.567980691874497,
"grad_norm": 1.9974850416183472,
"learning_rate": 3.3250000000000004e-06,
"loss": 0.035,
"num_input_tokens_seen": 2776736,
"step": 399
},
{
"epoch": 2.574416733708769,
"grad_norm": 4.233558177947998,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.068,
"num_input_tokens_seen": 2783376,
"step": 400
},
{
"epoch": 2.580852775543041,
"grad_norm": 3.359081983566284,
"learning_rate": 3.341666666666667e-06,
"loss": 0.0543,
"num_input_tokens_seen": 2790528,
"step": 401
},
{
"epoch": 2.587288817377313,
"grad_norm": 2.669712543487549,
"learning_rate": 3.3500000000000005e-06,
"loss": 0.0466,
"num_input_tokens_seen": 2797312,
"step": 402
},
{
"epoch": 2.593724859211585,
"grad_norm": 3.1529603004455566,
"learning_rate": 3.3583333333333335e-06,
"loss": 0.0626,
"num_input_tokens_seen": 2804288,
"step": 403
},
{
"epoch": 2.600160901045857,
"grad_norm": 3.069842576980591,
"learning_rate": 3.366666666666667e-06,
"loss": 0.0589,
"num_input_tokens_seen": 2811456,
"step": 404
},
{
"epoch": 2.6065969428801288,
"grad_norm": 1.881988525390625,
"learning_rate": 3.3750000000000003e-06,
"loss": 0.0415,
"num_input_tokens_seen": 2818080,
"step": 405
},
{
"epoch": 2.6130329847144007,
"grad_norm": 1.862747073173523,
"learning_rate": 3.3833333333333333e-06,
"loss": 0.0344,
"num_input_tokens_seen": 2825136,
"step": 406
},
{
"epoch": 2.6194690265486726,
"grad_norm": 2.6847071647644043,
"learning_rate": 3.391666666666667e-06,
"loss": 0.0423,
"num_input_tokens_seen": 2832400,
"step": 407
},
{
"epoch": 2.6259050683829446,
"grad_norm": 3.631681203842163,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.0838,
"num_input_tokens_seen": 2839712,
"step": 408
},
{
"epoch": 2.6323411102172165,
"grad_norm": 3.7878201007843018,
"learning_rate": 3.4083333333333335e-06,
"loss": 0.0732,
"num_input_tokens_seen": 2846160,
"step": 409
},
{
"epoch": 2.6387771520514884,
"grad_norm": 2.826582431793213,
"learning_rate": 3.416666666666667e-06,
"loss": 0.0464,
"num_input_tokens_seen": 2853520,
"step": 410
},
{
"epoch": 2.6452131938857604,
"grad_norm": 2.330638885498047,
"learning_rate": 3.4250000000000007e-06,
"loss": 0.0387,
"num_input_tokens_seen": 2860384,
"step": 411
},
{
"epoch": 2.6516492357200323,
"grad_norm": 2.330439567565918,
"learning_rate": 3.4333333333333336e-06,
"loss": 0.0507,
"num_input_tokens_seen": 2867360,
"step": 412
},
{
"epoch": 2.6580852775543042,
"grad_norm": 3.929145336151123,
"learning_rate": 3.441666666666667e-06,
"loss": 0.0549,
"num_input_tokens_seen": 2873648,
"step": 413
},
{
"epoch": 2.664521319388576,
"grad_norm": 3.001359224319458,
"learning_rate": 3.45e-06,
"loss": 0.0285,
"num_input_tokens_seen": 2880848,
"step": 414
},
{
"epoch": 2.670957361222848,
"grad_norm": 2.7936651706695557,
"learning_rate": 3.4583333333333334e-06,
"loss": 0.0668,
"num_input_tokens_seen": 2888256,
"step": 415
},
{
"epoch": 2.67739340305712,
"grad_norm": 4.050117015838623,
"learning_rate": 3.4666666666666672e-06,
"loss": 0.0691,
"num_input_tokens_seen": 2895040,
"step": 416
},
{
"epoch": 2.6838294448913915,
"grad_norm": 5.509685516357422,
"learning_rate": 3.475e-06,
"loss": 0.066,
"num_input_tokens_seen": 2902320,
"step": 417
},
{
"epoch": 2.6902654867256635,
"grad_norm": 3.968433380126953,
"learning_rate": 3.4833333333333336e-06,
"loss": 0.0495,
"num_input_tokens_seen": 2908960,
"step": 418
},
{
"epoch": 2.6967015285599354,
"grad_norm": 2.082157611846924,
"learning_rate": 3.491666666666667e-06,
"loss": 0.034,
"num_input_tokens_seen": 2915808,
"step": 419
},
{
"epoch": 2.7031375703942073,
"grad_norm": 2.403968334197998,
"learning_rate": 3.5e-06,
"loss": 0.0604,
"num_input_tokens_seen": 2922608,
"step": 420
},
{
"epoch": 2.7095736122284793,
"grad_norm": 4.667454719543457,
"learning_rate": 3.5083333333333338e-06,
"loss": 0.0535,
"num_input_tokens_seen": 2929728,
"step": 421
},
{
"epoch": 2.716009654062751,
"grad_norm": 2.5968987941741943,
"learning_rate": 3.516666666666667e-06,
"loss": 0.0369,
"num_input_tokens_seen": 2937024,
"step": 422
},
{
"epoch": 2.722445695897023,
"grad_norm": 3.4746780395507812,
"learning_rate": 3.525e-06,
"loss": 0.045,
"num_input_tokens_seen": 2943760,
"step": 423
},
{
"epoch": 2.728881737731295,
"grad_norm": 1.9599398374557495,
"learning_rate": 3.5333333333333335e-06,
"loss": 0.0314,
"num_input_tokens_seen": 2950848,
"step": 424
},
{
"epoch": 2.735317779565567,
"grad_norm": 2.971634864807129,
"learning_rate": 3.5416666666666673e-06,
"loss": 0.0611,
"num_input_tokens_seen": 2957408,
"step": 425
},
{
"epoch": 2.741753821399839,
"grad_norm": 3.1944162845611572,
"learning_rate": 3.5500000000000003e-06,
"loss": 0.0478,
"num_input_tokens_seen": 2964288,
"step": 426
},
{
"epoch": 2.748189863234111,
"grad_norm": 3.3659610748291016,
"learning_rate": 3.5583333333333337e-06,
"loss": 0.038,
"num_input_tokens_seen": 2970912,
"step": 427
},
{
"epoch": 2.754625905068383,
"grad_norm": 2.965097188949585,
"learning_rate": 3.566666666666667e-06,
"loss": 0.043,
"num_input_tokens_seen": 2978032,
"step": 428
},
{
"epoch": 2.7610619469026547,
"grad_norm": 2.4006049633026123,
"learning_rate": 3.575e-06,
"loss": 0.0478,
"num_input_tokens_seen": 2985232,
"step": 429
},
{
"epoch": 2.7674979887369267,
"grad_norm": 3.7348554134368896,
"learning_rate": 3.5833333333333335e-06,
"loss": 0.0977,
"num_input_tokens_seen": 2992240,
"step": 430
},
{
"epoch": 2.7739340305711986,
"grad_norm": 3.1373274326324463,
"learning_rate": 3.5916666666666673e-06,
"loss": 0.0835,
"num_input_tokens_seen": 2999008,
"step": 431
},
{
"epoch": 2.7803700724054705,
"grad_norm": 1.9444302320480347,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.0406,
"num_input_tokens_seen": 3005648,
"step": 432
},
{
"epoch": 2.7868061142397424,
"grad_norm": 1.8665870428085327,
"learning_rate": 3.6083333333333337e-06,
"loss": 0.0661,
"num_input_tokens_seen": 3012224,
"step": 433
},
{
"epoch": 2.7932421560740144,
"grad_norm": 1.9893403053283691,
"learning_rate": 3.616666666666667e-06,
"loss": 0.0647,
"num_input_tokens_seen": 3019104,
"step": 434
},
{
"epoch": 2.7996781979082863,
"grad_norm": 2.656529426574707,
"learning_rate": 3.625e-06,
"loss": 0.0499,
"num_input_tokens_seen": 3026096,
"step": 435
},
{
"epoch": 2.8061142397425582,
"grad_norm": 1.7047683000564575,
"learning_rate": 3.633333333333334e-06,
"loss": 0.0422,
"num_input_tokens_seen": 3032784,
"step": 436
},
{
"epoch": 2.81255028157683,
"grad_norm": 1.6727882623672485,
"learning_rate": 3.6416666666666672e-06,
"loss": 0.048,
"num_input_tokens_seen": 3040096,
"step": 437
},
{
"epoch": 2.818986323411102,
"grad_norm": 4.0175251960754395,
"learning_rate": 3.65e-06,
"loss": 0.0474,
"num_input_tokens_seen": 3046720,
"step": 438
},
{
"epoch": 2.825422365245374,
"grad_norm": 8.139860153198242,
"learning_rate": 3.6583333333333336e-06,
"loss": 0.0801,
"num_input_tokens_seen": 3053712,
"step": 439
},
{
"epoch": 2.831858407079646,
"grad_norm": 3.832087278366089,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0528,
"num_input_tokens_seen": 3060528,
"step": 440
},
{
"epoch": 2.838294448913918,
"grad_norm": 2.881619930267334,
"learning_rate": 3.6750000000000004e-06,
"loss": 0.0461,
"num_input_tokens_seen": 3067440,
"step": 441
},
{
"epoch": 2.84473049074819,
"grad_norm": 4.456245422363281,
"learning_rate": 3.6833333333333338e-06,
"loss": 0.0646,
"num_input_tokens_seen": 3074208,
"step": 442
},
{
"epoch": 2.8511665325824618,
"grad_norm": 5.1570820808410645,
"learning_rate": 3.6916666666666668e-06,
"loss": 0.049,
"num_input_tokens_seen": 3081072,
"step": 443
},
{
"epoch": 2.8576025744167337,
"grad_norm": 2.944526433944702,
"learning_rate": 3.7e-06,
"loss": 0.0531,
"num_input_tokens_seen": 3088240,
"step": 444
},
{
"epoch": 2.8640386162510056,
"grad_norm": 2.021688222885132,
"learning_rate": 3.708333333333334e-06,
"loss": 0.0521,
"num_input_tokens_seen": 3095504,
"step": 445
},
{
"epoch": 2.8704746580852776,
"grad_norm": 6.054248809814453,
"learning_rate": 3.716666666666667e-06,
"loss": 0.0927,
"num_input_tokens_seen": 3102688,
"step": 446
},
{
"epoch": 2.8769106999195495,
"grad_norm": 3.5824503898620605,
"learning_rate": 3.7250000000000003e-06,
"loss": 0.0491,
"num_input_tokens_seen": 3109440,
"step": 447
},
{
"epoch": 2.8833467417538214,
"grad_norm": 2.0240774154663086,
"learning_rate": 3.7333333333333337e-06,
"loss": 0.0399,
"num_input_tokens_seen": 3116720,
"step": 448
},
{
"epoch": 2.8897827835880934,
"grad_norm": 4.0125579833984375,
"learning_rate": 3.7416666666666667e-06,
"loss": 0.0499,
"num_input_tokens_seen": 3123568,
"step": 449
},
{
"epoch": 2.8962188254223653,
"grad_norm": 3.733275890350342,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0569,
"num_input_tokens_seen": 3130768,
"step": 450
},
{
"epoch": 2.9026548672566372,
"grad_norm": 4.261077880859375,
"learning_rate": 3.758333333333334e-06,
"loss": 0.0608,
"num_input_tokens_seen": 3138128,
"step": 451
},
{
"epoch": 2.909090909090909,
"grad_norm": 1.4142907857894897,
"learning_rate": 3.766666666666667e-06,
"loss": 0.0325,
"num_input_tokens_seen": 3145008,
"step": 452
},
{
"epoch": 2.915526950925181,
"grad_norm": 2.610344171524048,
"learning_rate": 3.7750000000000003e-06,
"loss": 0.0643,
"num_input_tokens_seen": 3151792,
"step": 453
},
{
"epoch": 2.921962992759453,
"grad_norm": 2.9687604904174805,
"learning_rate": 3.7833333333333337e-06,
"loss": 0.0479,
"num_input_tokens_seen": 3158800,
"step": 454
},
{
"epoch": 2.928399034593725,
"grad_norm": 2.2706518173217773,
"learning_rate": 3.7916666666666666e-06,
"loss": 0.0549,
"num_input_tokens_seen": 3165744,
"step": 455
},
{
"epoch": 2.934835076427997,
"grad_norm": 3.606792449951172,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.0789,
"num_input_tokens_seen": 3172896,
"step": 456
},
{
"epoch": 2.941271118262269,
"grad_norm": 1.8851637840270996,
"learning_rate": 3.808333333333334e-06,
"loss": 0.0319,
"num_input_tokens_seen": 3179888,
"step": 457
},
{
"epoch": 2.9477071600965408,
"grad_norm": 2.6292834281921387,
"learning_rate": 3.816666666666667e-06,
"loss": 0.05,
"num_input_tokens_seen": 3186960,
"step": 458
},
{
"epoch": 2.9541432019308127,
"grad_norm": 2.099109172821045,
"learning_rate": 3.825000000000001e-06,
"loss": 0.0677,
"num_input_tokens_seen": 3194208,
"step": 459
},
{
"epoch": 2.9605792437650846,
"grad_norm": 2.5214834213256836,
"learning_rate": 3.833333333333334e-06,
"loss": 0.0512,
"num_input_tokens_seen": 3201120,
"step": 460
},
{
"epoch": 2.9670152855993566,
"grad_norm": 6.318456649780273,
"learning_rate": 3.841666666666667e-06,
"loss": 0.0681,
"num_input_tokens_seen": 3208160,
"step": 461
},
{
"epoch": 2.9734513274336285,
"grad_norm": 4.119838714599609,
"learning_rate": 3.85e-06,
"loss": 0.0651,
"num_input_tokens_seen": 3214992,
"step": 462
},
{
"epoch": 2.9798873692679004,
"grad_norm": 3.248420238494873,
"learning_rate": 3.858333333333333e-06,
"loss": 0.0498,
"num_input_tokens_seen": 3222192,
"step": 463
},
{
"epoch": 2.9863234111021724,
"grad_norm": 1.6198488473892212,
"learning_rate": 3.866666666666667e-06,
"loss": 0.0496,
"num_input_tokens_seen": 3229504,
"step": 464
},
{
"epoch": 2.9927594529364443,
"grad_norm": 2.6008763313293457,
"learning_rate": 3.875e-06,
"loss": 0.0446,
"num_input_tokens_seen": 3236400,
"step": 465
},
{
"epoch": 2.9991954947707162,
"grad_norm": 2.349928379058838,
"learning_rate": 3.883333333333333e-06,
"loss": 0.0543,
"num_input_tokens_seen": 3243600,
"step": 466
},
{
"epoch": 3.0056315366049877,
"grad_norm": 0.8590204119682312,
"learning_rate": 3.891666666666667e-06,
"loss": 0.0137,
"num_input_tokens_seen": 3249808,
"step": 467
},
{
"epoch": 3.0120675784392597,
"grad_norm": 1.2689623832702637,
"learning_rate": 3.900000000000001e-06,
"loss": 0.0201,
"num_input_tokens_seen": 3257168,
"step": 468
},
{
"epoch": 3.0185036202735316,
"grad_norm": 1.329512596130371,
"learning_rate": 3.908333333333334e-06,
"loss": 0.0119,
"num_input_tokens_seen": 3264064,
"step": 469
},
{
"epoch": 3.0249396621078035,
"grad_norm": 2.423644781112671,
"learning_rate": 3.916666666666667e-06,
"loss": 0.0305,
"num_input_tokens_seen": 3270688,
"step": 470
},
{
"epoch": 3.0313757039420755,
"grad_norm": 3.6647322177886963,
"learning_rate": 3.9250000000000005e-06,
"loss": 0.0213,
"num_input_tokens_seen": 3277664,
"step": 471
},
{
"epoch": 3.0378117457763474,
"grad_norm": 3.736281156539917,
"learning_rate": 3.9333333333333335e-06,
"loss": 0.035,
"num_input_tokens_seen": 3284352,
"step": 472
},
{
"epoch": 3.0442477876106193,
"grad_norm": 2.274883270263672,
"learning_rate": 3.941666666666667e-06,
"loss": 0.0438,
"num_input_tokens_seen": 3290864,
"step": 473
},
{
"epoch": 3.0506838294448912,
"grad_norm": 3.032172203063965,
"learning_rate": 3.95e-06,
"loss": 0.0464,
"num_input_tokens_seen": 3297856,
"step": 474
},
{
"epoch": 3.057119871279163,
"grad_norm": 2.258751392364502,
"learning_rate": 3.958333333333333e-06,
"loss": 0.0172,
"num_input_tokens_seen": 3305120,
"step": 475
},
{
"epoch": 3.063555913113435,
"grad_norm": 2.925736427307129,
"learning_rate": 3.966666666666667e-06,
"loss": 0.0287,
"num_input_tokens_seen": 3312032,
"step": 476
},
{
"epoch": 3.069991954947707,
"grad_norm": 3.100857734680176,
"learning_rate": 3.975000000000001e-06,
"loss": 0.0579,
"num_input_tokens_seen": 3319424,
"step": 477
},
{
"epoch": 3.076427996781979,
"grad_norm": 1.753515601158142,
"learning_rate": 3.983333333333334e-06,
"loss": 0.0095,
"num_input_tokens_seen": 3326304,
"step": 478
},
{
"epoch": 3.082864038616251,
"grad_norm": 2.3217740058898926,
"learning_rate": 3.991666666666667e-06,
"loss": 0.0238,
"num_input_tokens_seen": 3333184,
"step": 479
},
{
"epoch": 3.089300080450523,
"grad_norm": 2.512751579284668,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0313,
"num_input_tokens_seen": 3340384,
"step": 480
},
{
"epoch": 3.0957361222847948,
"grad_norm": 1.2185322046279907,
"learning_rate": 4.008333333333334e-06,
"loss": 0.0146,
"num_input_tokens_seen": 3347344,
"step": 481
},
{
"epoch": 3.1021721641190667,
"grad_norm": 1.1303057670593262,
"learning_rate": 4.0166666666666675e-06,
"loss": 0.0347,
"num_input_tokens_seen": 3354080,
"step": 482
},
{
"epoch": 3.1086082059533386,
"grad_norm": 2.4247186183929443,
"learning_rate": 4.0250000000000004e-06,
"loss": 0.024,
"num_input_tokens_seen": 3360848,
"step": 483
},
{
"epoch": 3.1150442477876106,
"grad_norm": 1.4767001867294312,
"learning_rate": 4.033333333333333e-06,
"loss": 0.0128,
"num_input_tokens_seen": 3367616,
"step": 484
},
{
"epoch": 3.1214802896218825,
"grad_norm": 2.458953857421875,
"learning_rate": 4.041666666666667e-06,
"loss": 0.0311,
"num_input_tokens_seen": 3374880,
"step": 485
},
{
"epoch": 3.1279163314561544,
"grad_norm": 0.5494964718818665,
"learning_rate": 4.05e-06,
"loss": 0.0178,
"num_input_tokens_seen": 3381696,
"step": 486
},
{
"epoch": 3.1343523732904264,
"grad_norm": 1.5969914197921753,
"learning_rate": 4.058333333333333e-06,
"loss": 0.0379,
"num_input_tokens_seen": 3388880,
"step": 487
},
{
"epoch": 3.1407884151246983,
"grad_norm": 1.7003910541534424,
"learning_rate": 4.066666666666667e-06,
"loss": 0.0299,
"num_input_tokens_seen": 3395984,
"step": 488
},
{
"epoch": 3.1472244569589702,
"grad_norm": 2.297182083129883,
"learning_rate": 4.075e-06,
"loss": 0.0261,
"num_input_tokens_seen": 3402896,
"step": 489
},
{
"epoch": 3.153660498793242,
"grad_norm": 2.3937814235687256,
"learning_rate": 4.083333333333334e-06,
"loss": 0.0347,
"num_input_tokens_seen": 3409888,
"step": 490
},
{
"epoch": 3.160096540627514,
"grad_norm": 1.349425196647644,
"learning_rate": 4.091666666666667e-06,
"loss": 0.011,
"num_input_tokens_seen": 3416928,
"step": 491
},
{
"epoch": 3.166532582461786,
"grad_norm": 3.0355069637298584,
"learning_rate": 4.1e-06,
"loss": 0.0541,
"num_input_tokens_seen": 3423968,
"step": 492
},
{
"epoch": 3.172968624296058,
"grad_norm": 2.680206537246704,
"learning_rate": 4.1083333333333335e-06,
"loss": 0.0465,
"num_input_tokens_seen": 3431120,
"step": 493
},
{
"epoch": 3.17940466613033,
"grad_norm": 1.5906095504760742,
"learning_rate": 4.116666666666667e-06,
"loss": 0.0187,
"num_input_tokens_seen": 3437776,
"step": 494
},
{
"epoch": 3.185840707964602,
"grad_norm": 0.8296425938606262,
"learning_rate": 4.125e-06,
"loss": 0.0089,
"num_input_tokens_seen": 3444480,
"step": 495
},
{
"epoch": 3.1922767497988738,
"grad_norm": 2.857689142227173,
"learning_rate": 4.133333333333333e-06,
"loss": 0.0289,
"num_input_tokens_seen": 3451232,
"step": 496
},
{
"epoch": 3.1987127916331457,
"grad_norm": 1.0910203456878662,
"learning_rate": 4.141666666666667e-06,
"loss": 0.0103,
"num_input_tokens_seen": 3457776,
"step": 497
},
{
"epoch": 3.2051488334674176,
"grad_norm": 1.3560919761657715,
"learning_rate": 4.15e-06,
"loss": 0.0132,
"num_input_tokens_seen": 3465056,
"step": 498
},
{
"epoch": 3.2115848753016896,
"grad_norm": 4.861215591430664,
"learning_rate": 4.158333333333334e-06,
"loss": 0.0375,
"num_input_tokens_seen": 3471968,
"step": 499
},
{
"epoch": 3.2180209171359615,
"grad_norm": 1.8714208602905273,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0143,
"num_input_tokens_seen": 3479648,
"step": 500
},
{
"epoch": 3.2244569589702334,
"grad_norm": 1.6230028867721558,
"learning_rate": 4.175e-06,
"loss": 0.0159,
"num_input_tokens_seen": 3486272,
"step": 501
},
{
"epoch": 3.2308930008045054,
"grad_norm": 0.7852226495742798,
"learning_rate": 4.183333333333334e-06,
"loss": 0.0073,
"num_input_tokens_seen": 3493360,
"step": 502
},
{
"epoch": 3.2373290426387773,
"grad_norm": 2.3990976810455322,
"learning_rate": 4.1916666666666675e-06,
"loss": 0.0186,
"num_input_tokens_seen": 3500336,
"step": 503
},
{
"epoch": 3.2437650844730492,
"grad_norm": 0.796851634979248,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.0035,
"num_input_tokens_seen": 3507232,
"step": 504
},
{
"epoch": 3.250201126307321,
"grad_norm": 2.7951748371124268,
"learning_rate": 4.208333333333333e-06,
"loss": 0.0416,
"num_input_tokens_seen": 3514144,
"step": 505
},
{
"epoch": 3.256637168141593,
"grad_norm": 2.40897274017334,
"learning_rate": 4.216666666666667e-06,
"loss": 0.0266,
"num_input_tokens_seen": 3520976,
"step": 506
},
{
"epoch": 3.263073209975865,
"grad_norm": 2.3974061012268066,
"learning_rate": 4.225e-06,
"loss": 0.0351,
"num_input_tokens_seen": 3527920,
"step": 507
},
{
"epoch": 3.2695092518101365,
"grad_norm": 2.30100154876709,
"learning_rate": 4.233333333333334e-06,
"loss": 0.0209,
"num_input_tokens_seen": 3534864,
"step": 508
},
{
"epoch": 3.2759452936444085,
"grad_norm": 2.1172518730163574,
"learning_rate": 4.241666666666667e-06,
"loss": 0.0434,
"num_input_tokens_seen": 3541872,
"step": 509
},
{
"epoch": 3.2823813354786804,
"grad_norm": 3.7030341625213623,
"learning_rate": 4.25e-06,
"loss": 0.0174,
"num_input_tokens_seen": 3548384,
"step": 510
},
{
"epoch": 3.2888173773129523,
"grad_norm": 2.152125597000122,
"learning_rate": 4.258333333333334e-06,
"loss": 0.0529,
"num_input_tokens_seen": 3555792,
"step": 511
},
{
"epoch": 3.2952534191472242,
"grad_norm": 0.6081152558326721,
"learning_rate": 4.266666666666668e-06,
"loss": 0.0033,
"num_input_tokens_seen": 3562608,
"step": 512
},
{
"epoch": 3.301689460981496,
"grad_norm": 1.7042624950408936,
"learning_rate": 4.2750000000000006e-06,
"loss": 0.0196,
"num_input_tokens_seen": 3569184,
"step": 513
},
{
"epoch": 3.308125502815768,
"grad_norm": 1.3502767086029053,
"learning_rate": 4.2833333333333335e-06,
"loss": 0.0242,
"num_input_tokens_seen": 3576224,
"step": 514
},
{
"epoch": 3.31456154465004,
"grad_norm": 4.480360984802246,
"learning_rate": 4.2916666666666665e-06,
"loss": 0.0316,
"num_input_tokens_seen": 3583328,
"step": 515
},
{
"epoch": 3.320997586484312,
"grad_norm": 2.2217299938201904,
"learning_rate": 4.3e-06,
"loss": 0.0268,
"num_input_tokens_seen": 3590256,
"step": 516
},
{
"epoch": 3.327433628318584,
"grad_norm": 1.5919010639190674,
"learning_rate": 4.308333333333334e-06,
"loss": 0.0248,
"num_input_tokens_seen": 3597328,
"step": 517
},
{
"epoch": 3.333869670152856,
"grad_norm": 2.425961971282959,
"learning_rate": 4.316666666666667e-06,
"loss": 0.032,
"num_input_tokens_seen": 3604576,
"step": 518
},
{
"epoch": 3.340305711987128,
"grad_norm": 2.987424612045288,
"learning_rate": 4.325e-06,
"loss": 0.0202,
"num_input_tokens_seen": 3611520,
"step": 519
},
{
"epoch": 3.3467417538213997,
"grad_norm": 2.633897304534912,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0329,
"num_input_tokens_seen": 3618288,
"step": 520
},
{
"epoch": 3.3531777956556716,
"grad_norm": 1.0696384906768799,
"learning_rate": 4.341666666666667e-06,
"loss": 0.019,
"num_input_tokens_seen": 3625216,
"step": 521
},
{
"epoch": 3.3596138374899436,
"grad_norm": 2.400972604751587,
"learning_rate": 4.350000000000001e-06,
"loss": 0.0182,
"num_input_tokens_seen": 3631888,
"step": 522
},
{
"epoch": 3.3660498793242155,
"grad_norm": 1.3744821548461914,
"learning_rate": 4.358333333333334e-06,
"loss": 0.0124,
"num_input_tokens_seen": 3638848,
"step": 523
},
{
"epoch": 3.3724859211584874,
"grad_norm": 1.613145112991333,
"learning_rate": 4.366666666666667e-06,
"loss": 0.0122,
"num_input_tokens_seen": 3646112,
"step": 524
},
{
"epoch": 3.3789219629927594,
"grad_norm": 2.450824499130249,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.0388,
"num_input_tokens_seen": 3652928,
"step": 525
},
{
"epoch": 3.3853580048270313,
"grad_norm": 1.6122058629989624,
"learning_rate": 4.383333333333334e-06,
"loss": 0.0106,
"num_input_tokens_seen": 3659632,
"step": 526
},
{
"epoch": 3.3917940466613032,
"grad_norm": 1.53513765335083,
"learning_rate": 4.391666666666667e-06,
"loss": 0.0305,
"num_input_tokens_seen": 3666480,
"step": 527
},
{
"epoch": 3.398230088495575,
"grad_norm": 2.103663444519043,
"learning_rate": 4.4e-06,
"loss": 0.0512,
"num_input_tokens_seen": 3673136,
"step": 528
},
{
"epoch": 3.404666130329847,
"grad_norm": 0.41373467445373535,
"learning_rate": 4.408333333333334e-06,
"loss": 0.0031,
"num_input_tokens_seen": 3679760,
"step": 529
},
{
"epoch": 3.411102172164119,
"grad_norm": 2.9610488414764404,
"learning_rate": 4.416666666666667e-06,
"loss": 0.0309,
"num_input_tokens_seen": 3686576,
"step": 530
},
{
"epoch": 3.417538213998391,
"grad_norm": 2.415531873703003,
"learning_rate": 4.425e-06,
"loss": 0.0472,
"num_input_tokens_seen": 3693312,
"step": 531
},
{
"epoch": 3.423974255832663,
"grad_norm": 2.175546407699585,
"learning_rate": 4.433333333333334e-06,
"loss": 0.0222,
"num_input_tokens_seen": 3700000,
"step": 532
},
{
"epoch": 3.430410297666935,
"grad_norm": 1.0903018712997437,
"learning_rate": 4.441666666666667e-06,
"loss": 0.0077,
"num_input_tokens_seen": 3706736,
"step": 533
},
{
"epoch": 3.4368463395012068,
"grad_norm": 0.8305991888046265,
"learning_rate": 4.450000000000001e-06,
"loss": 0.0064,
"num_input_tokens_seen": 3714192,
"step": 534
},
{
"epoch": 3.4432823813354787,
"grad_norm": 0.9347790479660034,
"learning_rate": 4.4583333333333336e-06,
"loss": 0.0104,
"num_input_tokens_seen": 3721408,
"step": 535
},
{
"epoch": 3.4497184231697506,
"grad_norm": 1.7669559717178345,
"learning_rate": 4.4666666666666665e-06,
"loss": 0.0121,
"num_input_tokens_seen": 3728144,
"step": 536
},
{
"epoch": 3.4561544650040226,
"grad_norm": 3.121467351913452,
"learning_rate": 4.475e-06,
"loss": 0.0386,
"num_input_tokens_seen": 3734960,
"step": 537
},
{
"epoch": 3.4625905068382945,
"grad_norm": 2.683410882949829,
"learning_rate": 4.483333333333333e-06,
"loss": 0.0319,
"num_input_tokens_seen": 3741728,
"step": 538
},
{
"epoch": 3.4690265486725664,
"grad_norm": 9.728205680847168,
"learning_rate": 4.491666666666667e-06,
"loss": 0.0579,
"num_input_tokens_seen": 3749200,
"step": 539
},
{
"epoch": 3.4754625905068384,
"grad_norm": 4.415483474731445,
"learning_rate": 4.5e-06,
"loss": 0.0255,
"num_input_tokens_seen": 3755856,
"step": 540
},
{
"epoch": 3.4818986323411103,
"grad_norm": 3.651423692703247,
"learning_rate": 4.508333333333333e-06,
"loss": 0.0301,
"num_input_tokens_seen": 3762528,
"step": 541
},
{
"epoch": 3.4883346741753822,
"grad_norm": 2.318000078201294,
"learning_rate": 4.516666666666667e-06,
"loss": 0.0589,
"num_input_tokens_seen": 3769632,
"step": 542
},
{
"epoch": 3.494770716009654,
"grad_norm": 4.982158660888672,
"learning_rate": 4.525000000000001e-06,
"loss": 0.0442,
"num_input_tokens_seen": 3776592,
"step": 543
},
{
"epoch": 3.501206757843926,
"grad_norm": 3.0872108936309814,
"learning_rate": 4.533333333333334e-06,
"loss": 0.0366,
"num_input_tokens_seen": 3783824,
"step": 544
},
{
"epoch": 3.507642799678198,
"grad_norm": 5.150477886199951,
"learning_rate": 4.541666666666667e-06,
"loss": 0.0643,
"num_input_tokens_seen": 3790864,
"step": 545
},
{
"epoch": 3.51407884151247,
"grad_norm": 3.0513834953308105,
"learning_rate": 4.5500000000000005e-06,
"loss": 0.0213,
"num_input_tokens_seen": 3797664,
"step": 546
},
{
"epoch": 3.520514883346742,
"grad_norm": 1.5530712604522705,
"learning_rate": 4.5583333333333335e-06,
"loss": 0.0154,
"num_input_tokens_seen": 3804576,
"step": 547
},
{
"epoch": 3.526950925181014,
"grad_norm": 2.6350319385528564,
"learning_rate": 4.566666666666667e-06,
"loss": 0.0252,
"num_input_tokens_seen": 3811440,
"step": 548
},
{
"epoch": 3.5333869670152858,
"grad_norm": 2.8993167877197266,
"learning_rate": 4.575e-06,
"loss": 0.038,
"num_input_tokens_seen": 3818352,
"step": 549
},
{
"epoch": 3.5398230088495577,
"grad_norm": 2.0168752670288086,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0169,
"num_input_tokens_seen": 3825360,
"step": 550
},
{
"epoch": 3.5462590506838296,
"grad_norm": 2.4160525798797607,
"learning_rate": 4.591666666666667e-06,
"loss": 0.0253,
"num_input_tokens_seen": 3832416,
"step": 551
},
{
"epoch": 3.5526950925181016,
"grad_norm": 1.543545126914978,
"learning_rate": 4.600000000000001e-06,
"loss": 0.0164,
"num_input_tokens_seen": 3839344,
"step": 552
},
{
"epoch": 3.5591311343523735,
"grad_norm": 2.355316400527954,
"learning_rate": 4.608333333333334e-06,
"loss": 0.0269,
"num_input_tokens_seen": 3846688,
"step": 553
},
{
"epoch": 3.5655671761866454,
"grad_norm": 1.4751020669937134,
"learning_rate": 4.616666666666667e-06,
"loss": 0.0192,
"num_input_tokens_seen": 3853696,
"step": 554
},
{
"epoch": 3.5720032180209174,
"grad_norm": 0.9673195481300354,
"learning_rate": 4.625000000000001e-06,
"loss": 0.0132,
"num_input_tokens_seen": 3860832,
"step": 555
},
{
"epoch": 3.5784392598551893,
"grad_norm": 1.1592040061950684,
"learning_rate": 4.633333333333334e-06,
"loss": 0.0156,
"num_input_tokens_seen": 3868000,
"step": 556
},
{
"epoch": 3.5848753016894612,
"grad_norm": 1.01143217086792,
"learning_rate": 4.641666666666667e-06,
"loss": 0.0081,
"num_input_tokens_seen": 3874672,
"step": 557
},
{
"epoch": 3.591311343523733,
"grad_norm": 2.855041980743408,
"learning_rate": 4.65e-06,
"loss": 0.0351,
"num_input_tokens_seen": 3881744,
"step": 558
},
{
"epoch": 3.597747385358005,
"grad_norm": 2.0597968101501465,
"learning_rate": 4.658333333333333e-06,
"loss": 0.0288,
"num_input_tokens_seen": 3888256,
"step": 559
},
{
"epoch": 3.604183427192277,
"grad_norm": 2.9965226650238037,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0335,
"num_input_tokens_seen": 3895104,
"step": 560
},
{
"epoch": 3.6106194690265485,
"grad_norm": 3.625206708908081,
"learning_rate": 4.675000000000001e-06,
"loss": 0.0492,
"num_input_tokens_seen": 3902208,
"step": 561
},
{
"epoch": 3.6170555108608204,
"grad_norm": 2.021160840988159,
"learning_rate": 4.683333333333334e-06,
"loss": 0.0082,
"num_input_tokens_seen": 3909040,
"step": 562
},
{
"epoch": 3.6234915526950924,
"grad_norm": 3.4565329551696777,
"learning_rate": 4.691666666666667e-06,
"loss": 0.0491,
"num_input_tokens_seen": 3916304,
"step": 563
},
{
"epoch": 3.6299275945293643,
"grad_norm": 3.2362654209136963,
"learning_rate": 4.7e-06,
"loss": 0.0568,
"num_input_tokens_seen": 3923216,
"step": 564
},
{
"epoch": 3.6363636363636362,
"grad_norm": 3.234666347503662,
"learning_rate": 4.708333333333334e-06,
"loss": 0.0414,
"num_input_tokens_seen": 3930448,
"step": 565
},
{
"epoch": 3.642799678197908,
"grad_norm": 2.1742103099823,
"learning_rate": 4.7166666666666675e-06,
"loss": 0.034,
"num_input_tokens_seen": 3937424,
"step": 566
},
{
"epoch": 3.64923572003218,
"grad_norm": 2.9156923294067383,
"learning_rate": 4.7250000000000005e-06,
"loss": 0.0392,
"num_input_tokens_seen": 3944112,
"step": 567
},
{
"epoch": 3.655671761866452,
"grad_norm": 4.092429161071777,
"learning_rate": 4.7333333333333335e-06,
"loss": 0.051,
"num_input_tokens_seen": 3951504,
"step": 568
},
{
"epoch": 3.662107803700724,
"grad_norm": 3.9395768642425537,
"learning_rate": 4.741666666666667e-06,
"loss": 0.034,
"num_input_tokens_seen": 3958352,
"step": 569
},
{
"epoch": 3.668543845534996,
"grad_norm": 1.9961844682693481,
"learning_rate": 4.75e-06,
"loss": 0.014,
"num_input_tokens_seen": 3965552,
"step": 570
},
{
"epoch": 3.674979887369268,
"grad_norm": 1.8078194856643677,
"learning_rate": 4.758333333333334e-06,
"loss": 0.0406,
"num_input_tokens_seen": 3972544,
"step": 571
},
{
"epoch": 3.6814159292035398,
"grad_norm": 2.048532485961914,
"learning_rate": 4.766666666666667e-06,
"loss": 0.0407,
"num_input_tokens_seen": 3979264,
"step": 572
},
{
"epoch": 3.6878519710378117,
"grad_norm": 1.9979974031448364,
"learning_rate": 4.775e-06,
"loss": 0.0282,
"num_input_tokens_seen": 3986240,
"step": 573
},
{
"epoch": 3.6942880128720836,
"grad_norm": 3.6126463413238525,
"learning_rate": 4.783333333333334e-06,
"loss": 0.0326,
"num_input_tokens_seen": 3993232,
"step": 574
},
{
"epoch": 3.7007240547063556,
"grad_norm": 3.131657838821411,
"learning_rate": 4.791666666666668e-06,
"loss": 0.0348,
"num_input_tokens_seen": 3999952,
"step": 575
},
{
"epoch": 3.7071600965406275,
"grad_norm": 2.2662060260772705,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0256,
"num_input_tokens_seen": 4007456,
"step": 576
},
{
"epoch": 3.7135961383748994,
"grad_norm": 4.874523639678955,
"learning_rate": 4.808333333333334e-06,
"loss": 0.0765,
"num_input_tokens_seen": 4015024,
"step": 577
},
{
"epoch": 3.7200321802091714,
"grad_norm": 0.882166862487793,
"learning_rate": 4.816666666666667e-06,
"loss": 0.0099,
"num_input_tokens_seen": 4021920,
"step": 578
},
{
"epoch": 3.7264682220434433,
"grad_norm": 3.1239066123962402,
"learning_rate": 4.825e-06,
"loss": 0.0173,
"num_input_tokens_seen": 4028720,
"step": 579
},
{
"epoch": 3.7329042638777152,
"grad_norm": 1.5819370746612549,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0084,
"num_input_tokens_seen": 4035584,
"step": 580
},
{
"epoch": 3.739340305711987,
"grad_norm": 2.6252429485321045,
"learning_rate": 4.841666666666667e-06,
"loss": 0.0251,
"num_input_tokens_seen": 4042464,
"step": 581
},
{
"epoch": 3.745776347546259,
"grad_norm": 2.0619590282440186,
"learning_rate": 4.85e-06,
"loss": 0.0909,
"num_input_tokens_seen": 4049600,
"step": 582
},
{
"epoch": 3.752212389380531,
"grad_norm": 2.547422409057617,
"learning_rate": 4.858333333333334e-06,
"loss": 0.039,
"num_input_tokens_seen": 4056320,
"step": 583
},
{
"epoch": 3.758648431214803,
"grad_norm": 1.3179091215133667,
"learning_rate": 4.866666666666667e-06,
"loss": 0.0079,
"num_input_tokens_seen": 4063200,
"step": 584
},
{
"epoch": 3.765084473049075,
"grad_norm": 3.090376377105713,
"learning_rate": 4.875e-06,
"loss": 0.0242,
"num_input_tokens_seen": 4070112,
"step": 585
},
{
"epoch": 3.771520514883347,
"grad_norm": 2.50468111038208,
"learning_rate": 4.883333333333334e-06,
"loss": 0.0138,
"num_input_tokens_seen": 4076928,
"step": 586
},
{
"epoch": 3.7779565567176188,
"grad_norm": 3.921415090560913,
"learning_rate": 4.8916666666666675e-06,
"loss": 0.0467,
"num_input_tokens_seen": 4083792,
"step": 587
},
{
"epoch": 3.7843925985518907,
"grad_norm": 1.2243348360061646,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.0241,
"num_input_tokens_seen": 4090672,
"step": 588
},
{
"epoch": 3.7908286403861626,
"grad_norm": 1.4968576431274414,
"learning_rate": 4.9083333333333335e-06,
"loss": 0.0404,
"num_input_tokens_seen": 4097472,
"step": 589
},
{
"epoch": 3.7972646822204346,
"grad_norm": 1.235217809677124,
"learning_rate": 4.9166666666666665e-06,
"loss": 0.0094,
"num_input_tokens_seen": 4104016,
"step": 590
},
{
"epoch": 3.8037007240547065,
"grad_norm": 1.3862783908843994,
"learning_rate": 4.925e-06,
"loss": 0.0196,
"num_input_tokens_seen": 4110784,
"step": 591
},
{
"epoch": 3.8101367658889784,
"grad_norm": 3.560793399810791,
"learning_rate": 4.933333333333334e-06,
"loss": 0.0514,
"num_input_tokens_seen": 4117984,
"step": 592
},
{
"epoch": 3.8165728077232504,
"grad_norm": 2.008575677871704,
"learning_rate": 4.941666666666667e-06,
"loss": 0.0286,
"num_input_tokens_seen": 4125072,
"step": 593
},
{
"epoch": 3.823008849557522,
"grad_norm": 2.3213093280792236,
"learning_rate": 4.95e-06,
"loss": 0.0417,
"num_input_tokens_seen": 4132160,
"step": 594
},
{
"epoch": 3.829444891391794,
"grad_norm": 1.3540257215499878,
"learning_rate": 4.958333333333334e-06,
"loss": 0.0347,
"num_input_tokens_seen": 4139136,
"step": 595
},
{
"epoch": 3.8358809332260657,
"grad_norm": 1.289825677871704,
"learning_rate": 4.966666666666667e-06,
"loss": 0.0229,
"num_input_tokens_seen": 4146240,
"step": 596
},
{
"epoch": 3.8423169750603376,
"grad_norm": 2.4050135612487793,
"learning_rate": 4.975000000000001e-06,
"loss": 0.0176,
"num_input_tokens_seen": 4153152,
"step": 597
},
{
"epoch": 3.8487530168946096,
"grad_norm": 1.523977518081665,
"learning_rate": 4.983333333333334e-06,
"loss": 0.0274,
"num_input_tokens_seen": 4160080,
"step": 598
},
{
"epoch": 3.8551890587288815,
"grad_norm": 1.1898863315582275,
"learning_rate": 4.991666666666667e-06,
"loss": 0.0253,
"num_input_tokens_seen": 4167008,
"step": 599
},
{
"epoch": 3.8616251005631534,
"grad_norm": 1.992311954498291,
"learning_rate": 5e-06,
"loss": 0.0429,
"num_input_tokens_seen": 4174080,
"step": 600
},
{
"epoch": 3.8680611423974254,
"grad_norm": 0.9558950066566467,
"learning_rate": 4.999597169822646e-06,
"loss": 0.0142,
"num_input_tokens_seen": 4181104,
"step": 601
},
{
"epoch": 3.8744971842316973,
"grad_norm": 0.9275301694869995,
"learning_rate": 4.998388809108304e-06,
"loss": 0.0148,
"num_input_tokens_seen": 4188096,
"step": 602
},
{
"epoch": 3.8809332260659692,
"grad_norm": 1.6707432270050049,
"learning_rate": 4.996375307268303e-06,
"loss": 0.0166,
"num_input_tokens_seen": 4195152,
"step": 603
},
{
"epoch": 3.887369267900241,
"grad_norm": 5.857227325439453,
"learning_rate": 4.993557313182086e-06,
"loss": 0.0224,
"num_input_tokens_seen": 4201952,
"step": 604
},
{
"epoch": 3.893805309734513,
"grad_norm": 5.273613452911377,
"learning_rate": 4.989935734988098e-06,
"loss": 0.0227,
"num_input_tokens_seen": 4209104,
"step": 605
},
{
"epoch": 3.900241351568785,
"grad_norm": 6.268670082092285,
"learning_rate": 4.985511739791129e-06,
"loss": 0.0597,
"num_input_tokens_seen": 4216496,
"step": 606
},
{
"epoch": 3.906677393403057,
"grad_norm": 3.373368501663208,
"learning_rate": 4.980286753286196e-06,
"loss": 0.0339,
"num_input_tokens_seen": 4223840,
"step": 607
},
{
"epoch": 3.913113435237329,
"grad_norm": 1.3991198539733887,
"learning_rate": 4.974262459299088e-06,
"loss": 0.0192,
"num_input_tokens_seen": 4230752,
"step": 608
},
{
"epoch": 3.919549477071601,
"grad_norm": 0.7424534559249878,
"learning_rate": 4.967440799243739e-06,
"loss": 0.007,
"num_input_tokens_seen": 4237360,
"step": 609
},
{
"epoch": 3.9259855189058728,
"grad_norm": 3.0347440242767334,
"learning_rate": 4.959823971496575e-06,
"loss": 0.017,
"num_input_tokens_seen": 4244128,
"step": 610
},
{
"epoch": 3.9324215607401447,
"grad_norm": 2.929175853729248,
"learning_rate": 4.9514144306880506e-06,
"loss": 0.0296,
"num_input_tokens_seen": 4251264,
"step": 611
},
{
"epoch": 3.9388576025744166,
"grad_norm": 4.076401710510254,
"learning_rate": 4.942214886911619e-06,
"loss": 0.0429,
"num_input_tokens_seen": 4258256,
"step": 612
},
{
"epoch": 3.9452936444086886,
"grad_norm": 0.7720851302146912,
"learning_rate": 4.932228304850363e-06,
"loss": 0.0027,
"num_input_tokens_seen": 4265280,
"step": 613
},
{
"epoch": 3.9517296862429605,
"grad_norm": 1.500545859336853,
"learning_rate": 4.921457902821578e-06,
"loss": 0.0395,
"num_input_tokens_seen": 4271968,
"step": 614
},
{
"epoch": 3.9581657280772324,
"grad_norm": 3.0767860412597656,
"learning_rate": 4.909907151739634e-06,
"loss": 0.03,
"num_input_tokens_seen": 4278848,
"step": 615
},
{
"epoch": 3.9646017699115044,
"grad_norm": 1.5455620288848877,
"learning_rate": 4.897579773997415e-06,
"loss": 0.0178,
"num_input_tokens_seen": 4285808,
"step": 616
},
{
"epoch": 3.9710378117457763,
"grad_norm": 1.1472654342651367,
"learning_rate": 4.884479742266731e-06,
"loss": 0.0139,
"num_input_tokens_seen": 4292912,
"step": 617
},
{
"epoch": 3.9774738535800482,
"grad_norm": 1.3290921449661255,
"learning_rate": 4.870611278218066e-06,
"loss": 0.0076,
"num_input_tokens_seen": 4300176,
"step": 618
},
{
"epoch": 3.98390989541432,
"grad_norm": 4.543910026550293,
"learning_rate": 4.855978851160088e-06,
"loss": 0.0683,
"num_input_tokens_seen": 4307776,
"step": 619
},
{
"epoch": 3.990345937248592,
"grad_norm": 3.424959421157837,
"learning_rate": 4.8405871765993435e-06,
"loss": 0.0367,
"num_input_tokens_seen": 4314688,
"step": 620
},
{
"epoch": 3.996781979082864,
"grad_norm": 1.5345810651779175,
"learning_rate": 4.824441214720629e-06,
"loss": 0.0497,
"num_input_tokens_seen": 4321840,
"step": 621
},
{
"epoch": 4.003218020917136,
"grad_norm": 0.5405219793319702,
"learning_rate": 4.8075461687884935e-06,
"loss": 0.0054,
"num_input_tokens_seen": 4328736,
"step": 622
},
{
"epoch": 4.009654062751408,
"grad_norm": 2.3540198802948,
"learning_rate": 4.7899074834704165e-06,
"loss": 0.0259,
"num_input_tokens_seen": 4335952,
"step": 623
},
{
"epoch": 4.01609010458568,
"grad_norm": 0.7733599543571472,
"learning_rate": 4.771530843082187e-06,
"loss": 0.0082,
"num_input_tokens_seen": 4342816,
"step": 624
},
{
"epoch": 4.022526146419952,
"grad_norm": 3.051017999649048,
"learning_rate": 4.752422169756048e-06,
"loss": 0.0359,
"num_input_tokens_seen": 4349456,
"step": 625
},
{
"epoch": 4.028962188254224,
"grad_norm": 0.4645274579524994,
"learning_rate": 4.732587621532214e-06,
"loss": 0.0081,
"num_input_tokens_seen": 4356032,
"step": 626
},
{
"epoch": 4.035398230088496,
"grad_norm": 1.9294419288635254,
"learning_rate": 4.712033590374346e-06,
"loss": 0.0118,
"num_input_tokens_seen": 4362928,
"step": 627
},
{
"epoch": 4.041834271922768,
"grad_norm": 2.5432851314544678,
"learning_rate": 4.690766700109659e-06,
"loss": 0.0235,
"num_input_tokens_seen": 4369616,
"step": 628
},
{
"epoch": 4.0482703137570395,
"grad_norm": 1.8334590196609497,
"learning_rate": 4.668793804294294e-06,
"loss": 0.0145,
"num_input_tokens_seen": 4376656,
"step": 629
},
{
"epoch": 4.054706355591311,
"grad_norm": 0.6473208069801331,
"learning_rate": 4.646121984004666e-06,
"loss": 0.006,
"num_input_tokens_seen": 4383696,
"step": 630
},
{
"epoch": 4.061142397425583,
"grad_norm": 2.0988128185272217,
"learning_rate": 4.622758545555485e-06,
"loss": 0.0191,
"num_input_tokens_seen": 4390880,
"step": 631
},
{
"epoch": 4.067578439259855,
"grad_norm": 1.8957973718643188,
"learning_rate": 4.598711018145193e-06,
"loss": 0.0075,
"num_input_tokens_seen": 4398000,
"step": 632
},
{
"epoch": 4.074014481094127,
"grad_norm": 1.117255449295044,
"learning_rate": 4.573987151429579e-06,
"loss": 0.0253,
"num_input_tokens_seen": 4404640,
"step": 633
},
{
"epoch": 4.080450522928399,
"grad_norm": 2.326129198074341,
"learning_rate": 4.54859491302433e-06,
"loss": 0.0317,
"num_input_tokens_seen": 4411760,
"step": 634
},
{
"epoch": 4.086886564762671,
"grad_norm": 1.6843276023864746,
"learning_rate": 4.522542485937369e-06,
"loss": 0.0082,
"num_input_tokens_seen": 4418896,
"step": 635
},
{
"epoch": 4.093322606596943,
"grad_norm": 2.301496744155884,
"learning_rate": 4.495838265931754e-06,
"loss": 0.0101,
"num_input_tokens_seen": 4425776,
"step": 636
},
{
"epoch": 4.099758648431215,
"grad_norm": 1.434444546699524,
"learning_rate": 4.4684908588200305e-06,
"loss": 0.0112,
"num_input_tokens_seen": 4432656,
"step": 637
},
{
"epoch": 4.106194690265487,
"grad_norm": 1.3446779251098633,
"learning_rate": 4.440509077690883e-06,
"loss": 0.0034,
"num_input_tokens_seen": 4439424,
"step": 638
},
{
"epoch": 4.112630732099759,
"grad_norm": 0.6733867526054382,
"learning_rate": 4.411901940068997e-06,
"loss": 0.0037,
"num_input_tokens_seen": 4446160,
"step": 639
},
{
"epoch": 4.119066773934031,
"grad_norm": 1.339034080505371,
"learning_rate": 4.382678665009028e-06,
"loss": 0.0085,
"num_input_tokens_seen": 4453376,
"step": 640
},
{
"epoch": 4.125502815768303,
"grad_norm": 3.2036638259887695,
"learning_rate": 4.352848670124637e-06,
"loss": 0.0328,
"num_input_tokens_seen": 4459952,
"step": 641
},
{
"epoch": 4.131938857602575,
"grad_norm": 1.1791878938674927,
"learning_rate": 4.322421568553529e-06,
"loss": 0.0098,
"num_input_tokens_seen": 4466880,
"step": 642
},
{
"epoch": 4.1383748994368466,
"grad_norm": 1.8526674509048462,
"learning_rate": 4.291407165859481e-06,
"loss": 0.0051,
"num_input_tokens_seen": 4474064,
"step": 643
},
{
"epoch": 4.1448109412711185,
"grad_norm": 0.4795032739639282,
"learning_rate": 4.259815456872363e-06,
"loss": 0.0047,
"num_input_tokens_seen": 4480864,
"step": 644
},
{
"epoch": 4.15124698310539,
"grad_norm": 1.4392155408859253,
"learning_rate": 4.227656622467162e-06,
"loss": 0.0111,
"num_input_tokens_seen": 4487504,
"step": 645
},
{
"epoch": 4.157683024939662,
"grad_norm": 3.185128688812256,
"learning_rate": 4.194941026283053e-06,
"loss": 0.0334,
"num_input_tokens_seen": 4494512,
"step": 646
},
{
"epoch": 4.164119066773934,
"grad_norm": 1.7285927534103394,
"learning_rate": 4.161679211383565e-06,
"loss": 0.013,
"num_input_tokens_seen": 4501296,
"step": 647
},
{
"epoch": 4.170555108608206,
"grad_norm": 4.266958713531494,
"learning_rate": 4.127881896858934e-06,
"loss": 0.0305,
"num_input_tokens_seen": 4508128,
"step": 648
},
{
"epoch": 4.176991150442478,
"grad_norm": 1.000532627105713,
"learning_rate": 4.093559974371725e-06,
"loss": 0.0092,
"num_input_tokens_seen": 4515008,
"step": 649
},
{
"epoch": 4.18342719227675,
"grad_norm": 1.1824270486831665,
"learning_rate": 4.058724504646834e-06,
"loss": 0.0223,
"num_input_tokens_seen": 4521920,
"step": 650
},
{
"epoch": 4.189863234111022,
"grad_norm": 2.444427728652954,
"learning_rate": 4.023386713907021e-06,
"loss": 0.0234,
"num_input_tokens_seen": 4528912,
"step": 651
},
{
"epoch": 4.196299275945294,
"grad_norm": 1.421184778213501,
"learning_rate": 3.987557990255093e-06,
"loss": 0.0185,
"num_input_tokens_seen": 4535664,
"step": 652
},
{
"epoch": 4.202735317779566,
"grad_norm": 0.9019869565963745,
"learning_rate": 3.951249880003934e-06,
"loss": 0.0075,
"num_input_tokens_seen": 4542832,
"step": 653
},
{
"epoch": 4.209171359613838,
"grad_norm": 1.7373372316360474,
"learning_rate": 3.914474083955537e-06,
"loss": 0.0217,
"num_input_tokens_seen": 4549552,
"step": 654
},
{
"epoch": 4.21560740144811,
"grad_norm": 0.31386592984199524,
"learning_rate": 3.8772424536302565e-06,
"loss": 0.0027,
"num_input_tokens_seen": 4556192,
"step": 655
},
{
"epoch": 4.222043443282382,
"grad_norm": 1.8379613161087036,
"learning_rate": 3.839566987447492e-06,
"loss": 0.0153,
"num_input_tokens_seen": 4563168,
"step": 656
},
{
"epoch": 4.228479485116654,
"grad_norm": 1.221056342124939,
"learning_rate": 3.801459826859022e-06,
"loss": 0.0092,
"num_input_tokens_seen": 4570704,
"step": 657
},
{
"epoch": 4.2349155269509255,
"grad_norm": 0.7823006510734558,
"learning_rate": 3.7629332524362532e-06,
"loss": 0.0082,
"num_input_tokens_seen": 4578016,
"step": 658
},
{
"epoch": 4.2413515687851975,
"grad_norm": 1.149715781211853,
"learning_rate": 3.7239996799126315e-06,
"loss": 0.0163,
"num_input_tokens_seen": 4584896,
"step": 659
},
{
"epoch": 4.247787610619469,
"grad_norm": 0.6069539189338684,
"learning_rate": 3.684671656182497e-06,
"loss": 0.0099,
"num_input_tokens_seen": 4591984,
"step": 660
},
{
"epoch": 4.254223652453741,
"grad_norm": 2.427281141281128,
"learning_rate": 3.644961855257669e-06,
"loss": 0.0269,
"num_input_tokens_seen": 4598656,
"step": 661
},
{
"epoch": 4.260659694288013,
"grad_norm": 1.0770633220672607,
"learning_rate": 3.6048830741830678e-06,
"loss": 0.007,
"num_input_tokens_seen": 4606032,
"step": 662
},
{
"epoch": 4.267095736122285,
"grad_norm": 2.4310688972473145,
"learning_rate": 3.564448228912682e-06,
"loss": 0.0427,
"num_input_tokens_seen": 4613056,
"step": 663
},
{
"epoch": 4.273531777956556,
"grad_norm": 1.2328161001205444,
"learning_rate": 3.523670350147227e-06,
"loss": 0.0122,
"num_input_tokens_seen": 4619776,
"step": 664
},
{
"epoch": 4.279967819790828,
"grad_norm": 1.519998550415039,
"learning_rate": 3.4825625791348093e-06,
"loss": 0.0137,
"num_input_tokens_seen": 4626240,
"step": 665
},
{
"epoch": 4.2864038616251,
"grad_norm": 1.4114880561828613,
"learning_rate": 3.44113816343598e-06,
"loss": 0.02,
"num_input_tokens_seen": 4633216,
"step": 666
},
{
"epoch": 4.292839903459372,
"grad_norm": 1.4585809707641602,
"learning_rate": 3.399410452654518e-06,
"loss": 0.006,
"num_input_tokens_seen": 4639856,
"step": 667
},
{
"epoch": 4.299275945293644,
"grad_norm": 1.594936490058899,
"learning_rate": 3.357392894135329e-06,
"loss": 0.0085,
"num_input_tokens_seen": 4646832,
"step": 668
},
{
"epoch": 4.305711987127916,
"grad_norm": 2.5802690982818604,
"learning_rate": 3.315099028630855e-06,
"loss": 0.0112,
"num_input_tokens_seen": 4653648,
"step": 669
},
{
"epoch": 4.312148028962188,
"grad_norm": 1.3826483488082886,
"learning_rate": 3.272542485937369e-06,
"loss": 0.0131,
"num_input_tokens_seen": 4660672,
"step": 670
},
{
"epoch": 4.31858407079646,
"grad_norm": 2.1874148845672607,
"learning_rate": 3.229736980502584e-06,
"loss": 0.0124,
"num_input_tokens_seen": 4667888,
"step": 671
},
{
"epoch": 4.325020112630732,
"grad_norm": 1.61604642868042,
"learning_rate": 3.186696307005976e-06,
"loss": 0.0042,
"num_input_tokens_seen": 4675072,
"step": 672
},
{
"epoch": 4.331456154465004,
"grad_norm": 0.40999871492385864,
"learning_rate": 3.1434343359132565e-06,
"loss": 0.0011,
"num_input_tokens_seen": 4682016,
"step": 673
},
{
"epoch": 4.337892196299276,
"grad_norm": 0.1305094212293625,
"learning_rate": 3.099965009006415e-06,
"loss": 0.0008,
"num_input_tokens_seen": 4688912,
"step": 674
},
{
"epoch": 4.3443282381335475,
"grad_norm": 1.6623185873031616,
"learning_rate": 3.056302334890786e-06,
"loss": 0.0056,
"num_input_tokens_seen": 4695936,
"step": 675
},
{
"epoch": 4.3507642799678194,
"grad_norm": 1.034837007522583,
"learning_rate": 3.0124603844805767e-06,
"loss": 0.0079,
"num_input_tokens_seen": 4703184,
"step": 676
},
{
"epoch": 4.357200321802091,
"grad_norm": 2.2049107551574707,
"learning_rate": 2.9684532864643123e-06,
"loss": 0.0216,
"num_input_tokens_seen": 4710064,
"step": 677
},
{
"epoch": 4.363636363636363,
"grad_norm": 4.32258939743042,
"learning_rate": 2.9242952227516726e-06,
"loss": 0.0258,
"num_input_tokens_seen": 4716336,
"step": 678
},
{
"epoch": 4.370072405470635,
"grad_norm": 1.0949031114578247,
"learning_rate": 2.8800004239031687e-06,
"loss": 0.0049,
"num_input_tokens_seen": 4723360,
"step": 679
},
{
"epoch": 4.376508447304907,
"grad_norm": 1.563004493713379,
"learning_rate": 2.835583164544139e-06,
"loss": 0.0034,
"num_input_tokens_seen": 4730464,
"step": 680
},
{
"epoch": 4.382944489139179,
"grad_norm": 2.775270938873291,
"learning_rate": 2.791057758764557e-06,
"loss": 0.0341,
"num_input_tokens_seen": 4737056,
"step": 681
},
{
"epoch": 4.389380530973451,
"grad_norm": 3.1517560482025146,
"learning_rate": 2.7464385555061092e-06,
"loss": 0.0074,
"num_input_tokens_seen": 4743936,
"step": 682
},
{
"epoch": 4.395816572807723,
"grad_norm": 1.2521913051605225,
"learning_rate": 2.7017399339380435e-06,
"loss": 0.0272,
"num_input_tokens_seen": 4751024,
"step": 683
},
{
"epoch": 4.402252614641995,
"grad_norm": 3.4706435203552246,
"learning_rate": 2.6569762988232838e-06,
"loss": 0.0168,
"num_input_tokens_seen": 4758000,
"step": 684
},
{
"epoch": 4.408688656476267,
"grad_norm": 0.8021034598350525,
"learning_rate": 2.6121620758762877e-06,
"loss": 0.0047,
"num_input_tokens_seen": 4764816,
"step": 685
},
{
"epoch": 4.415124698310539,
"grad_norm": 4.709753036499023,
"learning_rate": 2.5673117071141574e-06,
"loss": 0.0198,
"num_input_tokens_seen": 4772144,
"step": 686
},
{
"epoch": 4.421560740144811,
"grad_norm": 0.40973323583602905,
"learning_rate": 2.522439646202495e-06,
"loss": 0.0012,
"num_input_tokens_seen": 4778960,
"step": 687
},
{
"epoch": 4.427996781979083,
"grad_norm": 3.179236888885498,
"learning_rate": 2.4775603537975055e-06,
"loss": 0.0256,
"num_input_tokens_seen": 4785952,
"step": 688
},
{
"epoch": 4.434432823813355,
"grad_norm": 2.5204341411590576,
"learning_rate": 2.4326882928858435e-06,
"loss": 0.0187,
"num_input_tokens_seen": 4792608,
"step": 689
},
{
"epoch": 4.4408688656476265,
"grad_norm": 3.6536998748779297,
"learning_rate": 2.3878379241237136e-06,
"loss": 0.0135,
"num_input_tokens_seen": 4799232,
"step": 690
},
{
"epoch": 4.447304907481898,
"grad_norm": 1.0689839124679565,
"learning_rate": 2.3430237011767166e-06,
"loss": 0.0036,
"num_input_tokens_seen": 4806080,
"step": 691
},
{
"epoch": 4.45374094931617,
"grad_norm": 2.071629762649536,
"learning_rate": 2.2982600660619574e-06,
"loss": 0.0135,
"num_input_tokens_seen": 4813728,
"step": 692
},
{
"epoch": 4.460176991150442,
"grad_norm": 3.4168224334716797,
"learning_rate": 2.253561444493891e-06,
"loss": 0.0046,
"num_input_tokens_seen": 4820608,
"step": 693
},
{
"epoch": 4.466613032984714,
"grad_norm": 0.3058677017688751,
"learning_rate": 2.2089422412354434e-06,
"loss": 0.0019,
"num_input_tokens_seen": 4827056,
"step": 694
},
{
"epoch": 4.473049074818986,
"grad_norm": 0.4175882935523987,
"learning_rate": 2.1644168354558623e-06,
"loss": 0.0022,
"num_input_tokens_seen": 4834080,
"step": 695
},
{
"epoch": 4.479485116653258,
"grad_norm": 0.7226863503456116,
"learning_rate": 2.119999576096832e-06,
"loss": 0.0093,
"num_input_tokens_seen": 4840912,
"step": 696
},
{
"epoch": 4.48592115848753,
"grad_norm": 0.1190720871090889,
"learning_rate": 2.0757047772483278e-06,
"loss": 0.0012,
"num_input_tokens_seen": 4848112,
"step": 697
},
{
"epoch": 4.492357200321802,
"grad_norm": 1.0061287879943848,
"learning_rate": 2.031546713535688e-06,
"loss": 0.0036,
"num_input_tokens_seen": 4855072,
"step": 698
},
{
"epoch": 4.498793242156074,
"grad_norm": 0.9472126364707947,
"learning_rate": 1.987539615519424e-06,
"loss": 0.0071,
"num_input_tokens_seen": 4862064,
"step": 699
},
{
"epoch": 4.505229283990346,
"grad_norm": 0.8338857889175415,
"learning_rate": 1.9436976651092143e-06,
"loss": 0.0055,
"num_input_tokens_seen": 4869104,
"step": 700
},
{
"epoch": 4.511665325824618,
"grad_norm": 3.2061474323272705,
"learning_rate": 1.9000349909935852e-06,
"loss": 0.0291,
"num_input_tokens_seen": 4876112,
"step": 701
},
{
"epoch": 4.51810136765889,
"grad_norm": 3.644125461578369,
"learning_rate": 1.8565656640867448e-06,
"loss": 0.0407,
"num_input_tokens_seen": 4883264,
"step": 702
},
{
"epoch": 4.524537409493162,
"grad_norm": 2.2370316982269287,
"learning_rate": 1.813303692994025e-06,
"loss": 0.0245,
"num_input_tokens_seen": 4890192,
"step": 703
},
{
"epoch": 4.530973451327434,
"grad_norm": 3.3120510578155518,
"learning_rate": 1.770263019497417e-06,
"loss": 0.0207,
"num_input_tokens_seen": 4897200,
"step": 704
},
{
"epoch": 4.5374094931617055,
"grad_norm": 1.256335973739624,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.0269,
"num_input_tokens_seen": 4904016,
"step": 705
},
{
"epoch": 4.543845534995977,
"grad_norm": 0.10977872461080551,
"learning_rate": 1.6849009713691456e-06,
"loss": 0.001,
"num_input_tokens_seen": 4910944,
"step": 706
},
{
"epoch": 4.550281576830249,
"grad_norm": 1.9825077056884766,
"learning_rate": 1.6426071058646718e-06,
"loss": 0.0205,
"num_input_tokens_seen": 4917424,
"step": 707
},
{
"epoch": 4.556717618664521,
"grad_norm": 0.7529383897781372,
"learning_rate": 1.6005895473454836e-06,
"loss": 0.0148,
"num_input_tokens_seen": 4924288,
"step": 708
},
{
"epoch": 4.563153660498793,
"grad_norm": 2.29215145111084,
"learning_rate": 1.55886183656402e-06,
"loss": 0.0239,
"num_input_tokens_seen": 4931040,
"step": 709
},
{
"epoch": 4.569589702333065,
"grad_norm": 1.639636754989624,
"learning_rate": 1.5174374208651913e-06,
"loss": 0.0165,
"num_input_tokens_seen": 4937968,
"step": 710
},
{
"epoch": 4.576025744167337,
"grad_norm": 1.8043317794799805,
"learning_rate": 1.4763296498527744e-06,
"loss": 0.0079,
"num_input_tokens_seen": 4945456,
"step": 711
},
{
"epoch": 4.582461786001609,
"grad_norm": 1.8007737398147583,
"learning_rate": 1.4355517710873184e-06,
"loss": 0.0338,
"num_input_tokens_seen": 4952080,
"step": 712
},
{
"epoch": 4.588897827835881,
"grad_norm": 0.6810876131057739,
"learning_rate": 1.395116925816934e-06,
"loss": 0.0136,
"num_input_tokens_seen": 4958944,
"step": 713
},
{
"epoch": 4.595333869670153,
"grad_norm": 1.0080180168151855,
"learning_rate": 1.3550381447423317e-06,
"loss": 0.0126,
"num_input_tokens_seen": 4966320,
"step": 714
},
{
"epoch": 4.601769911504425,
"grad_norm": 1.1210750341415405,
"learning_rate": 1.3153283438175036e-06,
"loss": 0.0174,
"num_input_tokens_seen": 4973344,
"step": 715
},
{
"epoch": 4.608205953338697,
"grad_norm": 2.2793147563934326,
"learning_rate": 1.27600032008737e-06,
"loss": 0.0155,
"num_input_tokens_seen": 4980304,
"step": 716
},
{
"epoch": 4.614641995172969,
"grad_norm": 2.0746471881866455,
"learning_rate": 1.2370667475637474e-06,
"loss": 0.0349,
"num_input_tokens_seen": 4987616,
"step": 717
},
{
"epoch": 4.621078037007241,
"grad_norm": 1.9974377155303955,
"learning_rate": 1.1985401731409793e-06,
"loss": 0.0082,
"num_input_tokens_seen": 4994656,
"step": 718
},
{
"epoch": 4.627514078841513,
"grad_norm": 0.9225305914878845,
"learning_rate": 1.160433012552508e-06,
"loss": 0.0204,
"num_input_tokens_seen": 5001776,
"step": 719
},
{
"epoch": 4.6339501206757845,
"grad_norm": 0.6030845642089844,
"learning_rate": 1.122757546369744e-06,
"loss": 0.0074,
"num_input_tokens_seen": 5008688,
"step": 720
},
{
"epoch": 4.640386162510056,
"grad_norm": 1.1969950199127197,
"learning_rate": 1.085525916044464e-06,
"loss": 0.0154,
"num_input_tokens_seen": 5015680,
"step": 721
},
{
"epoch": 4.646822204344328,
"grad_norm": 1.7312675714492798,
"learning_rate": 1.048750119996066e-06,
"loss": 0.0101,
"num_input_tokens_seen": 5022336,
"step": 722
},
{
"epoch": 4.6532582461786,
"grad_norm": 0.9403418898582458,
"learning_rate": 1.0124420097449077e-06,
"loss": 0.0107,
"num_input_tokens_seen": 5029184,
"step": 723
},
{
"epoch": 4.659694288012872,
"grad_norm": 2.2545931339263916,
"learning_rate": 9.7661328609298e-07,
"loss": 0.0279,
"num_input_tokens_seen": 5036000,
"step": 724
},
{
"epoch": 4.666130329847144,
"grad_norm": 0.5637010931968689,
"learning_rate": 9.412754953531664e-07,
"loss": 0.0044,
"num_input_tokens_seen": 5042944,
"step": 725
},
{
"epoch": 4.672566371681416,
"grad_norm": 0.24136967957019806,
"learning_rate": 9.064400256282757e-07,
"loss": 0.0021,
"num_input_tokens_seen": 5049840,
"step": 726
},
{
"epoch": 4.679002413515688,
"grad_norm": 1.0340116024017334,
"learning_rate": 8.721181031410661e-07,
"loss": 0.0086,
"num_input_tokens_seen": 5057296,
"step": 727
},
{
"epoch": 4.68543845534996,
"grad_norm": 0.548861026763916,
"learning_rate": 8.383207886164366e-07,
"loss": 0.005,
"num_input_tokens_seen": 5064560,
"step": 728
},
{
"epoch": 4.691874497184232,
"grad_norm": 1.089135766029358,
"learning_rate": 8.050589737169485e-07,
"loss": 0.0096,
"num_input_tokens_seen": 5071472,
"step": 729
},
{
"epoch": 4.698310539018504,
"grad_norm": 0.3106631636619568,
"learning_rate": 7.723433775328385e-07,
"loss": 0.0029,
"num_input_tokens_seen": 5078512,
"step": 730
},
{
"epoch": 4.704746580852776,
"grad_norm": 1.3499066829681396,
"learning_rate": 7.401845431276378e-07,
"loss": 0.0082,
"num_input_tokens_seen": 5085248,
"step": 731
},
{
"epoch": 4.711182622687048,
"grad_norm": 0.30332618951797485,
"learning_rate": 7.085928341405193e-07,
"loss": 0.0033,
"num_input_tokens_seen": 5092160,
"step": 732
},
{
"epoch": 4.71761866452132,
"grad_norm": 0.7549375295639038,
"learning_rate": 6.775784314464717e-07,
"loss": 0.0253,
"num_input_tokens_seen": 5099360,
"step": 733
},
{
"epoch": 4.7240547063555915,
"grad_norm": 1.567395567893982,
"learning_rate": 6.471513298753634e-07,
"loss": 0.0117,
"num_input_tokens_seen": 5106160,
"step": 734
},
{
"epoch": 4.7304907481898635,
"grad_norm": 1.192610502243042,
"learning_rate": 6.17321334990973e-07,
"loss": 0.0052,
"num_input_tokens_seen": 5113264,
"step": 735
},
{
"epoch": 4.736926790024135,
"grad_norm": 3.9402077198028564,
"learning_rate": 5.880980599310041e-07,
"loss": 0.0305,
"num_input_tokens_seen": 5120032,
"step": 736
},
{
"epoch": 4.743362831858407,
"grad_norm": 0.3623356223106384,
"learning_rate": 5.59490922309118e-07,
"loss": 0.0018,
"num_input_tokens_seen": 5127280,
"step": 737
},
{
"epoch": 4.749798873692679,
"grad_norm": 0.815592885017395,
"learning_rate": 5.3150914117997e-07,
"loss": 0.0066,
"num_input_tokens_seen": 5134400,
"step": 738
},
{
"epoch": 4.756234915526951,
"grad_norm": 0.4423564076423645,
"learning_rate": 5.041617340682467e-07,
"loss": 0.0032,
"num_input_tokens_seen": 5141488,
"step": 739
},
{
"epoch": 4.762670957361223,
"grad_norm": 0.5768114924430847,
"learning_rate": 4.774575140626317e-07,
"loss": 0.0089,
"num_input_tokens_seen": 5148432,
"step": 740
},
{
"epoch": 4.769106999195495,
"grad_norm": 1.2286343574523926,
"learning_rate": 4.514050869756703e-07,
"loss": 0.0124,
"num_input_tokens_seen": 5155328,
"step": 741
},
{
"epoch": 4.775543041029767,
"grad_norm": 0.552872359752655,
"learning_rate": 4.2601284857042263e-07,
"loss": 0.0022,
"num_input_tokens_seen": 5163008,
"step": 742
},
{
"epoch": 4.781979082864039,
"grad_norm": 0.6165493726730347,
"learning_rate": 4.012889818548069e-07,
"loss": 0.0063,
"num_input_tokens_seen": 5170096,
"step": 743
},
{
"epoch": 4.788415124698311,
"grad_norm": 1.1403653621673584,
"learning_rate": 3.772414544445163e-07,
"loss": 0.0149,
"num_input_tokens_seen": 5177536,
"step": 744
},
{
"epoch": 4.794851166532583,
"grad_norm": 0.1795167326927185,
"learning_rate": 3.538780159953348e-07,
"loss": 0.0012,
"num_input_tokens_seen": 5184608,
"step": 745
},
{
"epoch": 4.801287208366855,
"grad_norm": 0.9326004981994629,
"learning_rate": 3.312061957057061e-07,
"loss": 0.0127,
"num_input_tokens_seen": 5191344,
"step": 746
},
{
"epoch": 4.807723250201127,
"grad_norm": 0.41363996267318726,
"learning_rate": 3.092332998903416e-07,
"loss": 0.0018,
"num_input_tokens_seen": 5198416,
"step": 747
},
{
"epoch": 4.814159292035399,
"grad_norm": 0.538027286529541,
"learning_rate": 2.8796640962565374e-07,
"loss": 0.0034,
"num_input_tokens_seen": 5205392,
"step": 748
},
{
"epoch": 4.8205953338696705,
"grad_norm": 1.531555414199829,
"learning_rate": 2.674123784677868e-07,
"loss": 0.0137,
"num_input_tokens_seen": 5213216,
"step": 749
},
{
"epoch": 4.8270313757039425,
"grad_norm": 1.671035647392273,
"learning_rate": 2.4757783024395244e-07,
"loss": 0.0219,
"num_input_tokens_seen": 5220032,
"step": 750
},
{
"epoch": 4.833467417538214,
"grad_norm": 0.30722492933273315,
"learning_rate": 2.284691569178138e-07,
"loss": 0.0014,
"num_input_tokens_seen": 5226816,
"step": 751
},
{
"epoch": 4.839903459372486,
"grad_norm": 1.3107943534851074,
"learning_rate": 2.100925165295839e-07,
"loss": 0.019,
"num_input_tokens_seen": 5233920,
"step": 752
},
{
"epoch": 4.846339501206758,
"grad_norm": 2.1163885593414307,
"learning_rate": 1.9245383121150678e-07,
"loss": 0.0075,
"num_input_tokens_seen": 5241344,
"step": 753
},
{
"epoch": 4.85277554304103,
"grad_norm": 1.2636387348175049,
"learning_rate": 1.7555878527937164e-07,
"loss": 0.0078,
"num_input_tokens_seen": 5248256,
"step": 754
},
{
"epoch": 4.859211584875302,
"grad_norm": 4.166254997253418,
"learning_rate": 1.59412823400657e-07,
"loss": 0.0244,
"num_input_tokens_seen": 5255248,
"step": 755
},
{
"epoch": 4.865647626709574,
"grad_norm": 1.078273892402649,
"learning_rate": 1.4402114883991318e-07,
"loss": 0.0218,
"num_input_tokens_seen": 5262048,
"step": 756
},
{
"epoch": 4.872083668543846,
"grad_norm": 2.091312885284424,
"learning_rate": 1.2938872178193395e-07,
"loss": 0.0044,
"num_input_tokens_seen": 5268848,
"step": 757
},
{
"epoch": 4.878519710378118,
"grad_norm": 1.7236751317977905,
"learning_rate": 1.1552025773327008e-07,
"loss": 0.0122,
"num_input_tokens_seen": 5275664,
"step": 758
},
{
"epoch": 4.88495575221239,
"grad_norm": 0.9874201416969299,
"learning_rate": 1.0242022600258611e-07,
"loss": 0.007,
"num_input_tokens_seen": 5282112,
"step": 759
},
{
"epoch": 4.891391794046662,
"grad_norm": 0.6303602457046509,
"learning_rate": 9.00928482603669e-08,
"loss": 0.0019,
"num_input_tokens_seen": 5288912,
"step": 760
},
{
"epoch": 4.897827835880933,
"grad_norm": 0.7971038818359375,
"learning_rate": 7.854209717842231e-08,
"loss": 0.0147,
"num_input_tokens_seen": 5295920,
"step": 761
},
{
"epoch": 4.904263877715205,
"grad_norm": 1.0757670402526855,
"learning_rate": 6.777169514963766e-08,
"loss": 0.0087,
"num_input_tokens_seen": 5302816,
"step": 762
},
{
"epoch": 4.910699919549477,
"grad_norm": 1.8044992685317993,
"learning_rate": 5.778511308838108e-08,
"loss": 0.0085,
"num_input_tokens_seen": 5309680,
"step": 763
},
{
"epoch": 4.917135961383749,
"grad_norm": 0.3801545202732086,
"learning_rate": 4.8585569311949966e-08,
"loss": 0.0026,
"num_input_tokens_seen": 5316848,
"step": 764
},
{
"epoch": 4.923572003218021,
"grad_norm": 0.20918627083301544,
"learning_rate": 4.017602850342584e-08,
"loss": 0.0018,
"num_input_tokens_seen": 5323760,
"step": 765
},
{
"epoch": 4.9300080450522925,
"grad_norm": 2.037950277328491,
"learning_rate": 3.2559200756260845e-08,
"loss": 0.0072,
"num_input_tokens_seen": 5330336,
"step": 766
},
{
"epoch": 4.936444086886564,
"grad_norm": 0.8903030753135681,
"learning_rate": 2.5737540700912777e-08,
"loss": 0.0079,
"num_input_tokens_seen": 5336816,
"step": 767
},
{
"epoch": 4.942880128720836,
"grad_norm": 1.0508862733840942,
"learning_rate": 1.9713246713805588e-08,
"loss": 0.0275,
"num_input_tokens_seen": 5344064,
"step": 768
},
{
"epoch": 4.949316170555108,
"grad_norm": 1.0068142414093018,
"learning_rate": 1.4488260208871397e-08,
"loss": 0.0036,
"num_input_tokens_seen": 5351328,
"step": 769
},
{
"epoch": 4.95575221238938,
"grad_norm": 1.5033273696899414,
"learning_rate": 1.006426501190233e-08,
"loss": 0.0501,
"num_input_tokens_seen": 5358672,
"step": 770
},
{
"epoch": 4.962188254223652,
"grad_norm": 0.667352557182312,
"learning_rate": 6.442686817914878e-09,
"loss": 0.0082,
"num_input_tokens_seen": 5365648,
"step": 771
},
{
"epoch": 4.968624296057924,
"grad_norm": 0.9037322998046875,
"learning_rate": 3.6246927316976875e-09,
"loss": 0.0032,
"num_input_tokens_seen": 5372432,
"step": 772
},
{
"epoch": 4.975060337892196,
"grad_norm": 0.3071233630180359,
"learning_rate": 1.6111908916965902e-09,
"loss": 0.0017,
"num_input_tokens_seen": 5379648,
"step": 773
},
{
"epoch": 4.981496379726468,
"grad_norm": 0.7171315550804138,
"learning_rate": 4.0283017735454066e-10,
"loss": 0.0042,
"num_input_tokens_seen": 5386864,
"step": 774
},
{
"epoch": 4.98793242156074,
"grad_norm": 2.855295181274414,
"learning_rate": 0.0,
"loss": 0.0176,
"num_input_tokens_seen": 5393616,
"step": 775
},
{
"epoch": 4.98793242156074,
"num_input_tokens_seen": 5393616,
"step": 775,
"total_flos": 2.1382484588285133e+17,
"train_loss": 0.5414434323177463,
"train_runtime": 8640.8816,
"train_samples_per_second": 11.503,
"train_steps_per_second": 0.09
}
],
"logging_steps": 1,
"max_steps": 775,
"num_input_tokens_seen": 5393616,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1382484588285133e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}