{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.98793242156074, "eval_steps": 500, "global_step": 775, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006436041834271922, "grad_norm": 181.50096130371094, "learning_rate": 8.333333333333335e-09, "loss": 8.4196, "num_input_tokens_seen": 6848, "step": 1 }, { "epoch": 0.012872083668543845, "grad_norm": 187.05642700195312, "learning_rate": 1.666666666666667e-08, "loss": 8.44, "num_input_tokens_seen": 14000, "step": 2 }, { "epoch": 0.019308125502815767, "grad_norm": 182.92320251464844, "learning_rate": 2.5000000000000002e-08, "loss": 8.3839, "num_input_tokens_seen": 21152, "step": 3 }, { "epoch": 0.02574416733708769, "grad_norm": 186.71311950683594, "learning_rate": 3.333333333333334e-08, "loss": 8.4024, "num_input_tokens_seen": 28224, "step": 4 }, { "epoch": 0.032180209171359615, "grad_norm": 180.32656860351562, "learning_rate": 4.166666666666667e-08, "loss": 8.4594, "num_input_tokens_seen": 35360, "step": 5 }, { "epoch": 0.038616251005631534, "grad_norm": 189.87557983398438, "learning_rate": 5.0000000000000004e-08, "loss": 8.4107, "num_input_tokens_seen": 42192, "step": 6 }, { "epoch": 0.04505229283990346, "grad_norm": 185.89984130859375, "learning_rate": 5.833333333333334e-08, "loss": 8.4551, "num_input_tokens_seen": 49088, "step": 7 }, { "epoch": 0.05148833467417538, "grad_norm": 188.8160400390625, "learning_rate": 6.666666666666668e-08, "loss": 8.4415, "num_input_tokens_seen": 55856, "step": 8 }, { "epoch": 0.057924376508447305, "grad_norm": 190.1417236328125, "learning_rate": 7.500000000000001e-08, "loss": 8.4965, "num_input_tokens_seen": 63120, "step": 9 }, { "epoch": 0.06436041834271923, "grad_norm": 185.3598175048828, "learning_rate": 8.333333333333334e-08, "loss": 8.4251, "num_input_tokens_seen": 69968, "step": 10 }, { "epoch": 0.07079646017699115, "grad_norm": 183.81944274902344, "learning_rate": 9.166666666666668e-08, "loss": 8.4291, "num_input_tokens_seen": 77168, "step": 11 }, { "epoch": 0.07723250201126307, "grad_norm": 196.39779663085938, "learning_rate": 1.0000000000000001e-07, "loss": 8.4463, "num_input_tokens_seen": 84272, "step": 12 }, { "epoch": 0.083668543845535, "grad_norm": 181.4925994873047, "learning_rate": 1.0833333333333335e-07, "loss": 8.5116, "num_input_tokens_seen": 91232, "step": 13 }, { "epoch": 0.09010458567980692, "grad_norm": 190.0314178466797, "learning_rate": 1.1666666666666668e-07, "loss": 8.4749, "num_input_tokens_seen": 97968, "step": 14 }, { "epoch": 0.09654062751407884, "grad_norm": 188.7615203857422, "learning_rate": 1.2500000000000002e-07, "loss": 8.3311, "num_input_tokens_seen": 104864, "step": 15 }, { "epoch": 0.10297666934835076, "grad_norm": 184.1820526123047, "learning_rate": 1.3333333333333336e-07, "loss": 8.3729, "num_input_tokens_seen": 111488, "step": 16 }, { "epoch": 0.10941271118262269, "grad_norm": 181.39308166503906, "learning_rate": 1.4166666666666668e-07, "loss": 8.4261, "num_input_tokens_seen": 118384, "step": 17 }, { "epoch": 0.11584875301689461, "grad_norm": 181.79583740234375, "learning_rate": 1.5000000000000002e-07, "loss": 8.3051, "num_input_tokens_seen": 125360, "step": 18 }, { "epoch": 0.12228479485116653, "grad_norm": 181.36965942382812, "learning_rate": 1.5833333333333336e-07, "loss": 8.2461, "num_input_tokens_seen": 132320, "step": 19 }, { "epoch": 0.12872083668543846, "grad_norm": 182.36839294433594, "learning_rate": 1.6666666666666668e-07, "loss": 8.2894, "num_input_tokens_seen": 139376, "step": 20 }, { "epoch": 0.13515687851971037, "grad_norm": 189.7889404296875, "learning_rate": 1.7500000000000002e-07, "loss": 8.2484, "num_input_tokens_seen": 146544, "step": 21 }, { "epoch": 0.1415929203539823, "grad_norm": 190.1185302734375, "learning_rate": 1.8333333333333336e-07, "loss": 8.3034, "num_input_tokens_seen": 153472, "step": 22 }, { "epoch": 0.14802896218825423, "grad_norm": 183.1331024169922, "learning_rate": 1.9166666666666668e-07, "loss": 8.054, "num_input_tokens_seen": 159856, "step": 23 }, { "epoch": 0.15446500402252614, "grad_norm": 168.13046264648438, "learning_rate": 2.0000000000000002e-07, "loss": 7.9583, "num_input_tokens_seen": 166528, "step": 24 }, { "epoch": 0.16090104585679807, "grad_norm": 167.57830810546875, "learning_rate": 2.0833333333333333e-07, "loss": 7.9626, "num_input_tokens_seen": 173056, "step": 25 }, { "epoch": 0.16733708769107, "grad_norm": 170.6557159423828, "learning_rate": 2.166666666666667e-07, "loss": 7.8761, "num_input_tokens_seen": 179616, "step": 26 }, { "epoch": 0.1737731295253419, "grad_norm": 179.7693328857422, "learning_rate": 2.2500000000000002e-07, "loss": 7.8896, "num_input_tokens_seen": 186912, "step": 27 }, { "epoch": 0.18020917135961384, "grad_norm": 180.4197998046875, "learning_rate": 2.3333333333333336e-07, "loss": 7.8352, "num_input_tokens_seen": 193936, "step": 28 }, { "epoch": 0.18664521319388577, "grad_norm": 164.2944793701172, "learning_rate": 2.416666666666667e-07, "loss": 7.691, "num_input_tokens_seen": 200672, "step": 29 }, { "epoch": 0.19308125502815768, "grad_norm": 167.71722412109375, "learning_rate": 2.5000000000000004e-07, "loss": 7.7851, "num_input_tokens_seen": 207536, "step": 30 }, { "epoch": 0.1995172968624296, "grad_norm": 169.2217254638672, "learning_rate": 2.5833333333333333e-07, "loss": 7.7249, "num_input_tokens_seen": 214640, "step": 31 }, { "epoch": 0.20595333869670152, "grad_norm": 155.74537658691406, "learning_rate": 2.666666666666667e-07, "loss": 6.8838, "num_input_tokens_seen": 221744, "step": 32 }, { "epoch": 0.21238938053097345, "grad_norm": 148.12120056152344, "learning_rate": 2.75e-07, "loss": 6.7173, "num_input_tokens_seen": 228624, "step": 33 }, { "epoch": 0.21882542236524538, "grad_norm": 150.97012329101562, "learning_rate": 2.8333333333333336e-07, "loss": 6.6793, "num_input_tokens_seen": 235456, "step": 34 }, { "epoch": 0.2252614641995173, "grad_norm": 149.623291015625, "learning_rate": 2.916666666666667e-07, "loss": 6.725, "num_input_tokens_seen": 242768, "step": 35 }, { "epoch": 0.23169750603378922, "grad_norm": 147.1656036376953, "learning_rate": 3.0000000000000004e-07, "loss": 6.6905, "num_input_tokens_seen": 249552, "step": 36 }, { "epoch": 0.23813354786806115, "grad_norm": 151.0162811279297, "learning_rate": 3.083333333333334e-07, "loss": 6.6179, "num_input_tokens_seen": 256160, "step": 37 }, { "epoch": 0.24456958970233306, "grad_norm": 150.03030395507812, "learning_rate": 3.166666666666667e-07, "loss": 6.501, "num_input_tokens_seen": 262912, "step": 38 }, { "epoch": 0.251005631536605, "grad_norm": 145.5784149169922, "learning_rate": 3.25e-07, "loss": 6.4588, "num_input_tokens_seen": 269600, "step": 39 }, { "epoch": 0.2574416733708769, "grad_norm": 143.5873565673828, "learning_rate": 3.3333333333333335e-07, "loss": 6.3614, "num_input_tokens_seen": 276560, "step": 40 }, { "epoch": 0.26387771520514886, "grad_norm": 144.9624481201172, "learning_rate": 3.416666666666667e-07, "loss": 6.2775, "num_input_tokens_seen": 283696, "step": 41 }, { "epoch": 0.27031375703942073, "grad_norm": 146.71554565429688, "learning_rate": 3.5000000000000004e-07, "loss": 5.9868, "num_input_tokens_seen": 290832, "step": 42 }, { "epoch": 0.27674979887369267, "grad_norm": 138.25450134277344, "learning_rate": 3.583333333333334e-07, "loss": 5.2286, "num_input_tokens_seen": 298096, "step": 43 }, { "epoch": 0.2831858407079646, "grad_norm": 156.28713989257812, "learning_rate": 3.666666666666667e-07, "loss": 4.5076, "num_input_tokens_seen": 305120, "step": 44 }, { "epoch": 0.28962188254223653, "grad_norm": 178.4820556640625, "learning_rate": 3.75e-07, "loss": 4.1167, "num_input_tokens_seen": 312000, "step": 45 }, { "epoch": 0.29605792437650846, "grad_norm": 317.7680358886719, "learning_rate": 3.8333333333333335e-07, "loss": 3.6585, "num_input_tokens_seen": 319008, "step": 46 }, { "epoch": 0.3024939662107804, "grad_norm": 282.17803955078125, "learning_rate": 3.9166666666666675e-07, "loss": 3.3613, "num_input_tokens_seen": 326192, "step": 47 }, { "epoch": 0.3089300080450523, "grad_norm": 257.7794494628906, "learning_rate": 4.0000000000000003e-07, "loss": 3.1068, "num_input_tokens_seen": 333664, "step": 48 }, { "epoch": 0.3153660498793242, "grad_norm": 255.1024169921875, "learning_rate": 4.083333333333334e-07, "loss": 2.9368, "num_input_tokens_seen": 340912, "step": 49 }, { "epoch": 0.32180209171359614, "grad_norm": 259.47015380859375, "learning_rate": 4.1666666666666667e-07, "loss": 2.3466, "num_input_tokens_seen": 347712, "step": 50 }, { "epoch": 0.32823813354786807, "grad_norm": 263.3533935546875, "learning_rate": 4.2500000000000006e-07, "loss": 2.0645, "num_input_tokens_seen": 355232, "step": 51 }, { "epoch": 0.33467417538214, "grad_norm": 239.1399688720703, "learning_rate": 4.333333333333334e-07, "loss": 1.7729, "num_input_tokens_seen": 361968, "step": 52 }, { "epoch": 0.3411102172164119, "grad_norm": 257.4410095214844, "learning_rate": 4.416666666666667e-07, "loss": 1.6199, "num_input_tokens_seen": 369136, "step": 53 }, { "epoch": 0.3475462590506838, "grad_norm": 169.56935119628906, "learning_rate": 4.5000000000000003e-07, "loss": 1.1593, "num_input_tokens_seen": 375904, "step": 54 }, { "epoch": 0.35398230088495575, "grad_norm": 95.25677490234375, "learning_rate": 4.583333333333333e-07, "loss": 0.7199, "num_input_tokens_seen": 382848, "step": 55 }, { "epoch": 0.3604183427192277, "grad_norm": 48.7137451171875, "learning_rate": 4.666666666666667e-07, "loss": 0.4394, "num_input_tokens_seen": 389680, "step": 56 }, { "epoch": 0.3668543845534996, "grad_norm": 62.34474563598633, "learning_rate": 4.7500000000000006e-07, "loss": 0.3806, "num_input_tokens_seen": 396192, "step": 57 }, { "epoch": 0.37329042638777155, "grad_norm": 30.711780548095703, "learning_rate": 4.833333333333334e-07, "loss": 0.3185, "num_input_tokens_seen": 403104, "step": 58 }, { "epoch": 0.3797264682220434, "grad_norm": 34.46913528442383, "learning_rate": 4.916666666666667e-07, "loss": 0.3056, "num_input_tokens_seen": 410176, "step": 59 }, { "epoch": 0.38616251005631536, "grad_norm": 25.92363166809082, "learning_rate": 5.000000000000001e-07, "loss": 0.2981, "num_input_tokens_seen": 416928, "step": 60 }, { "epoch": 0.3925985518905873, "grad_norm": 11.064619064331055, "learning_rate": 5.083333333333334e-07, "loss": 0.2473, "num_input_tokens_seen": 424128, "step": 61 }, { "epoch": 0.3990345937248592, "grad_norm": 55.367347717285156, "learning_rate": 5.166666666666667e-07, "loss": 0.2924, "num_input_tokens_seen": 430864, "step": 62 }, { "epoch": 0.40547063555913115, "grad_norm": 42.00873947143555, "learning_rate": 5.250000000000001e-07, "loss": 0.2656, "num_input_tokens_seen": 437744, "step": 63 }, { "epoch": 0.41190667739340303, "grad_norm": 13.313591003417969, "learning_rate": 5.333333333333335e-07, "loss": 0.2335, "num_input_tokens_seen": 444624, "step": 64 }, { "epoch": 0.41834271922767496, "grad_norm": 60.489715576171875, "learning_rate": 5.416666666666667e-07, "loss": 0.2647, "num_input_tokens_seen": 451696, "step": 65 }, { "epoch": 0.4247787610619469, "grad_norm": 77.01821899414062, "learning_rate": 5.5e-07, "loss": 0.3003, "num_input_tokens_seen": 458784, "step": 66 }, { "epoch": 0.43121480289621883, "grad_norm": 58.067596435546875, "learning_rate": 5.583333333333333e-07, "loss": 0.2656, "num_input_tokens_seen": 465920, "step": 67 }, { "epoch": 0.43765084473049076, "grad_norm": 12.40570068359375, "learning_rate": 5.666666666666667e-07, "loss": 0.2212, "num_input_tokens_seen": 473152, "step": 68 }, { "epoch": 0.4440868865647627, "grad_norm": 35.392276763916016, "learning_rate": 5.750000000000001e-07, "loss": 0.2532, "num_input_tokens_seen": 480544, "step": 69 }, { "epoch": 0.4505229283990346, "grad_norm": 51.42181396484375, "learning_rate": 5.833333333333334e-07, "loss": 0.2799, "num_input_tokens_seen": 487552, "step": 70 }, { "epoch": 0.4569589702333065, "grad_norm": 45.73934555053711, "learning_rate": 5.916666666666667e-07, "loss": 0.2876, "num_input_tokens_seen": 494256, "step": 71 }, { "epoch": 0.46339501206757844, "grad_norm": 20.654096603393555, "learning_rate": 6.000000000000001e-07, "loss": 0.2191, "num_input_tokens_seen": 500768, "step": 72 }, { "epoch": 0.46983105390185037, "grad_norm": 21.078027725219727, "learning_rate": 6.083333333333334e-07, "loss": 0.2344, "num_input_tokens_seen": 507136, "step": 73 }, { "epoch": 0.4762670957361223, "grad_norm": 36.7335205078125, "learning_rate": 6.166666666666668e-07, "loss": 0.2547, "num_input_tokens_seen": 514208, "step": 74 }, { "epoch": 0.4827031375703942, "grad_norm": 34.47271728515625, "learning_rate": 6.25e-07, "loss": 0.2349, "num_input_tokens_seen": 521120, "step": 75 }, { "epoch": 0.4891391794046661, "grad_norm": 5.103244781494141, "learning_rate": 6.333333333333334e-07, "loss": 0.2045, "num_input_tokens_seen": 527824, "step": 76 }, { "epoch": 0.49557522123893805, "grad_norm": 22.47526741027832, "learning_rate": 6.416666666666667e-07, "loss": 0.2262, "num_input_tokens_seen": 534832, "step": 77 }, { "epoch": 0.50201126307321, "grad_norm": 30.610803604125977, "learning_rate": 6.5e-07, "loss": 0.2393, "num_input_tokens_seen": 541696, "step": 78 }, { "epoch": 0.5084473049074819, "grad_norm": 10.922965049743652, "learning_rate": 6.583333333333333e-07, "loss": 0.2206, "num_input_tokens_seen": 548608, "step": 79 }, { "epoch": 0.5148833467417538, "grad_norm": 17.484182357788086, "learning_rate": 6.666666666666667e-07, "loss": 0.2029, "num_input_tokens_seen": 555456, "step": 80 }, { "epoch": 0.5213193885760258, "grad_norm": 16.49226188659668, "learning_rate": 6.750000000000001e-07, "loss": 0.2125, "num_input_tokens_seen": 562768, "step": 81 }, { "epoch": 0.5277554304102977, "grad_norm": 9.977084159851074, "learning_rate": 6.833333333333334e-07, "loss": 0.2023, "num_input_tokens_seen": 569536, "step": 82 }, { "epoch": 0.5341914722445696, "grad_norm": 17.79197120666504, "learning_rate": 6.916666666666668e-07, "loss": 0.2262, "num_input_tokens_seen": 576096, "step": 83 }, { "epoch": 0.5406275140788415, "grad_norm": 16.699260711669922, "learning_rate": 7.000000000000001e-07, "loss": 0.2003, "num_input_tokens_seen": 583472, "step": 84 }, { "epoch": 0.5470635559131134, "grad_norm": 25.02164077758789, "learning_rate": 7.083333333333334e-07, "loss": 0.2351, "num_input_tokens_seen": 590304, "step": 85 }, { "epoch": 0.5534995977473853, "grad_norm": 3.8612709045410156, "learning_rate": 7.166666666666668e-07, "loss": 0.1839, "num_input_tokens_seen": 597152, "step": 86 }, { "epoch": 0.5599356395816573, "grad_norm": 31.555482864379883, "learning_rate": 7.25e-07, "loss": 0.2315, "num_input_tokens_seen": 604208, "step": 87 }, { "epoch": 0.5663716814159292, "grad_norm": 54.94756317138672, "learning_rate": 7.333333333333334e-07, "loss": 0.2732, "num_input_tokens_seen": 610896, "step": 88 }, { "epoch": 0.5728077232502011, "grad_norm": 30.55241584777832, "learning_rate": 7.416666666666668e-07, "loss": 0.2405, "num_input_tokens_seen": 618112, "step": 89 }, { "epoch": 0.5792437650844731, "grad_norm": 16.687997817993164, "learning_rate": 7.5e-07, "loss": 0.2005, "num_input_tokens_seen": 625040, "step": 90 }, { "epoch": 0.585679806918745, "grad_norm": 10.350790977478027, "learning_rate": 7.583333333333334e-07, "loss": 0.2005, "num_input_tokens_seen": 631840, "step": 91 }, { "epoch": 0.5921158487530169, "grad_norm": 25.88368797302246, "learning_rate": 7.666666666666667e-07, "loss": 0.2115, "num_input_tokens_seen": 638752, "step": 92 }, { "epoch": 0.5985518905872889, "grad_norm": 17.11625099182129, "learning_rate": 7.750000000000001e-07, "loss": 0.2141, "num_input_tokens_seen": 645968, "step": 93 }, { "epoch": 0.6049879324215608, "grad_norm": 12.70864200592041, "learning_rate": 7.833333333333335e-07, "loss": 0.1898, "num_input_tokens_seen": 652752, "step": 94 }, { "epoch": 0.6114239742558326, "grad_norm": 3.674001455307007, "learning_rate": 7.916666666666667e-07, "loss": 0.2099, "num_input_tokens_seen": 660048, "step": 95 }, { "epoch": 0.6178600160901045, "grad_norm": 20.51032066345215, "learning_rate": 8.000000000000001e-07, "loss": 0.2014, "num_input_tokens_seen": 666752, "step": 96 }, { "epoch": 0.6242960579243765, "grad_norm": 47.562381744384766, "learning_rate": 8.083333333333334e-07, "loss": 0.2349, "num_input_tokens_seen": 673856, "step": 97 }, { "epoch": 0.6307320997586484, "grad_norm": 35.69169998168945, "learning_rate": 8.166666666666668e-07, "loss": 0.2205, "num_input_tokens_seen": 681104, "step": 98 }, { "epoch": 0.6371681415929203, "grad_norm": 10.080629348754883, "learning_rate": 8.250000000000001e-07, "loss": 0.199, "num_input_tokens_seen": 688128, "step": 99 }, { "epoch": 0.6436041834271923, "grad_norm": 26.242666244506836, "learning_rate": 8.333333333333333e-07, "loss": 0.236, "num_input_tokens_seen": 695216, "step": 100 }, { "epoch": 0.6500402252614642, "grad_norm": 22.0434627532959, "learning_rate": 8.416666666666667e-07, "loss": 0.2265, "num_input_tokens_seen": 701968, "step": 101 }, { "epoch": 0.6564762670957361, "grad_norm": 27.378408432006836, "learning_rate": 8.500000000000001e-07, "loss": 0.2443, "num_input_tokens_seen": 708928, "step": 102 }, { "epoch": 0.6629123089300081, "grad_norm": 11.929069519042969, "learning_rate": 8.583333333333334e-07, "loss": 0.2086, "num_input_tokens_seen": 715952, "step": 103 }, { "epoch": 0.66934835076428, "grad_norm": 6.677243232727051, "learning_rate": 8.666666666666668e-07, "loss": 0.1915, "num_input_tokens_seen": 722928, "step": 104 }, { "epoch": 0.6757843925985519, "grad_norm": 17.033658981323242, "learning_rate": 8.75e-07, "loss": 0.1967, "num_input_tokens_seen": 730160, "step": 105 }, { "epoch": 0.6822204344328238, "grad_norm": 6.806990146636963, "learning_rate": 8.833333333333334e-07, "loss": 0.188, "num_input_tokens_seen": 737088, "step": 106 }, { "epoch": 0.6886564762670957, "grad_norm": 4.871335506439209, "learning_rate": 8.916666666666668e-07, "loss": 0.1895, "num_input_tokens_seen": 743744, "step": 107 }, { "epoch": 0.6950925181013676, "grad_norm": 9.054122924804688, "learning_rate": 9.000000000000001e-07, "loss": 0.1667, "num_input_tokens_seen": 750496, "step": 108 }, { "epoch": 0.7015285599356396, "grad_norm": 15.78903579711914, "learning_rate": 9.083333333333335e-07, "loss": 0.1976, "num_input_tokens_seen": 757792, "step": 109 }, { "epoch": 0.7079646017699115, "grad_norm": 10.51429271697998, "learning_rate": 9.166666666666666e-07, "loss": 0.2057, "num_input_tokens_seen": 764992, "step": 110 }, { "epoch": 0.7144006436041834, "grad_norm": 24.346830368041992, "learning_rate": 9.25e-07, "loss": 0.2002, "num_input_tokens_seen": 771648, "step": 111 }, { "epoch": 0.7208366854384554, "grad_norm": 46.50392532348633, "learning_rate": 9.333333333333334e-07, "loss": 0.2173, "num_input_tokens_seen": 778480, "step": 112 }, { "epoch": 0.7272727272727273, "grad_norm": 22.505762100219727, "learning_rate": 9.416666666666667e-07, "loss": 0.1756, "num_input_tokens_seen": 785328, "step": 113 }, { "epoch": 0.7337087691069992, "grad_norm": 5.675211429595947, "learning_rate": 9.500000000000001e-07, "loss": 0.1786, "num_input_tokens_seen": 792592, "step": 114 }, { "epoch": 0.7401448109412712, "grad_norm": 14.814651489257812, "learning_rate": 9.583333333333334e-07, "loss": 0.1879, "num_input_tokens_seen": 799808, "step": 115 }, { "epoch": 0.7465808527755431, "grad_norm": 13.106173515319824, "learning_rate": 9.666666666666668e-07, "loss": 0.173, "num_input_tokens_seen": 806896, "step": 116 }, { "epoch": 0.7530168946098149, "grad_norm": 24.56918716430664, "learning_rate": 9.750000000000002e-07, "loss": 0.1714, "num_input_tokens_seen": 813536, "step": 117 }, { "epoch": 0.7594529364440868, "grad_norm": 27.256954193115234, "learning_rate": 9.833333333333334e-07, "loss": 0.2015, "num_input_tokens_seen": 820608, "step": 118 }, { "epoch": 0.7658889782783588, "grad_norm": 4.209413051605225, "learning_rate": 9.916666666666668e-07, "loss": 0.1847, "num_input_tokens_seen": 827776, "step": 119 }, { "epoch": 0.7723250201126307, "grad_norm": 18.684349060058594, "learning_rate": 1.0000000000000002e-06, "loss": 0.1876, "num_input_tokens_seen": 834704, "step": 120 }, { "epoch": 0.7787610619469026, "grad_norm": 19.470041275024414, "learning_rate": 1.0083333333333333e-06, "loss": 0.1937, "num_input_tokens_seen": 841568, "step": 121 }, { "epoch": 0.7851971037811746, "grad_norm": 11.242873191833496, "learning_rate": 1.0166666666666667e-06, "loss": 0.1974, "num_input_tokens_seen": 848704, "step": 122 }, { "epoch": 0.7916331456154465, "grad_norm": 26.72730255126953, "learning_rate": 1.025e-06, "loss": 0.2099, "num_input_tokens_seen": 855664, "step": 123 }, { "epoch": 0.7980691874497184, "grad_norm": 41.4288215637207, "learning_rate": 1.0333333333333333e-06, "loss": 0.2239, "num_input_tokens_seen": 862464, "step": 124 }, { "epoch": 0.8045052292839904, "grad_norm": 27.283327102661133, "learning_rate": 1.0416666666666667e-06, "loss": 0.1953, "num_input_tokens_seen": 869376, "step": 125 }, { "epoch": 0.8109412711182623, "grad_norm": 4.882501602172852, "learning_rate": 1.0500000000000001e-06, "loss": 0.1906, "num_input_tokens_seen": 876848, "step": 126 }, { "epoch": 0.8173773129525342, "grad_norm": 8.478296279907227, "learning_rate": 1.0583333333333335e-06, "loss": 0.1852, "num_input_tokens_seen": 883664, "step": 127 }, { "epoch": 0.8238133547868061, "grad_norm": 6.773479461669922, "learning_rate": 1.066666666666667e-06, "loss": 0.198, "num_input_tokens_seen": 890592, "step": 128 }, { "epoch": 0.830249396621078, "grad_norm": 21.877212524414062, "learning_rate": 1.075e-06, "loss": 0.2105, "num_input_tokens_seen": 898048, "step": 129 }, { "epoch": 0.8366854384553499, "grad_norm": 12.123941421508789, "learning_rate": 1.0833333333333335e-06, "loss": 0.1899, "num_input_tokens_seen": 905040, "step": 130 }, { "epoch": 0.8431214802896219, "grad_norm": 15.84151554107666, "learning_rate": 1.0916666666666667e-06, "loss": 0.1742, "num_input_tokens_seen": 912080, "step": 131 }, { "epoch": 0.8495575221238938, "grad_norm": 8.174356460571289, "learning_rate": 1.1e-06, "loss": 0.1585, "num_input_tokens_seen": 919424, "step": 132 }, { "epoch": 0.8559935639581657, "grad_norm": 14.87348461151123, "learning_rate": 1.1083333333333335e-06, "loss": 0.1878, "num_input_tokens_seen": 926608, "step": 133 }, { "epoch": 0.8624296057924377, "grad_norm": 11.989315032958984, "learning_rate": 1.1166666666666666e-06, "loss": 0.1748, "num_input_tokens_seen": 933712, "step": 134 }, { "epoch": 0.8688656476267096, "grad_norm": 9.659666061401367, "learning_rate": 1.125e-06, "loss": 0.1944, "num_input_tokens_seen": 940304, "step": 135 }, { "epoch": 0.8753016894609815, "grad_norm": 20.558237075805664, "learning_rate": 1.1333333333333334e-06, "loss": 0.1727, "num_input_tokens_seen": 947008, "step": 136 }, { "epoch": 0.8817377312952535, "grad_norm": 8.66232967376709, "learning_rate": 1.1416666666666668e-06, "loss": 0.1748, "num_input_tokens_seen": 954112, "step": 137 }, { "epoch": 0.8881737731295254, "grad_norm": 16.516559600830078, "learning_rate": 1.1500000000000002e-06, "loss": 0.1625, "num_input_tokens_seen": 961120, "step": 138 }, { "epoch": 0.8946098149637972, "grad_norm": 6.140871047973633, "learning_rate": 1.1583333333333334e-06, "loss": 0.1649, "num_input_tokens_seen": 967792, "step": 139 }, { "epoch": 0.9010458567980691, "grad_norm": 11.593804359436035, "learning_rate": 1.1666666666666668e-06, "loss": 0.1738, "num_input_tokens_seen": 974496, "step": 140 }, { "epoch": 0.9074818986323411, "grad_norm": 26.92620849609375, "learning_rate": 1.175e-06, "loss": 0.2221, "num_input_tokens_seen": 981344, "step": 141 }, { "epoch": 0.913917940466613, "grad_norm": 26.845230102539062, "learning_rate": 1.1833333333333334e-06, "loss": 0.1989, "num_input_tokens_seen": 988224, "step": 142 }, { "epoch": 0.9203539823008849, "grad_norm": 12.823030471801758, "learning_rate": 1.1916666666666668e-06, "loss": 0.1569, "num_input_tokens_seen": 995552, "step": 143 }, { "epoch": 0.9267900241351569, "grad_norm": 14.508877754211426, "learning_rate": 1.2000000000000002e-06, "loss": 0.1594, "num_input_tokens_seen": 1002224, "step": 144 }, { "epoch": 0.9332260659694288, "grad_norm": 13.097854614257812, "learning_rate": 1.2083333333333333e-06, "loss": 0.1609, "num_input_tokens_seen": 1009312, "step": 145 }, { "epoch": 0.9396621078037007, "grad_norm": 12.183431625366211, "learning_rate": 1.2166666666666667e-06, "loss": 0.1649, "num_input_tokens_seen": 1016256, "step": 146 }, { "epoch": 0.9460981496379727, "grad_norm": 10.628469467163086, "learning_rate": 1.2250000000000001e-06, "loss": 0.1412, "num_input_tokens_seen": 1022880, "step": 147 }, { "epoch": 0.9525341914722446, "grad_norm": 11.713327407836914, "learning_rate": 1.2333333333333335e-06, "loss": 0.165, "num_input_tokens_seen": 1029856, "step": 148 }, { "epoch": 0.9589702333065165, "grad_norm": 10.031126976013184, "learning_rate": 1.2416666666666667e-06, "loss": 0.1971, "num_input_tokens_seen": 1036928, "step": 149 }, { "epoch": 0.9654062751407884, "grad_norm": 34.122074127197266, "learning_rate": 1.25e-06, "loss": 0.1843, "num_input_tokens_seen": 1044000, "step": 150 }, { "epoch": 0.9718423169750603, "grad_norm": 13.707520484924316, "learning_rate": 1.2583333333333333e-06, "loss": 0.1628, "num_input_tokens_seen": 1050928, "step": 151 }, { "epoch": 0.9782783588093322, "grad_norm": 8.588343620300293, "learning_rate": 1.2666666666666669e-06, "loss": 0.1878, "num_input_tokens_seen": 1057920, "step": 152 }, { "epoch": 0.9847144006436042, "grad_norm": 4.411599159240723, "learning_rate": 1.275e-06, "loss": 0.1153, "num_input_tokens_seen": 1064704, "step": 153 }, { "epoch": 0.9911504424778761, "grad_norm": 13.095698356628418, "learning_rate": 1.2833333333333335e-06, "loss": 0.1622, "num_input_tokens_seen": 1071760, "step": 154 }, { "epoch": 0.997586484312148, "grad_norm": 14.093315124511719, "learning_rate": 1.2916666666666669e-06, "loss": 0.1549, "num_input_tokens_seen": 1078912, "step": 155 }, { "epoch": 1.00402252614642, "grad_norm": 17.082075119018555, "learning_rate": 1.3e-06, "loss": 0.1729, "num_input_tokens_seen": 1086288, "step": 156 }, { "epoch": 1.010458567980692, "grad_norm": 4.992012977600098, "learning_rate": 1.3083333333333334e-06, "loss": 0.1198, "num_input_tokens_seen": 1093584, "step": 157 }, { "epoch": 1.0168946098149638, "grad_norm": 5.45336389541626, "learning_rate": 1.3166666666666666e-06, "loss": 0.1723, "num_input_tokens_seen": 1100432, "step": 158 }, { "epoch": 1.0233306516492358, "grad_norm": 7.4880757331848145, "learning_rate": 1.3250000000000002e-06, "loss": 0.1485, "num_input_tokens_seen": 1107280, "step": 159 }, { "epoch": 1.0297666934835077, "grad_norm": 40.28890609741211, "learning_rate": 1.3333333333333334e-06, "loss": 0.1757, "num_input_tokens_seen": 1113968, "step": 160 }, { "epoch": 1.0362027353177796, "grad_norm": 39.24993896484375, "learning_rate": 1.3416666666666666e-06, "loss": 0.1907, "num_input_tokens_seen": 1120752, "step": 161 }, { "epoch": 1.0426387771520516, "grad_norm": 5.63855504989624, "learning_rate": 1.3500000000000002e-06, "loss": 0.1842, "num_input_tokens_seen": 1127712, "step": 162 }, { "epoch": 1.0490748189863235, "grad_norm": 5.1802754402160645, "learning_rate": 1.3583333333333334e-06, "loss": 0.1549, "num_input_tokens_seen": 1134592, "step": 163 }, { "epoch": 1.0555108608205954, "grad_norm": 4.200067043304443, "learning_rate": 1.3666666666666668e-06, "loss": 0.153, "num_input_tokens_seen": 1141888, "step": 164 }, { "epoch": 1.0619469026548674, "grad_norm": 6.892277240753174, "learning_rate": 1.3750000000000002e-06, "loss": 0.1532, "num_input_tokens_seen": 1148688, "step": 165 }, { "epoch": 1.068382944489139, "grad_norm": 11.852892875671387, "learning_rate": 1.3833333333333336e-06, "loss": 0.1629, "num_input_tokens_seen": 1155552, "step": 166 }, { "epoch": 1.074818986323411, "grad_norm": 8.346076011657715, "learning_rate": 1.3916666666666668e-06, "loss": 0.1708, "num_input_tokens_seen": 1162624, "step": 167 }, { "epoch": 1.081255028157683, "grad_norm": 7.836976528167725, "learning_rate": 1.4000000000000001e-06, "loss": 0.1461, "num_input_tokens_seen": 1169904, "step": 168 }, { "epoch": 1.0876910699919549, "grad_norm": 15.59913158416748, "learning_rate": 1.4083333333333335e-06, "loss": 0.1402, "num_input_tokens_seen": 1176928, "step": 169 }, { "epoch": 1.0941271118262268, "grad_norm": 8.46536636352539, "learning_rate": 1.4166666666666667e-06, "loss": 0.143, "num_input_tokens_seen": 1184160, "step": 170 }, { "epoch": 1.1005631536604987, "grad_norm": 7.491546154022217, "learning_rate": 1.425e-06, "loss": 0.1454, "num_input_tokens_seen": 1191120, "step": 171 }, { "epoch": 1.1069991954947707, "grad_norm": 16.70829200744629, "learning_rate": 1.4333333333333335e-06, "loss": 0.1286, "num_input_tokens_seen": 1197920, "step": 172 }, { "epoch": 1.1134352373290426, "grad_norm": 16.273927688598633, "learning_rate": 1.4416666666666667e-06, "loss": 0.1523, "num_input_tokens_seen": 1204576, "step": 173 }, { "epoch": 1.1198712791633145, "grad_norm": 8.122928619384766, "learning_rate": 1.45e-06, "loss": 0.1345, "num_input_tokens_seen": 1211344, "step": 174 }, { "epoch": 1.1263073209975865, "grad_norm": 27.850522994995117, "learning_rate": 1.4583333333333335e-06, "loss": 0.1749, "num_input_tokens_seen": 1218432, "step": 175 }, { "epoch": 1.1327433628318584, "grad_norm": 30.498666763305664, "learning_rate": 1.4666666666666669e-06, "loss": 0.166, "num_input_tokens_seen": 1225728, "step": 176 }, { "epoch": 1.1391794046661303, "grad_norm": 26.916791915893555, "learning_rate": 1.475e-06, "loss": 0.1708, "num_input_tokens_seen": 1232784, "step": 177 }, { "epoch": 1.1456154465004023, "grad_norm": 13.593954086303711, "learning_rate": 1.4833333333333337e-06, "loss": 0.1363, "num_input_tokens_seen": 1239472, "step": 178 }, { "epoch": 1.1520514883346742, "grad_norm": 17.63590431213379, "learning_rate": 1.4916666666666669e-06, "loss": 0.1369, "num_input_tokens_seen": 1246864, "step": 179 }, { "epoch": 1.1584875301689461, "grad_norm": 12.465302467346191, "learning_rate": 1.5e-06, "loss": 0.1632, "num_input_tokens_seen": 1253936, "step": 180 }, { "epoch": 1.164923572003218, "grad_norm": 18.099266052246094, "learning_rate": 1.5083333333333336e-06, "loss": 0.1734, "num_input_tokens_seen": 1261120, "step": 181 }, { "epoch": 1.17135961383749, "grad_norm": 12.134090423583984, "learning_rate": 1.5166666666666668e-06, "loss": 0.135, "num_input_tokens_seen": 1268208, "step": 182 }, { "epoch": 1.177795655671762, "grad_norm": 5.747508525848389, "learning_rate": 1.525e-06, "loss": 0.1355, "num_input_tokens_seen": 1275296, "step": 183 }, { "epoch": 1.1842316975060339, "grad_norm": 16.193449020385742, "learning_rate": 1.5333333333333334e-06, "loss": 0.1324, "num_input_tokens_seen": 1282320, "step": 184 }, { "epoch": 1.1906677393403058, "grad_norm": 23.576427459716797, "learning_rate": 1.5416666666666668e-06, "loss": 0.1754, "num_input_tokens_seen": 1289008, "step": 185 }, { "epoch": 1.1971037811745777, "grad_norm": 4.542221546173096, "learning_rate": 1.5500000000000002e-06, "loss": 0.1484, "num_input_tokens_seen": 1296208, "step": 186 }, { "epoch": 1.2035398230088497, "grad_norm": 6.084584712982178, "learning_rate": 1.5583333333333334e-06, "loss": 0.1315, "num_input_tokens_seen": 1303072, "step": 187 }, { "epoch": 1.2099758648431216, "grad_norm": 18.8467960357666, "learning_rate": 1.566666666666667e-06, "loss": 0.1665, "num_input_tokens_seen": 1310320, "step": 188 }, { "epoch": 1.2164119066773935, "grad_norm": 6.79512882232666, "learning_rate": 1.5750000000000002e-06, "loss": 0.1406, "num_input_tokens_seen": 1317728, "step": 189 }, { "epoch": 1.2228479485116655, "grad_norm": 11.130036354064941, "learning_rate": 1.5833333333333333e-06, "loss": 0.1391, "num_input_tokens_seen": 1325216, "step": 190 }, { "epoch": 1.2292839903459372, "grad_norm": 17.00998306274414, "learning_rate": 1.591666666666667e-06, "loss": 0.1339, "num_input_tokens_seen": 1332272, "step": 191 }, { "epoch": 1.235720032180209, "grad_norm": 16.623762130737305, "learning_rate": 1.6000000000000001e-06, "loss": 0.1613, "num_input_tokens_seen": 1339008, "step": 192 }, { "epoch": 1.242156074014481, "grad_norm": 15.660219192504883, "learning_rate": 1.6083333333333333e-06, "loss": 0.1274, "num_input_tokens_seen": 1345664, "step": 193 }, { "epoch": 1.248592115848753, "grad_norm": 21.379770278930664, "learning_rate": 1.6166666666666667e-06, "loss": 0.1882, "num_input_tokens_seen": 1352720, "step": 194 }, { "epoch": 1.255028157683025, "grad_norm": 8.196439743041992, "learning_rate": 1.6250000000000001e-06, "loss": 0.1106, "num_input_tokens_seen": 1359616, "step": 195 }, { "epoch": 1.2614641995172968, "grad_norm": 4.444194793701172, "learning_rate": 1.6333333333333335e-06, "loss": 0.1249, "num_input_tokens_seen": 1366656, "step": 196 }, { "epoch": 1.2679002413515688, "grad_norm": 10.585016250610352, "learning_rate": 1.6416666666666667e-06, "loss": 0.1499, "num_input_tokens_seen": 1373904, "step": 197 }, { "epoch": 1.2743362831858407, "grad_norm": 18.406293869018555, "learning_rate": 1.6500000000000003e-06, "loss": 0.1512, "num_input_tokens_seen": 1380528, "step": 198 }, { "epoch": 1.2807723250201126, "grad_norm": 5.323694229125977, "learning_rate": 1.6583333333333335e-06, "loss": 0.1166, "num_input_tokens_seen": 1386912, "step": 199 }, { "epoch": 1.2872083668543846, "grad_norm": 20.726289749145508, "learning_rate": 1.6666666666666667e-06, "loss": 0.21, "num_input_tokens_seen": 1393648, "step": 200 }, { "epoch": 1.2936444086886565, "grad_norm": 24.05786895751953, "learning_rate": 1.6750000000000003e-06, "loss": 0.1915, "num_input_tokens_seen": 1400640, "step": 201 }, { "epoch": 1.3000804505229284, "grad_norm": 19.30237579345703, "learning_rate": 1.6833333333333335e-06, "loss": 0.1911, "num_input_tokens_seen": 1407984, "step": 202 }, { "epoch": 1.3065164923572004, "grad_norm": 6.517977714538574, "learning_rate": 1.6916666666666666e-06, "loss": 0.1487, "num_input_tokens_seen": 1414672, "step": 203 }, { "epoch": 1.3129525341914723, "grad_norm": 30.81540870666504, "learning_rate": 1.7000000000000002e-06, "loss": 0.2154, "num_input_tokens_seen": 1421872, "step": 204 }, { "epoch": 1.3193885760257442, "grad_norm": 44.00107955932617, "learning_rate": 1.7083333333333334e-06, "loss": 0.2909, "num_input_tokens_seen": 1428640, "step": 205 }, { "epoch": 1.3258246178600162, "grad_norm": 41.464210510253906, "learning_rate": 1.7166666666666668e-06, "loss": 0.271, "num_input_tokens_seen": 1435456, "step": 206 }, { "epoch": 1.332260659694288, "grad_norm": 12.14904499053955, "learning_rate": 1.725e-06, "loss": 0.1616, "num_input_tokens_seen": 1442592, "step": 207 }, { "epoch": 1.33869670152856, "grad_norm": 8.393083572387695, "learning_rate": 1.7333333333333336e-06, "loss": 0.1427, "num_input_tokens_seen": 1449200, "step": 208 }, { "epoch": 1.3451327433628317, "grad_norm": 11.04562759399414, "learning_rate": 1.7416666666666668e-06, "loss": 0.1602, "num_input_tokens_seen": 1455920, "step": 209 }, { "epoch": 1.3515687851971037, "grad_norm": 12.494465827941895, "learning_rate": 1.75e-06, "loss": 0.169, "num_input_tokens_seen": 1462624, "step": 210 }, { "epoch": 1.3580048270313756, "grad_norm": 5.395782470703125, "learning_rate": 1.7583333333333336e-06, "loss": 0.1285, "num_input_tokens_seen": 1469520, "step": 211 }, { "epoch": 1.3644408688656475, "grad_norm": 19.773469924926758, "learning_rate": 1.7666666666666668e-06, "loss": 0.1636, "num_input_tokens_seen": 1476592, "step": 212 }, { "epoch": 1.3708769106999195, "grad_norm": 28.318584442138672, "learning_rate": 1.7750000000000002e-06, "loss": 0.1702, "num_input_tokens_seen": 1483632, "step": 213 }, { "epoch": 1.3773129525341914, "grad_norm": 20.225502014160156, "learning_rate": 1.7833333333333336e-06, "loss": 0.1562, "num_input_tokens_seen": 1490528, "step": 214 }, { "epoch": 1.3837489943684633, "grad_norm": 5.386298179626465, "learning_rate": 1.7916666666666667e-06, "loss": 0.1537, "num_input_tokens_seen": 1497648, "step": 215 }, { "epoch": 1.3901850362027353, "grad_norm": 6.181918144226074, "learning_rate": 1.8000000000000001e-06, "loss": 0.1114, "num_input_tokens_seen": 1504800, "step": 216 }, { "epoch": 1.3966210780370072, "grad_norm": 5.554294109344482, "learning_rate": 1.8083333333333335e-06, "loss": 0.1017, "num_input_tokens_seen": 1512240, "step": 217 }, { "epoch": 1.4030571198712791, "grad_norm": 5.2657880783081055, "learning_rate": 1.816666666666667e-06, "loss": 0.1184, "num_input_tokens_seen": 1519200, "step": 218 }, { "epoch": 1.409493161705551, "grad_norm": 8.627300262451172, "learning_rate": 1.825e-06, "loss": 0.1343, "num_input_tokens_seen": 1526272, "step": 219 }, { "epoch": 1.415929203539823, "grad_norm": 7.965896129608154, "learning_rate": 1.8333333333333333e-06, "loss": 0.1271, "num_input_tokens_seen": 1533440, "step": 220 }, { "epoch": 1.422365245374095, "grad_norm": 7.089397430419922, "learning_rate": 1.8416666666666669e-06, "loss": 0.1383, "num_input_tokens_seen": 1540272, "step": 221 }, { "epoch": 1.4288012872083669, "grad_norm": 4.354486465454102, "learning_rate": 1.85e-06, "loss": 0.1558, "num_input_tokens_seen": 1547632, "step": 222 }, { "epoch": 1.4352373290426388, "grad_norm": 7.841838836669922, "learning_rate": 1.8583333333333335e-06, "loss": 0.1312, "num_input_tokens_seen": 1554608, "step": 223 }, { "epoch": 1.4416733708769107, "grad_norm": 6.812905311584473, "learning_rate": 1.8666666666666669e-06, "loss": 0.1212, "num_input_tokens_seen": 1561472, "step": 224 }, { "epoch": 1.4481094127111827, "grad_norm": 5.038280963897705, "learning_rate": 1.8750000000000003e-06, "loss": 0.1342, "num_input_tokens_seen": 1568496, "step": 225 }, { "epoch": 1.4545454545454546, "grad_norm": 4.255394458770752, "learning_rate": 1.8833333333333334e-06, "loss": 0.096, "num_input_tokens_seen": 1575184, "step": 226 }, { "epoch": 1.4609814963797265, "grad_norm": 3.311915397644043, "learning_rate": 1.8916666666666668e-06, "loss": 0.0982, "num_input_tokens_seen": 1582080, "step": 227 }, { "epoch": 1.4674175382139985, "grad_norm": 4.303693771362305, "learning_rate": 1.9000000000000002e-06, "loss": 0.1099, "num_input_tokens_seen": 1588688, "step": 228 }, { "epoch": 1.4738535800482704, "grad_norm": 14.854019165039062, "learning_rate": 1.9083333333333334e-06, "loss": 0.1265, "num_input_tokens_seen": 1595216, "step": 229 }, { "epoch": 1.4802896218825423, "grad_norm": 10.509958267211914, "learning_rate": 1.916666666666667e-06, "loss": 0.1066, "num_input_tokens_seen": 1602336, "step": 230 }, { "epoch": 1.4867256637168142, "grad_norm": 9.096975326538086, "learning_rate": 1.925e-06, "loss": 0.1593, "num_input_tokens_seen": 1609024, "step": 231 }, { "epoch": 1.4931617055510862, "grad_norm": 18.944650650024414, "learning_rate": 1.9333333333333336e-06, "loss": 0.1891, "num_input_tokens_seen": 1615712, "step": 232 }, { "epoch": 1.4995977473853581, "grad_norm": 6.735738754272461, "learning_rate": 1.9416666666666666e-06, "loss": 0.0867, "num_input_tokens_seen": 1622608, "step": 233 }, { "epoch": 1.50603378921963, "grad_norm": 12.395522117614746, "learning_rate": 1.9500000000000004e-06, "loss": 0.1286, "num_input_tokens_seen": 1629520, "step": 234 }, { "epoch": 1.512469831053902, "grad_norm": 13.864114761352539, "learning_rate": 1.9583333333333334e-06, "loss": 0.1262, "num_input_tokens_seen": 1636320, "step": 235 }, { "epoch": 1.518905872888174, "grad_norm": 4.206810474395752, "learning_rate": 1.9666666666666668e-06, "loss": 0.0878, "num_input_tokens_seen": 1643216, "step": 236 }, { "epoch": 1.5253419147224458, "grad_norm": 9.294787406921387, "learning_rate": 1.975e-06, "loss": 0.1532, "num_input_tokens_seen": 1650256, "step": 237 }, { "epoch": 1.5317779565567178, "grad_norm": 5.397519111633301, "learning_rate": 1.9833333333333335e-06, "loss": 0.1232, "num_input_tokens_seen": 1657328, "step": 238 }, { "epoch": 1.5382139983909895, "grad_norm": 4.74614953994751, "learning_rate": 1.991666666666667e-06, "loss": 0.1119, "num_input_tokens_seen": 1664192, "step": 239 }, { "epoch": 1.5446500402252614, "grad_norm": 8.80385971069336, "learning_rate": 2.0000000000000003e-06, "loss": 0.1334, "num_input_tokens_seen": 1670944, "step": 240 }, { "epoch": 1.5510860820595334, "grad_norm": 12.17174243927002, "learning_rate": 2.0083333333333337e-06, "loss": 0.1224, "num_input_tokens_seen": 1677792, "step": 241 }, { "epoch": 1.5575221238938053, "grad_norm": 6.9399800300598145, "learning_rate": 2.0166666666666667e-06, "loss": 0.106, "num_input_tokens_seen": 1684640, "step": 242 }, { "epoch": 1.5639581657280772, "grad_norm": 5.804976463317871, "learning_rate": 2.025e-06, "loss": 0.1237, "num_input_tokens_seen": 1691664, "step": 243 }, { "epoch": 1.5703942075623492, "grad_norm": 5.245293617248535, "learning_rate": 2.0333333333333335e-06, "loss": 0.095, "num_input_tokens_seen": 1698528, "step": 244 }, { "epoch": 1.576830249396621, "grad_norm": 2.9305763244628906, "learning_rate": 2.041666666666667e-06, "loss": 0.0741, "num_input_tokens_seen": 1705600, "step": 245 }, { "epoch": 1.583266291230893, "grad_norm": 10.269381523132324, "learning_rate": 2.05e-06, "loss": 0.1239, "num_input_tokens_seen": 1712704, "step": 246 }, { "epoch": 1.589702333065165, "grad_norm": 4.453558921813965, "learning_rate": 2.0583333333333337e-06, "loss": 0.091, "num_input_tokens_seen": 1719568, "step": 247 }, { "epoch": 1.5961383748994369, "grad_norm": 16.549911499023438, "learning_rate": 2.0666666666666666e-06, "loss": 0.1403, "num_input_tokens_seen": 1726480, "step": 248 }, { "epoch": 1.6025744167337088, "grad_norm": 17.650426864624023, "learning_rate": 2.075e-06, "loss": 0.1638, "num_input_tokens_seen": 1733936, "step": 249 }, { "epoch": 1.6090104585679805, "grad_norm": 5.322378158569336, "learning_rate": 2.0833333333333334e-06, "loss": 0.1343, "num_input_tokens_seen": 1741008, "step": 250 }, { "epoch": 1.6154465004022525, "grad_norm": 11.570721626281738, "learning_rate": 2.091666666666667e-06, "loss": 0.1558, "num_input_tokens_seen": 1748240, "step": 251 }, { "epoch": 1.6218825422365244, "grad_norm": 2.901578426361084, "learning_rate": 2.1000000000000002e-06, "loss": 0.0809, "num_input_tokens_seen": 1755072, "step": 252 }, { "epoch": 1.6283185840707963, "grad_norm": 8.972208023071289, "learning_rate": 2.1083333333333336e-06, "loss": 0.1435, "num_input_tokens_seen": 1762048, "step": 253 }, { "epoch": 1.6347546259050683, "grad_norm": 2.364783525466919, "learning_rate": 2.116666666666667e-06, "loss": 0.0887, "num_input_tokens_seen": 1769200, "step": 254 }, { "epoch": 1.6411906677393402, "grad_norm": 3.7692675590515137, "learning_rate": 2.125e-06, "loss": 0.1038, "num_input_tokens_seen": 1776112, "step": 255 }, { "epoch": 1.6476267095736121, "grad_norm": 3.0572264194488525, "learning_rate": 2.133333333333334e-06, "loss": 0.0889, "num_input_tokens_seen": 1783664, "step": 256 }, { "epoch": 1.654062751407884, "grad_norm": 3.8316140174865723, "learning_rate": 2.1416666666666668e-06, "loss": 0.0751, "num_input_tokens_seen": 1790096, "step": 257 }, { "epoch": 1.660498793242156, "grad_norm": 5.133974552154541, "learning_rate": 2.15e-06, "loss": 0.0921, "num_input_tokens_seen": 1796912, "step": 258 }, { "epoch": 1.666934835076428, "grad_norm": 5.002286911010742, "learning_rate": 2.1583333333333336e-06, "loss": 0.1102, "num_input_tokens_seen": 1804144, "step": 259 }, { "epoch": 1.6733708769106999, "grad_norm": 8.221644401550293, "learning_rate": 2.166666666666667e-06, "loss": 0.1036, "num_input_tokens_seen": 1811040, "step": 260 }, { "epoch": 1.6798069187449718, "grad_norm": 6.029963493347168, "learning_rate": 2.1750000000000004e-06, "loss": 0.1093, "num_input_tokens_seen": 1818064, "step": 261 }, { "epoch": 1.6862429605792437, "grad_norm": 6.715224742889404, "learning_rate": 2.1833333333333333e-06, "loss": 0.1714, "num_input_tokens_seen": 1825056, "step": 262 }, { "epoch": 1.6926790024135157, "grad_norm": 6.136181354522705, "learning_rate": 2.191666666666667e-06, "loss": 0.1007, "num_input_tokens_seen": 1831968, "step": 263 }, { "epoch": 1.6991150442477876, "grad_norm": 5.392821788787842, "learning_rate": 2.2e-06, "loss": 0.109, "num_input_tokens_seen": 1838656, "step": 264 }, { "epoch": 1.7055510860820595, "grad_norm": 3.0743072032928467, "learning_rate": 2.2083333333333335e-06, "loss": 0.0574, "num_input_tokens_seen": 1845760, "step": 265 }, { "epoch": 1.7119871279163315, "grad_norm": 4.986932277679443, "learning_rate": 2.216666666666667e-06, "loss": 0.0697, "num_input_tokens_seen": 1852480, "step": 266 }, { "epoch": 1.7184231697506034, "grad_norm": 3.588496685028076, "learning_rate": 2.2250000000000003e-06, "loss": 0.1188, "num_input_tokens_seen": 1859312, "step": 267 }, { "epoch": 1.7248592115848753, "grad_norm": 3.850637912750244, "learning_rate": 2.2333333333333333e-06, "loss": 0.0998, "num_input_tokens_seen": 1866256, "step": 268 }, { "epoch": 1.7312952534191473, "grad_norm": 10.427441596984863, "learning_rate": 2.2416666666666667e-06, "loss": 0.1083, "num_input_tokens_seen": 1873104, "step": 269 }, { "epoch": 1.7377312952534192, "grad_norm": 6.516834259033203, "learning_rate": 2.25e-06, "loss": 0.0749, "num_input_tokens_seen": 1880192, "step": 270 }, { "epoch": 1.7441673370876911, "grad_norm": 5.243050575256348, "learning_rate": 2.2583333333333335e-06, "loss": 0.0771, "num_input_tokens_seen": 1887008, "step": 271 }, { "epoch": 1.750603378921963, "grad_norm": 3.874545097351074, "learning_rate": 2.266666666666667e-06, "loss": 0.0646, "num_input_tokens_seen": 1894096, "step": 272 }, { "epoch": 1.757039420756235, "grad_norm": 4.2995476722717285, "learning_rate": 2.2750000000000002e-06, "loss": 0.1147, "num_input_tokens_seen": 1901216, "step": 273 }, { "epoch": 1.763475462590507, "grad_norm": 9.720036506652832, "learning_rate": 2.2833333333333336e-06, "loss": 0.0917, "num_input_tokens_seen": 1908160, "step": 274 }, { "epoch": 1.7699115044247788, "grad_norm": 7.985558986663818, "learning_rate": 2.2916666666666666e-06, "loss": 0.106, "num_input_tokens_seen": 1915104, "step": 275 }, { "epoch": 1.7763475462590508, "grad_norm": 4.0768327713012695, "learning_rate": 2.3000000000000004e-06, "loss": 0.0849, "num_input_tokens_seen": 1922128, "step": 276 }, { "epoch": 1.7827835880933227, "grad_norm": 5.870975017547607, "learning_rate": 2.3083333333333334e-06, "loss": 0.1074, "num_input_tokens_seen": 1929200, "step": 277 }, { "epoch": 1.7892196299275946, "grad_norm": 3.490455389022827, "learning_rate": 2.316666666666667e-06, "loss": 0.0981, "num_input_tokens_seen": 1936144, "step": 278 }, { "epoch": 1.7956556717618666, "grad_norm": 4.1171183586120605, "learning_rate": 2.325e-06, "loss": 0.1008, "num_input_tokens_seen": 1943136, "step": 279 }, { "epoch": 1.8020917135961385, "grad_norm": 7.664264678955078, "learning_rate": 2.3333333333333336e-06, "loss": 0.1032, "num_input_tokens_seen": 1950208, "step": 280 }, { "epoch": 1.8085277554304104, "grad_norm": 4.865798473358154, "learning_rate": 2.341666666666667e-06, "loss": 0.0711, "num_input_tokens_seen": 1957056, "step": 281 }, { "epoch": 1.8149637972646824, "grad_norm": 2.5436036586761475, "learning_rate": 2.35e-06, "loss": 0.0901, "num_input_tokens_seen": 1964176, "step": 282 }, { "epoch": 1.8213998390989543, "grad_norm": 6.305140972137451, "learning_rate": 2.3583333333333338e-06, "loss": 0.0847, "num_input_tokens_seen": 1970736, "step": 283 }, { "epoch": 1.827835880933226, "grad_norm": 2.6688449382781982, "learning_rate": 2.3666666666666667e-06, "loss": 0.0752, "num_input_tokens_seen": 1977440, "step": 284 }, { "epoch": 1.834271922767498, "grad_norm": 2.5124077796936035, "learning_rate": 2.375e-06, "loss": 0.068, "num_input_tokens_seen": 1984464, "step": 285 }, { "epoch": 1.8407079646017699, "grad_norm": 6.168980121612549, "learning_rate": 2.3833333333333335e-06, "loss": 0.1088, "num_input_tokens_seen": 1991248, "step": 286 }, { "epoch": 1.8471440064360418, "grad_norm": 5.883851051330566, "learning_rate": 2.391666666666667e-06, "loss": 0.1017, "num_input_tokens_seen": 1998496, "step": 287 }, { "epoch": 1.8535800482703138, "grad_norm": 9.373373985290527, "learning_rate": 2.4000000000000003e-06, "loss": 0.13, "num_input_tokens_seen": 2005552, "step": 288 }, { "epoch": 1.8600160901045857, "grad_norm": 9.111586570739746, "learning_rate": 2.4083333333333337e-06, "loss": 0.0998, "num_input_tokens_seen": 2012272, "step": 289 }, { "epoch": 1.8664521319388576, "grad_norm": 5.353252410888672, "learning_rate": 2.4166666666666667e-06, "loss": 0.0779, "num_input_tokens_seen": 2019056, "step": 290 }, { "epoch": 1.8728881737731295, "grad_norm": 6.586206436157227, "learning_rate": 2.425e-06, "loss": 0.0907, "num_input_tokens_seen": 2025760, "step": 291 }, { "epoch": 1.8793242156074015, "grad_norm": 5.485732555389404, "learning_rate": 2.4333333333333335e-06, "loss": 0.0911, "num_input_tokens_seen": 2032928, "step": 292 }, { "epoch": 1.8857602574416734, "grad_norm": 3.5151724815368652, "learning_rate": 2.441666666666667e-06, "loss": 0.0987, "num_input_tokens_seen": 2039856, "step": 293 }, { "epoch": 1.8921962992759453, "grad_norm": 3.680494546890259, "learning_rate": 2.4500000000000003e-06, "loss": 0.1254, "num_input_tokens_seen": 2046896, "step": 294 }, { "epoch": 1.898632341110217, "grad_norm": 3.302248001098633, "learning_rate": 2.4583333333333332e-06, "loss": 0.0494, "num_input_tokens_seen": 2053600, "step": 295 }, { "epoch": 1.905068382944489, "grad_norm": 3.605039119720459, "learning_rate": 2.466666666666667e-06, "loss": 0.1082, "num_input_tokens_seen": 2060240, "step": 296 }, { "epoch": 1.911504424778761, "grad_norm": 2.6599857807159424, "learning_rate": 2.475e-06, "loss": 0.0785, "num_input_tokens_seen": 2067936, "step": 297 }, { "epoch": 1.9179404666130329, "grad_norm": 7.149720191955566, "learning_rate": 2.4833333333333334e-06, "loss": 0.1026, "num_input_tokens_seen": 2074656, "step": 298 }, { "epoch": 1.9243765084473048, "grad_norm": 4.549108982086182, "learning_rate": 2.491666666666667e-06, "loss": 0.0617, "num_input_tokens_seen": 2081568, "step": 299 }, { "epoch": 1.9308125502815767, "grad_norm": 2.900601625442505, "learning_rate": 2.5e-06, "loss": 0.0659, "num_input_tokens_seen": 2088368, "step": 300 }, { "epoch": 1.9372485921158487, "grad_norm": 6.378200531005859, "learning_rate": 2.5083333333333336e-06, "loss": 0.088, "num_input_tokens_seen": 2095728, "step": 301 }, { "epoch": 1.9436846339501206, "grad_norm": 6.718885898590088, "learning_rate": 2.5166666666666666e-06, "loss": 0.0771, "num_input_tokens_seen": 2103104, "step": 302 }, { "epoch": 1.9501206757843925, "grad_norm": 3.587820291519165, "learning_rate": 2.5250000000000004e-06, "loss": 0.0642, "num_input_tokens_seen": 2110032, "step": 303 }, { "epoch": 1.9565567176186645, "grad_norm": 7.106460094451904, "learning_rate": 2.5333333333333338e-06, "loss": 0.0947, "num_input_tokens_seen": 2117056, "step": 304 }, { "epoch": 1.9629927594529364, "grad_norm": 3.480973243713379, "learning_rate": 2.5416666666666668e-06, "loss": 0.0975, "num_input_tokens_seen": 2123552, "step": 305 }, { "epoch": 1.9694288012872083, "grad_norm": 2.709892511367798, "learning_rate": 2.55e-06, "loss": 0.0527, "num_input_tokens_seen": 2130128, "step": 306 }, { "epoch": 1.9758648431214803, "grad_norm": 3.3756306171417236, "learning_rate": 2.558333333333334e-06, "loss": 0.0869, "num_input_tokens_seen": 2137232, "step": 307 }, { "epoch": 1.9823008849557522, "grad_norm": 6.785555839538574, "learning_rate": 2.566666666666667e-06, "loss": 0.0605, "num_input_tokens_seen": 2143776, "step": 308 }, { "epoch": 1.9887369267900241, "grad_norm": 3.4628372192382812, "learning_rate": 2.5750000000000003e-06, "loss": 0.0684, "num_input_tokens_seen": 2150976, "step": 309 }, { "epoch": 1.995172968624296, "grad_norm": 3.56925892829895, "learning_rate": 2.5833333333333337e-06, "loss": 0.0701, "num_input_tokens_seen": 2158080, "step": 310 }, { "epoch": 2.001609010458568, "grad_norm": 4.06324577331543, "learning_rate": 2.5916666666666667e-06, "loss": 0.0699, "num_input_tokens_seen": 2164992, "step": 311 }, { "epoch": 2.00804505229284, "grad_norm": 7.733395576477051, "learning_rate": 2.6e-06, "loss": 0.0949, "num_input_tokens_seen": 2171952, "step": 312 }, { "epoch": 2.014481094127112, "grad_norm": 7.6149139404296875, "learning_rate": 2.608333333333333e-06, "loss": 0.0911, "num_input_tokens_seen": 2179072, "step": 313 }, { "epoch": 2.020917135961384, "grad_norm": 2.538379192352295, "learning_rate": 2.616666666666667e-06, "loss": 0.0615, "num_input_tokens_seen": 2185872, "step": 314 }, { "epoch": 2.0273531777956557, "grad_norm": 2.5334603786468506, "learning_rate": 2.6250000000000003e-06, "loss": 0.0448, "num_input_tokens_seen": 2192656, "step": 315 }, { "epoch": 2.0337892196299276, "grad_norm": 4.8344340324401855, "learning_rate": 2.6333333333333332e-06, "loss": 0.0619, "num_input_tokens_seen": 2199728, "step": 316 }, { "epoch": 2.0402252614641996, "grad_norm": 4.393861770629883, "learning_rate": 2.6416666666666666e-06, "loss": 0.0475, "num_input_tokens_seen": 2206608, "step": 317 }, { "epoch": 2.0466613032984715, "grad_norm": 2.7922892570495605, "learning_rate": 2.6500000000000005e-06, "loss": 0.0438, "num_input_tokens_seen": 2213856, "step": 318 }, { "epoch": 2.0530973451327434, "grad_norm": 1.5408401489257812, "learning_rate": 2.6583333333333334e-06, "loss": 0.0245, "num_input_tokens_seen": 2220528, "step": 319 }, { "epoch": 2.0595333869670154, "grad_norm": 5.6088433265686035, "learning_rate": 2.666666666666667e-06, "loss": 0.0716, "num_input_tokens_seen": 2227616, "step": 320 }, { "epoch": 2.0659694288012873, "grad_norm": 9.311470985412598, "learning_rate": 2.6750000000000002e-06, "loss": 0.1015, "num_input_tokens_seen": 2234304, "step": 321 }, { "epoch": 2.0724054706355592, "grad_norm": 5.244096279144287, "learning_rate": 2.683333333333333e-06, "loss": 0.0753, "num_input_tokens_seen": 2241088, "step": 322 }, { "epoch": 2.078841512469831, "grad_norm": 3.443998098373413, "learning_rate": 2.691666666666667e-06, "loss": 0.0521, "num_input_tokens_seen": 2247632, "step": 323 }, { "epoch": 2.085277554304103, "grad_norm": 2.4997072219848633, "learning_rate": 2.7000000000000004e-06, "loss": 0.0287, "num_input_tokens_seen": 2254448, "step": 324 }, { "epoch": 2.091713596138375, "grad_norm": 4.817678928375244, "learning_rate": 2.7083333333333334e-06, "loss": 0.0471, "num_input_tokens_seen": 2261424, "step": 325 }, { "epoch": 2.098149637972647, "grad_norm": 6.326369285583496, "learning_rate": 2.7166666666666668e-06, "loss": 0.0697, "num_input_tokens_seen": 2268528, "step": 326 }, { "epoch": 2.104585679806919, "grad_norm": 3.599905490875244, "learning_rate": 2.7250000000000006e-06, "loss": 0.0438, "num_input_tokens_seen": 2275328, "step": 327 }, { "epoch": 2.111021721641191, "grad_norm": 2.8037264347076416, "learning_rate": 2.7333333333333336e-06, "loss": 0.0475, "num_input_tokens_seen": 2282400, "step": 328 }, { "epoch": 2.1174577634754628, "grad_norm": 2.7425622940063477, "learning_rate": 2.741666666666667e-06, "loss": 0.0601, "num_input_tokens_seen": 2289312, "step": 329 }, { "epoch": 2.1238938053097347, "grad_norm": 2.064824342727661, "learning_rate": 2.7500000000000004e-06, "loss": 0.0355, "num_input_tokens_seen": 2295824, "step": 330 }, { "epoch": 2.1303298471440066, "grad_norm": 3.695521593093872, "learning_rate": 2.7583333333333333e-06, "loss": 0.0515, "num_input_tokens_seen": 2303024, "step": 331 }, { "epoch": 2.136765888978278, "grad_norm": 3.3290112018585205, "learning_rate": 2.766666666666667e-06, "loss": 0.0601, "num_input_tokens_seen": 2309904, "step": 332 }, { "epoch": 2.14320193081255, "grad_norm": 2.751953363418579, "learning_rate": 2.7750000000000005e-06, "loss": 0.0288, "num_input_tokens_seen": 2316416, "step": 333 }, { "epoch": 2.149637972646822, "grad_norm": 4.679827690124512, "learning_rate": 2.7833333333333335e-06, "loss": 0.0563, "num_input_tokens_seen": 2323088, "step": 334 }, { "epoch": 2.156074014481094, "grad_norm": 9.301896095275879, "learning_rate": 2.791666666666667e-06, "loss": 0.1176, "num_input_tokens_seen": 2329968, "step": 335 }, { "epoch": 2.162510056315366, "grad_norm": 6.16165828704834, "learning_rate": 2.8000000000000003e-06, "loss": 0.0965, "num_input_tokens_seen": 2336656, "step": 336 }, { "epoch": 2.168946098149638, "grad_norm": 2.442518711090088, "learning_rate": 2.8083333333333333e-06, "loss": 0.0359, "num_input_tokens_seen": 2343984, "step": 337 }, { "epoch": 2.1753821399839097, "grad_norm": 3.537282943725586, "learning_rate": 2.816666666666667e-06, "loss": 0.0609, "num_input_tokens_seen": 2350912, "step": 338 }, { "epoch": 2.1818181818181817, "grad_norm": 5.1499223709106445, "learning_rate": 2.825e-06, "loss": 0.0768, "num_input_tokens_seen": 2357680, "step": 339 }, { "epoch": 2.1882542236524536, "grad_norm": 8.193970680236816, "learning_rate": 2.8333333333333335e-06, "loss": 0.0849, "num_input_tokens_seen": 2364736, "step": 340 }, { "epoch": 2.1946902654867255, "grad_norm": 2.2035670280456543, "learning_rate": 2.841666666666667e-06, "loss": 0.0581, "num_input_tokens_seen": 2371568, "step": 341 }, { "epoch": 2.2011263073209975, "grad_norm": 2.7924435138702393, "learning_rate": 2.85e-06, "loss": 0.046, "num_input_tokens_seen": 2378384, "step": 342 }, { "epoch": 2.2075623491552694, "grad_norm": 4.6174445152282715, "learning_rate": 2.8583333333333336e-06, "loss": 0.0674, "num_input_tokens_seen": 2385584, "step": 343 }, { "epoch": 2.2139983909895413, "grad_norm": 2.4459989070892334, "learning_rate": 2.866666666666667e-06, "loss": 0.0563, "num_input_tokens_seen": 2392640, "step": 344 }, { "epoch": 2.2204344328238133, "grad_norm": 2.3443846702575684, "learning_rate": 2.875e-06, "loss": 0.0621, "num_input_tokens_seen": 2399936, "step": 345 }, { "epoch": 2.226870474658085, "grad_norm": 2.865879774093628, "learning_rate": 2.8833333333333334e-06, "loss": 0.0659, "num_input_tokens_seen": 2406928, "step": 346 }, { "epoch": 2.233306516492357, "grad_norm": 4.03169059753418, "learning_rate": 2.8916666666666672e-06, "loss": 0.039, "num_input_tokens_seen": 2413888, "step": 347 }, { "epoch": 2.239742558326629, "grad_norm": 1.693605899810791, "learning_rate": 2.9e-06, "loss": 0.0239, "num_input_tokens_seen": 2421104, "step": 348 }, { "epoch": 2.246178600160901, "grad_norm": 2.7058444023132324, "learning_rate": 2.9083333333333336e-06, "loss": 0.0521, "num_input_tokens_seen": 2428128, "step": 349 }, { "epoch": 2.252614641995173, "grad_norm": 3.9503567218780518, "learning_rate": 2.916666666666667e-06, "loss": 0.0561, "num_input_tokens_seen": 2434880, "step": 350 }, { "epoch": 2.259050683829445, "grad_norm": 4.444098472595215, "learning_rate": 2.925e-06, "loss": 0.0622, "num_input_tokens_seen": 2441824, "step": 351 }, { "epoch": 2.265486725663717, "grad_norm": 3.7014055252075195, "learning_rate": 2.9333333333333338e-06, "loss": 0.0875, "num_input_tokens_seen": 2448688, "step": 352 }, { "epoch": 2.2719227674979887, "grad_norm": 4.078037261962891, "learning_rate": 2.941666666666667e-06, "loss": 0.0307, "num_input_tokens_seen": 2455488, "step": 353 }, { "epoch": 2.2783588093322606, "grad_norm": 3.753711700439453, "learning_rate": 2.95e-06, "loss": 0.063, "num_input_tokens_seen": 2462240, "step": 354 }, { "epoch": 2.2847948511665326, "grad_norm": 2.9653706550598145, "learning_rate": 2.9583333333333335e-06, "loss": 0.0404, "num_input_tokens_seen": 2469408, "step": 355 }, { "epoch": 2.2912308930008045, "grad_norm": 3.8090925216674805, "learning_rate": 2.9666666666666673e-06, "loss": 0.0759, "num_input_tokens_seen": 2476240, "step": 356 }, { "epoch": 2.2976669348350764, "grad_norm": 2.4684033393859863, "learning_rate": 2.9750000000000003e-06, "loss": 0.0488, "num_input_tokens_seen": 2482864, "step": 357 }, { "epoch": 2.3041029766693484, "grad_norm": 2.0687243938446045, "learning_rate": 2.9833333333333337e-06, "loss": 0.0499, "num_input_tokens_seen": 2489664, "step": 358 }, { "epoch": 2.3105390185036203, "grad_norm": 3.223965883255005, "learning_rate": 2.991666666666667e-06, "loss": 0.0441, "num_input_tokens_seen": 2496704, "step": 359 }, { "epoch": 2.3169750603378922, "grad_norm": 2.1407270431518555, "learning_rate": 3e-06, "loss": 0.0485, "num_input_tokens_seen": 2503920, "step": 360 }, { "epoch": 2.323411102172164, "grad_norm": 2.632885217666626, "learning_rate": 3.0083333333333335e-06, "loss": 0.0674, "num_input_tokens_seen": 2510544, "step": 361 }, { "epoch": 2.329847144006436, "grad_norm": 3.258030652999878, "learning_rate": 3.0166666666666673e-06, "loss": 0.0689, "num_input_tokens_seen": 2517408, "step": 362 }, { "epoch": 2.336283185840708, "grad_norm": 6.024159908294678, "learning_rate": 3.0250000000000003e-06, "loss": 0.0618, "num_input_tokens_seen": 2524160, "step": 363 }, { "epoch": 2.34271922767498, "grad_norm": 4.7281999588012695, "learning_rate": 3.0333333333333337e-06, "loss": 0.0629, "num_input_tokens_seen": 2531072, "step": 364 }, { "epoch": 2.349155269509252, "grad_norm": 4.178661823272705, "learning_rate": 3.0416666666666666e-06, "loss": 0.0499, "num_input_tokens_seen": 2537920, "step": 365 }, { "epoch": 2.355591311343524, "grad_norm": 1.5715197324752808, "learning_rate": 3.05e-06, "loss": 0.0361, "num_input_tokens_seen": 2544736, "step": 366 }, { "epoch": 2.3620273531777958, "grad_norm": 2.835855722427368, "learning_rate": 3.058333333333334e-06, "loss": 0.0471, "num_input_tokens_seen": 2552016, "step": 367 }, { "epoch": 2.3684633950120677, "grad_norm": 2.870889902114868, "learning_rate": 3.066666666666667e-06, "loss": 0.0622, "num_input_tokens_seen": 2559616, "step": 368 }, { "epoch": 2.3748994368463396, "grad_norm": 1.7411049604415894, "learning_rate": 3.075e-06, "loss": 0.0328, "num_input_tokens_seen": 2566240, "step": 369 }, { "epoch": 2.3813354786806116, "grad_norm": 3.0499918460845947, "learning_rate": 3.0833333333333336e-06, "loss": 0.0437, "num_input_tokens_seen": 2573392, "step": 370 }, { "epoch": 2.3877715205148835, "grad_norm": 4.242414474487305, "learning_rate": 3.0916666666666666e-06, "loss": 0.0644, "num_input_tokens_seen": 2580544, "step": 371 }, { "epoch": 2.3942075623491554, "grad_norm": 2.962906837463379, "learning_rate": 3.1000000000000004e-06, "loss": 0.0553, "num_input_tokens_seen": 2587344, "step": 372 }, { "epoch": 2.4006436041834274, "grad_norm": 4.431301116943359, "learning_rate": 3.1083333333333338e-06, "loss": 0.061, "num_input_tokens_seen": 2594560, "step": 373 }, { "epoch": 2.4070796460176993, "grad_norm": 5.075587272644043, "learning_rate": 3.1166666666666668e-06, "loss": 0.0866, "num_input_tokens_seen": 2601408, "step": 374 }, { "epoch": 2.4135156878519712, "grad_norm": 3.877520799636841, "learning_rate": 3.125e-06, "loss": 0.0632, "num_input_tokens_seen": 2608624, "step": 375 }, { "epoch": 2.419951729686243, "grad_norm": 2.9902503490448, "learning_rate": 3.133333333333334e-06, "loss": 0.0395, "num_input_tokens_seen": 2615456, "step": 376 }, { "epoch": 2.426387771520515, "grad_norm": 3.7800397872924805, "learning_rate": 3.141666666666667e-06, "loss": 0.0819, "num_input_tokens_seen": 2622672, "step": 377 }, { "epoch": 2.432823813354787, "grad_norm": 2.4674911499023438, "learning_rate": 3.1500000000000003e-06, "loss": 0.064, "num_input_tokens_seen": 2629952, "step": 378 }, { "epoch": 2.439259855189059, "grad_norm": 5.3331146240234375, "learning_rate": 3.1583333333333337e-06, "loss": 0.0803, "num_input_tokens_seen": 2637168, "step": 379 }, { "epoch": 2.445695897023331, "grad_norm": 9.950706481933594, "learning_rate": 3.1666666666666667e-06, "loss": 0.0798, "num_input_tokens_seen": 2644144, "step": 380 }, { "epoch": 2.4521319388576024, "grad_norm": 5.1734442710876465, "learning_rate": 3.175e-06, "loss": 0.0544, "num_input_tokens_seen": 2651376, "step": 381 }, { "epoch": 2.4585679806918743, "grad_norm": 2.5671188831329346, "learning_rate": 3.183333333333334e-06, "loss": 0.0629, "num_input_tokens_seen": 2658336, "step": 382 }, { "epoch": 2.4650040225261463, "grad_norm": 4.357182025909424, "learning_rate": 3.191666666666667e-06, "loss": 0.0471, "num_input_tokens_seen": 2665360, "step": 383 }, { "epoch": 2.471440064360418, "grad_norm": 4.694338321685791, "learning_rate": 3.2000000000000003e-06, "loss": 0.0533, "num_input_tokens_seen": 2672704, "step": 384 }, { "epoch": 2.47787610619469, "grad_norm": 2.391195774078369, "learning_rate": 3.2083333333333337e-06, "loss": 0.0542, "num_input_tokens_seen": 2679872, "step": 385 }, { "epoch": 2.484312148028962, "grad_norm": 3.859102249145508, "learning_rate": 3.2166666666666666e-06, "loss": 0.034, "num_input_tokens_seen": 2686672, "step": 386 }, { "epoch": 2.490748189863234, "grad_norm": 2.4710166454315186, "learning_rate": 3.2250000000000005e-06, "loss": 0.0517, "num_input_tokens_seen": 2693520, "step": 387 }, { "epoch": 2.497184231697506, "grad_norm": 3.309068202972412, "learning_rate": 3.2333333333333334e-06, "loss": 0.0698, "num_input_tokens_seen": 2700432, "step": 388 }, { "epoch": 2.503620273531778, "grad_norm": 4.21011209487915, "learning_rate": 3.241666666666667e-06, "loss": 0.0573, "num_input_tokens_seen": 2707184, "step": 389 }, { "epoch": 2.51005631536605, "grad_norm": 4.34623908996582, "learning_rate": 3.2500000000000002e-06, "loss": 0.0568, "num_input_tokens_seen": 2713936, "step": 390 }, { "epoch": 2.5164923572003217, "grad_norm": 3.361445188522339, "learning_rate": 3.258333333333333e-06, "loss": 0.0669, "num_input_tokens_seen": 2721216, "step": 391 }, { "epoch": 2.5229283990345936, "grad_norm": 2.091728925704956, "learning_rate": 3.266666666666667e-06, "loss": 0.027, "num_input_tokens_seen": 2727968, "step": 392 }, { "epoch": 2.5293644408688656, "grad_norm": 2.1977951526641846, "learning_rate": 3.2750000000000004e-06, "loss": 0.0303, "num_input_tokens_seen": 2734816, "step": 393 }, { "epoch": 2.5358004827031375, "grad_norm": 2.7409942150115967, "learning_rate": 3.2833333333333334e-06, "loss": 0.0392, "num_input_tokens_seen": 2741744, "step": 394 }, { "epoch": 2.5422365245374094, "grad_norm": 3.695770740509033, "learning_rate": 3.2916666666666668e-06, "loss": 0.0813, "num_input_tokens_seen": 2748640, "step": 395 }, { "epoch": 2.5486725663716814, "grad_norm": 3.674891471862793, "learning_rate": 3.3000000000000006e-06, "loss": 0.0403, "num_input_tokens_seen": 2755888, "step": 396 }, { "epoch": 2.5551086082059533, "grad_norm": 1.716131567955017, "learning_rate": 3.3083333333333336e-06, "loss": 0.0222, "num_input_tokens_seen": 2762464, "step": 397 }, { "epoch": 2.5615446500402252, "grad_norm": 2.5081095695495605, "learning_rate": 3.316666666666667e-06, "loss": 0.0611, "num_input_tokens_seen": 2769712, "step": 398 }, { "epoch": 2.567980691874497, "grad_norm": 1.9974850416183472, "learning_rate": 3.3250000000000004e-06, "loss": 0.035, "num_input_tokens_seen": 2776736, "step": 399 }, { "epoch": 2.574416733708769, "grad_norm": 4.233558177947998, "learning_rate": 3.3333333333333333e-06, "loss": 0.068, "num_input_tokens_seen": 2783376, "step": 400 }, { "epoch": 2.580852775543041, "grad_norm": 3.359081983566284, "learning_rate": 3.341666666666667e-06, "loss": 0.0543, "num_input_tokens_seen": 2790528, "step": 401 }, { "epoch": 2.587288817377313, "grad_norm": 2.669712543487549, "learning_rate": 3.3500000000000005e-06, "loss": 0.0466, "num_input_tokens_seen": 2797312, "step": 402 }, { "epoch": 2.593724859211585, "grad_norm": 3.1529603004455566, "learning_rate": 3.3583333333333335e-06, "loss": 0.0626, "num_input_tokens_seen": 2804288, "step": 403 }, { "epoch": 2.600160901045857, "grad_norm": 3.069842576980591, "learning_rate": 3.366666666666667e-06, "loss": 0.0589, "num_input_tokens_seen": 2811456, "step": 404 }, { "epoch": 2.6065969428801288, "grad_norm": 1.881988525390625, "learning_rate": 3.3750000000000003e-06, "loss": 0.0415, "num_input_tokens_seen": 2818080, "step": 405 }, { "epoch": 2.6130329847144007, "grad_norm": 1.862747073173523, "learning_rate": 3.3833333333333333e-06, "loss": 0.0344, "num_input_tokens_seen": 2825136, "step": 406 }, { "epoch": 2.6194690265486726, "grad_norm": 2.6847071647644043, "learning_rate": 3.391666666666667e-06, "loss": 0.0423, "num_input_tokens_seen": 2832400, "step": 407 }, { "epoch": 2.6259050683829446, "grad_norm": 3.631681203842163, "learning_rate": 3.4000000000000005e-06, "loss": 0.0838, "num_input_tokens_seen": 2839712, "step": 408 }, { "epoch": 2.6323411102172165, "grad_norm": 3.7878201007843018, "learning_rate": 3.4083333333333335e-06, "loss": 0.0732, "num_input_tokens_seen": 2846160, "step": 409 }, { "epoch": 2.6387771520514884, "grad_norm": 2.826582431793213, "learning_rate": 3.416666666666667e-06, "loss": 0.0464, "num_input_tokens_seen": 2853520, "step": 410 }, { "epoch": 2.6452131938857604, "grad_norm": 2.330638885498047, "learning_rate": 3.4250000000000007e-06, "loss": 0.0387, "num_input_tokens_seen": 2860384, "step": 411 }, { "epoch": 2.6516492357200323, "grad_norm": 2.330439567565918, "learning_rate": 3.4333333333333336e-06, "loss": 0.0507, "num_input_tokens_seen": 2867360, "step": 412 }, { "epoch": 2.6580852775543042, "grad_norm": 3.929145336151123, "learning_rate": 3.441666666666667e-06, "loss": 0.0549, "num_input_tokens_seen": 2873648, "step": 413 }, { "epoch": 2.664521319388576, "grad_norm": 3.001359224319458, "learning_rate": 3.45e-06, "loss": 0.0285, "num_input_tokens_seen": 2880848, "step": 414 }, { "epoch": 2.670957361222848, "grad_norm": 2.7936651706695557, "learning_rate": 3.4583333333333334e-06, "loss": 0.0668, "num_input_tokens_seen": 2888256, "step": 415 }, { "epoch": 2.67739340305712, "grad_norm": 4.050117015838623, "learning_rate": 3.4666666666666672e-06, "loss": 0.0691, "num_input_tokens_seen": 2895040, "step": 416 }, { "epoch": 2.6838294448913915, "grad_norm": 5.509685516357422, "learning_rate": 3.475e-06, "loss": 0.066, "num_input_tokens_seen": 2902320, "step": 417 }, { "epoch": 2.6902654867256635, "grad_norm": 3.968433380126953, "learning_rate": 3.4833333333333336e-06, "loss": 0.0495, "num_input_tokens_seen": 2908960, "step": 418 }, { "epoch": 2.6967015285599354, "grad_norm": 2.082157611846924, "learning_rate": 3.491666666666667e-06, "loss": 0.034, "num_input_tokens_seen": 2915808, "step": 419 }, { "epoch": 2.7031375703942073, "grad_norm": 2.403968334197998, "learning_rate": 3.5e-06, "loss": 0.0604, "num_input_tokens_seen": 2922608, "step": 420 }, { "epoch": 2.7095736122284793, "grad_norm": 4.667454719543457, "learning_rate": 3.5083333333333338e-06, "loss": 0.0535, "num_input_tokens_seen": 2929728, "step": 421 }, { "epoch": 2.716009654062751, "grad_norm": 2.5968987941741943, "learning_rate": 3.516666666666667e-06, "loss": 0.0369, "num_input_tokens_seen": 2937024, "step": 422 }, { "epoch": 2.722445695897023, "grad_norm": 3.4746780395507812, "learning_rate": 3.525e-06, "loss": 0.045, "num_input_tokens_seen": 2943760, "step": 423 }, { "epoch": 2.728881737731295, "grad_norm": 1.9599398374557495, "learning_rate": 3.5333333333333335e-06, "loss": 0.0314, "num_input_tokens_seen": 2950848, "step": 424 }, { "epoch": 2.735317779565567, "grad_norm": 2.971634864807129, "learning_rate": 3.5416666666666673e-06, "loss": 0.0611, "num_input_tokens_seen": 2957408, "step": 425 }, { "epoch": 2.741753821399839, "grad_norm": 3.1944162845611572, "learning_rate": 3.5500000000000003e-06, "loss": 0.0478, "num_input_tokens_seen": 2964288, "step": 426 }, { "epoch": 2.748189863234111, "grad_norm": 3.3659610748291016, "learning_rate": 3.5583333333333337e-06, "loss": 0.038, "num_input_tokens_seen": 2970912, "step": 427 }, { "epoch": 2.754625905068383, "grad_norm": 2.965097188949585, "learning_rate": 3.566666666666667e-06, "loss": 0.043, "num_input_tokens_seen": 2978032, "step": 428 }, { "epoch": 2.7610619469026547, "grad_norm": 2.4006049633026123, "learning_rate": 3.575e-06, "loss": 0.0478, "num_input_tokens_seen": 2985232, "step": 429 }, { "epoch": 2.7674979887369267, "grad_norm": 3.7348554134368896, "learning_rate": 3.5833333333333335e-06, "loss": 0.0977, "num_input_tokens_seen": 2992240, "step": 430 }, { "epoch": 2.7739340305711986, "grad_norm": 3.1373274326324463, "learning_rate": 3.5916666666666673e-06, "loss": 0.0835, "num_input_tokens_seen": 2999008, "step": 431 }, { "epoch": 2.7803700724054705, "grad_norm": 1.9444302320480347, "learning_rate": 3.6000000000000003e-06, "loss": 0.0406, "num_input_tokens_seen": 3005648, "step": 432 }, { "epoch": 2.7868061142397424, "grad_norm": 1.8665870428085327, "learning_rate": 3.6083333333333337e-06, "loss": 0.0661, "num_input_tokens_seen": 3012224, "step": 433 }, { "epoch": 2.7932421560740144, "grad_norm": 1.9893403053283691, "learning_rate": 3.616666666666667e-06, "loss": 0.0647, "num_input_tokens_seen": 3019104, "step": 434 }, { "epoch": 2.7996781979082863, "grad_norm": 2.656529426574707, "learning_rate": 3.625e-06, "loss": 0.0499, "num_input_tokens_seen": 3026096, "step": 435 }, { "epoch": 2.8061142397425582, "grad_norm": 1.7047683000564575, "learning_rate": 3.633333333333334e-06, "loss": 0.0422, "num_input_tokens_seen": 3032784, "step": 436 }, { "epoch": 2.81255028157683, "grad_norm": 1.6727882623672485, "learning_rate": 3.6416666666666672e-06, "loss": 0.048, "num_input_tokens_seen": 3040096, "step": 437 }, { "epoch": 2.818986323411102, "grad_norm": 4.0175251960754395, "learning_rate": 3.65e-06, "loss": 0.0474, "num_input_tokens_seen": 3046720, "step": 438 }, { "epoch": 2.825422365245374, "grad_norm": 8.139860153198242, "learning_rate": 3.6583333333333336e-06, "loss": 0.0801, "num_input_tokens_seen": 3053712, "step": 439 }, { "epoch": 2.831858407079646, "grad_norm": 3.832087278366089, "learning_rate": 3.6666666666666666e-06, "loss": 0.0528, "num_input_tokens_seen": 3060528, "step": 440 }, { "epoch": 2.838294448913918, "grad_norm": 2.881619930267334, "learning_rate": 3.6750000000000004e-06, "loss": 0.0461, "num_input_tokens_seen": 3067440, "step": 441 }, { "epoch": 2.84473049074819, "grad_norm": 4.456245422363281, "learning_rate": 3.6833333333333338e-06, "loss": 0.0646, "num_input_tokens_seen": 3074208, "step": 442 }, { "epoch": 2.8511665325824618, "grad_norm": 5.1570820808410645, "learning_rate": 3.6916666666666668e-06, "loss": 0.049, "num_input_tokens_seen": 3081072, "step": 443 }, { "epoch": 2.8576025744167337, "grad_norm": 2.944526433944702, "learning_rate": 3.7e-06, "loss": 0.0531, "num_input_tokens_seen": 3088240, "step": 444 }, { "epoch": 2.8640386162510056, "grad_norm": 2.021688222885132, "learning_rate": 3.708333333333334e-06, "loss": 0.0521, "num_input_tokens_seen": 3095504, "step": 445 }, { "epoch": 2.8704746580852776, "grad_norm": 6.054248809814453, "learning_rate": 3.716666666666667e-06, "loss": 0.0927, "num_input_tokens_seen": 3102688, "step": 446 }, { "epoch": 2.8769106999195495, "grad_norm": 3.5824503898620605, "learning_rate": 3.7250000000000003e-06, "loss": 0.0491, "num_input_tokens_seen": 3109440, "step": 447 }, { "epoch": 2.8833467417538214, "grad_norm": 2.0240774154663086, "learning_rate": 3.7333333333333337e-06, "loss": 0.0399, "num_input_tokens_seen": 3116720, "step": 448 }, { "epoch": 2.8897827835880934, "grad_norm": 4.0125579833984375, "learning_rate": 3.7416666666666667e-06, "loss": 0.0499, "num_input_tokens_seen": 3123568, "step": 449 }, { "epoch": 2.8962188254223653, "grad_norm": 3.733275890350342, "learning_rate": 3.7500000000000005e-06, "loss": 0.0569, "num_input_tokens_seen": 3130768, "step": 450 }, { "epoch": 2.9026548672566372, "grad_norm": 4.261077880859375, "learning_rate": 3.758333333333334e-06, "loss": 0.0608, "num_input_tokens_seen": 3138128, "step": 451 }, { "epoch": 2.909090909090909, "grad_norm": 1.4142907857894897, "learning_rate": 3.766666666666667e-06, "loss": 0.0325, "num_input_tokens_seen": 3145008, "step": 452 }, { "epoch": 2.915526950925181, "grad_norm": 2.610344171524048, "learning_rate": 3.7750000000000003e-06, "loss": 0.0643, "num_input_tokens_seen": 3151792, "step": 453 }, { "epoch": 2.921962992759453, "grad_norm": 2.9687604904174805, "learning_rate": 3.7833333333333337e-06, "loss": 0.0479, "num_input_tokens_seen": 3158800, "step": 454 }, { "epoch": 2.928399034593725, "grad_norm": 2.2706518173217773, "learning_rate": 3.7916666666666666e-06, "loss": 0.0549, "num_input_tokens_seen": 3165744, "step": 455 }, { "epoch": 2.934835076427997, "grad_norm": 3.606792449951172, "learning_rate": 3.8000000000000005e-06, "loss": 0.0789, "num_input_tokens_seen": 3172896, "step": 456 }, { "epoch": 2.941271118262269, "grad_norm": 1.8851637840270996, "learning_rate": 3.808333333333334e-06, "loss": 0.0319, "num_input_tokens_seen": 3179888, "step": 457 }, { "epoch": 2.9477071600965408, "grad_norm": 2.6292834281921387, "learning_rate": 3.816666666666667e-06, "loss": 0.05, "num_input_tokens_seen": 3186960, "step": 458 }, { "epoch": 2.9541432019308127, "grad_norm": 2.099109172821045, "learning_rate": 3.825000000000001e-06, "loss": 0.0677, "num_input_tokens_seen": 3194208, "step": 459 }, { "epoch": 2.9605792437650846, "grad_norm": 2.5214834213256836, "learning_rate": 3.833333333333334e-06, "loss": 0.0512, "num_input_tokens_seen": 3201120, "step": 460 }, { "epoch": 2.9670152855993566, "grad_norm": 6.318456649780273, "learning_rate": 3.841666666666667e-06, "loss": 0.0681, "num_input_tokens_seen": 3208160, "step": 461 }, { "epoch": 2.9734513274336285, "grad_norm": 4.119838714599609, "learning_rate": 3.85e-06, "loss": 0.0651, "num_input_tokens_seen": 3214992, "step": 462 }, { "epoch": 2.9798873692679004, "grad_norm": 3.248420238494873, "learning_rate": 3.858333333333333e-06, "loss": 0.0498, "num_input_tokens_seen": 3222192, "step": 463 }, { "epoch": 2.9863234111021724, "grad_norm": 1.6198488473892212, "learning_rate": 3.866666666666667e-06, "loss": 0.0496, "num_input_tokens_seen": 3229504, "step": 464 }, { "epoch": 2.9927594529364443, "grad_norm": 2.6008763313293457, "learning_rate": 3.875e-06, "loss": 0.0446, "num_input_tokens_seen": 3236400, "step": 465 }, { "epoch": 2.9991954947707162, "grad_norm": 2.349928379058838, "learning_rate": 3.883333333333333e-06, "loss": 0.0543, "num_input_tokens_seen": 3243600, "step": 466 }, { "epoch": 3.0056315366049877, "grad_norm": 0.8590204119682312, "learning_rate": 3.891666666666667e-06, "loss": 0.0137, "num_input_tokens_seen": 3249808, "step": 467 }, { "epoch": 3.0120675784392597, "grad_norm": 1.2689623832702637, "learning_rate": 3.900000000000001e-06, "loss": 0.0201, "num_input_tokens_seen": 3257168, "step": 468 }, { "epoch": 3.0185036202735316, "grad_norm": 1.329512596130371, "learning_rate": 3.908333333333334e-06, "loss": 0.0119, "num_input_tokens_seen": 3264064, "step": 469 }, { "epoch": 3.0249396621078035, "grad_norm": 2.423644781112671, "learning_rate": 3.916666666666667e-06, "loss": 0.0305, "num_input_tokens_seen": 3270688, "step": 470 }, { "epoch": 3.0313757039420755, "grad_norm": 3.6647322177886963, "learning_rate": 3.9250000000000005e-06, "loss": 0.0213, "num_input_tokens_seen": 3277664, "step": 471 }, { "epoch": 3.0378117457763474, "grad_norm": 3.736281156539917, "learning_rate": 3.9333333333333335e-06, "loss": 0.035, "num_input_tokens_seen": 3284352, "step": 472 }, { "epoch": 3.0442477876106193, "grad_norm": 2.274883270263672, "learning_rate": 3.941666666666667e-06, "loss": 0.0438, "num_input_tokens_seen": 3290864, "step": 473 }, { "epoch": 3.0506838294448912, "grad_norm": 3.032172203063965, "learning_rate": 3.95e-06, "loss": 0.0464, "num_input_tokens_seen": 3297856, "step": 474 }, { "epoch": 3.057119871279163, "grad_norm": 2.258751392364502, "learning_rate": 3.958333333333333e-06, "loss": 0.0172, "num_input_tokens_seen": 3305120, "step": 475 }, { "epoch": 3.063555913113435, "grad_norm": 2.925736427307129, "learning_rate": 3.966666666666667e-06, "loss": 0.0287, "num_input_tokens_seen": 3312032, "step": 476 }, { "epoch": 3.069991954947707, "grad_norm": 3.100857734680176, "learning_rate": 3.975000000000001e-06, "loss": 0.0579, "num_input_tokens_seen": 3319424, "step": 477 }, { "epoch": 3.076427996781979, "grad_norm": 1.753515601158142, "learning_rate": 3.983333333333334e-06, "loss": 0.0095, "num_input_tokens_seen": 3326304, "step": 478 }, { "epoch": 3.082864038616251, "grad_norm": 2.3217740058898926, "learning_rate": 3.991666666666667e-06, "loss": 0.0238, "num_input_tokens_seen": 3333184, "step": 479 }, { "epoch": 3.089300080450523, "grad_norm": 2.512751579284668, "learning_rate": 4.000000000000001e-06, "loss": 0.0313, "num_input_tokens_seen": 3340384, "step": 480 }, { "epoch": 3.0957361222847948, "grad_norm": 1.2185322046279907, "learning_rate": 4.008333333333334e-06, "loss": 0.0146, "num_input_tokens_seen": 3347344, "step": 481 }, { "epoch": 3.1021721641190667, "grad_norm": 1.1303057670593262, "learning_rate": 4.0166666666666675e-06, "loss": 0.0347, "num_input_tokens_seen": 3354080, "step": 482 }, { "epoch": 3.1086082059533386, "grad_norm": 2.4247186183929443, "learning_rate": 4.0250000000000004e-06, "loss": 0.024, "num_input_tokens_seen": 3360848, "step": 483 }, { "epoch": 3.1150442477876106, "grad_norm": 1.4767001867294312, "learning_rate": 4.033333333333333e-06, "loss": 0.0128, "num_input_tokens_seen": 3367616, "step": 484 }, { "epoch": 3.1214802896218825, "grad_norm": 2.458953857421875, "learning_rate": 4.041666666666667e-06, "loss": 0.0311, "num_input_tokens_seen": 3374880, "step": 485 }, { "epoch": 3.1279163314561544, "grad_norm": 0.5494964718818665, "learning_rate": 4.05e-06, "loss": 0.0178, "num_input_tokens_seen": 3381696, "step": 486 }, { "epoch": 3.1343523732904264, "grad_norm": 1.5969914197921753, "learning_rate": 4.058333333333333e-06, "loss": 0.0379, "num_input_tokens_seen": 3388880, "step": 487 }, { "epoch": 3.1407884151246983, "grad_norm": 1.7003910541534424, "learning_rate": 4.066666666666667e-06, "loss": 0.0299, "num_input_tokens_seen": 3395984, "step": 488 }, { "epoch": 3.1472244569589702, "grad_norm": 2.297182083129883, "learning_rate": 4.075e-06, "loss": 0.0261, "num_input_tokens_seen": 3402896, "step": 489 }, { "epoch": 3.153660498793242, "grad_norm": 2.3937814235687256, "learning_rate": 4.083333333333334e-06, "loss": 0.0347, "num_input_tokens_seen": 3409888, "step": 490 }, { "epoch": 3.160096540627514, "grad_norm": 1.349425196647644, "learning_rate": 4.091666666666667e-06, "loss": 0.011, "num_input_tokens_seen": 3416928, "step": 491 }, { "epoch": 3.166532582461786, "grad_norm": 3.0355069637298584, "learning_rate": 4.1e-06, "loss": 0.0541, "num_input_tokens_seen": 3423968, "step": 492 }, { "epoch": 3.172968624296058, "grad_norm": 2.680206537246704, "learning_rate": 4.1083333333333335e-06, "loss": 0.0465, "num_input_tokens_seen": 3431120, "step": 493 }, { "epoch": 3.17940466613033, "grad_norm": 1.5906095504760742, "learning_rate": 4.116666666666667e-06, "loss": 0.0187, "num_input_tokens_seen": 3437776, "step": 494 }, { "epoch": 3.185840707964602, "grad_norm": 0.8296425938606262, "learning_rate": 4.125e-06, "loss": 0.0089, "num_input_tokens_seen": 3444480, "step": 495 }, { "epoch": 3.1922767497988738, "grad_norm": 2.857689142227173, "learning_rate": 4.133333333333333e-06, "loss": 0.0289, "num_input_tokens_seen": 3451232, "step": 496 }, { "epoch": 3.1987127916331457, "grad_norm": 1.0910203456878662, "learning_rate": 4.141666666666667e-06, "loss": 0.0103, "num_input_tokens_seen": 3457776, "step": 497 }, { "epoch": 3.2051488334674176, "grad_norm": 1.3560919761657715, "learning_rate": 4.15e-06, "loss": 0.0132, "num_input_tokens_seen": 3465056, "step": 498 }, { "epoch": 3.2115848753016896, "grad_norm": 4.861215591430664, "learning_rate": 4.158333333333334e-06, "loss": 0.0375, "num_input_tokens_seen": 3471968, "step": 499 }, { "epoch": 3.2180209171359615, "grad_norm": 1.8714208602905273, "learning_rate": 4.166666666666667e-06, "loss": 0.0143, "num_input_tokens_seen": 3479648, "step": 500 }, { "epoch": 3.2244569589702334, "grad_norm": 1.6230028867721558, "learning_rate": 4.175e-06, "loss": 0.0159, "num_input_tokens_seen": 3486272, "step": 501 }, { "epoch": 3.2308930008045054, "grad_norm": 0.7852226495742798, "learning_rate": 4.183333333333334e-06, "loss": 0.0073, "num_input_tokens_seen": 3493360, "step": 502 }, { "epoch": 3.2373290426387773, "grad_norm": 2.3990976810455322, "learning_rate": 4.1916666666666675e-06, "loss": 0.0186, "num_input_tokens_seen": 3500336, "step": 503 }, { "epoch": 3.2437650844730492, "grad_norm": 0.796851634979248, "learning_rate": 4.2000000000000004e-06, "loss": 0.0035, "num_input_tokens_seen": 3507232, "step": 504 }, { "epoch": 3.250201126307321, "grad_norm": 2.7951748371124268, "learning_rate": 4.208333333333333e-06, "loss": 0.0416, "num_input_tokens_seen": 3514144, "step": 505 }, { "epoch": 3.256637168141593, "grad_norm": 2.40897274017334, "learning_rate": 4.216666666666667e-06, "loss": 0.0266, "num_input_tokens_seen": 3520976, "step": 506 }, { "epoch": 3.263073209975865, "grad_norm": 2.3974061012268066, "learning_rate": 4.225e-06, "loss": 0.0351, "num_input_tokens_seen": 3527920, "step": 507 }, { "epoch": 3.2695092518101365, "grad_norm": 2.30100154876709, "learning_rate": 4.233333333333334e-06, "loss": 0.0209, "num_input_tokens_seen": 3534864, "step": 508 }, { "epoch": 3.2759452936444085, "grad_norm": 2.1172518730163574, "learning_rate": 4.241666666666667e-06, "loss": 0.0434, "num_input_tokens_seen": 3541872, "step": 509 }, { "epoch": 3.2823813354786804, "grad_norm": 3.7030341625213623, "learning_rate": 4.25e-06, "loss": 0.0174, "num_input_tokens_seen": 3548384, "step": 510 }, { "epoch": 3.2888173773129523, "grad_norm": 2.152125597000122, "learning_rate": 4.258333333333334e-06, "loss": 0.0529, "num_input_tokens_seen": 3555792, "step": 511 }, { "epoch": 3.2952534191472242, "grad_norm": 0.6081152558326721, "learning_rate": 4.266666666666668e-06, "loss": 0.0033, "num_input_tokens_seen": 3562608, "step": 512 }, { "epoch": 3.301689460981496, "grad_norm": 1.7042624950408936, "learning_rate": 4.2750000000000006e-06, "loss": 0.0196, "num_input_tokens_seen": 3569184, "step": 513 }, { "epoch": 3.308125502815768, "grad_norm": 1.3502767086029053, "learning_rate": 4.2833333333333335e-06, "loss": 0.0242, "num_input_tokens_seen": 3576224, "step": 514 }, { "epoch": 3.31456154465004, "grad_norm": 4.480360984802246, "learning_rate": 4.2916666666666665e-06, "loss": 0.0316, "num_input_tokens_seen": 3583328, "step": 515 }, { "epoch": 3.320997586484312, "grad_norm": 2.2217299938201904, "learning_rate": 4.3e-06, "loss": 0.0268, "num_input_tokens_seen": 3590256, "step": 516 }, { "epoch": 3.327433628318584, "grad_norm": 1.5919010639190674, "learning_rate": 4.308333333333334e-06, "loss": 0.0248, "num_input_tokens_seen": 3597328, "step": 517 }, { "epoch": 3.333869670152856, "grad_norm": 2.425961971282959, "learning_rate": 4.316666666666667e-06, "loss": 0.032, "num_input_tokens_seen": 3604576, "step": 518 }, { "epoch": 3.340305711987128, "grad_norm": 2.987424612045288, "learning_rate": 4.325e-06, "loss": 0.0202, "num_input_tokens_seen": 3611520, "step": 519 }, { "epoch": 3.3467417538213997, "grad_norm": 2.633897304534912, "learning_rate": 4.333333333333334e-06, "loss": 0.0329, "num_input_tokens_seen": 3618288, "step": 520 }, { "epoch": 3.3531777956556716, "grad_norm": 1.0696384906768799, "learning_rate": 4.341666666666667e-06, "loss": 0.019, "num_input_tokens_seen": 3625216, "step": 521 }, { "epoch": 3.3596138374899436, "grad_norm": 2.400972604751587, "learning_rate": 4.350000000000001e-06, "loss": 0.0182, "num_input_tokens_seen": 3631888, "step": 522 }, { "epoch": 3.3660498793242155, "grad_norm": 1.3744821548461914, "learning_rate": 4.358333333333334e-06, "loss": 0.0124, "num_input_tokens_seen": 3638848, "step": 523 }, { "epoch": 3.3724859211584874, "grad_norm": 1.613145112991333, "learning_rate": 4.366666666666667e-06, "loss": 0.0122, "num_input_tokens_seen": 3646112, "step": 524 }, { "epoch": 3.3789219629927594, "grad_norm": 2.450824499130249, "learning_rate": 4.3750000000000005e-06, "loss": 0.0388, "num_input_tokens_seen": 3652928, "step": 525 }, { "epoch": 3.3853580048270313, "grad_norm": 1.6122058629989624, "learning_rate": 4.383333333333334e-06, "loss": 0.0106, "num_input_tokens_seen": 3659632, "step": 526 }, { "epoch": 3.3917940466613032, "grad_norm": 1.53513765335083, "learning_rate": 4.391666666666667e-06, "loss": 0.0305, "num_input_tokens_seen": 3666480, "step": 527 }, { "epoch": 3.398230088495575, "grad_norm": 2.103663444519043, "learning_rate": 4.4e-06, "loss": 0.0512, "num_input_tokens_seen": 3673136, "step": 528 }, { "epoch": 3.404666130329847, "grad_norm": 0.41373467445373535, "learning_rate": 4.408333333333334e-06, "loss": 0.0031, "num_input_tokens_seen": 3679760, "step": 529 }, { "epoch": 3.411102172164119, "grad_norm": 2.9610488414764404, "learning_rate": 4.416666666666667e-06, "loss": 0.0309, "num_input_tokens_seen": 3686576, "step": 530 }, { "epoch": 3.417538213998391, "grad_norm": 2.415531873703003, "learning_rate": 4.425e-06, "loss": 0.0472, "num_input_tokens_seen": 3693312, "step": 531 }, { "epoch": 3.423974255832663, "grad_norm": 2.175546407699585, "learning_rate": 4.433333333333334e-06, "loss": 0.0222, "num_input_tokens_seen": 3700000, "step": 532 }, { "epoch": 3.430410297666935, "grad_norm": 1.0903018712997437, "learning_rate": 4.441666666666667e-06, "loss": 0.0077, "num_input_tokens_seen": 3706736, "step": 533 }, { "epoch": 3.4368463395012068, "grad_norm": 0.8305991888046265, "learning_rate": 4.450000000000001e-06, "loss": 0.0064, "num_input_tokens_seen": 3714192, "step": 534 }, { "epoch": 3.4432823813354787, "grad_norm": 0.9347790479660034, "learning_rate": 4.4583333333333336e-06, "loss": 0.0104, "num_input_tokens_seen": 3721408, "step": 535 }, { "epoch": 3.4497184231697506, "grad_norm": 1.7669559717178345, "learning_rate": 4.4666666666666665e-06, "loss": 0.0121, "num_input_tokens_seen": 3728144, "step": 536 }, { "epoch": 3.4561544650040226, "grad_norm": 3.121467351913452, "learning_rate": 4.475e-06, "loss": 0.0386, "num_input_tokens_seen": 3734960, "step": 537 }, { "epoch": 3.4625905068382945, "grad_norm": 2.683410882949829, "learning_rate": 4.483333333333333e-06, "loss": 0.0319, "num_input_tokens_seen": 3741728, "step": 538 }, { "epoch": 3.4690265486725664, "grad_norm": 9.728205680847168, "learning_rate": 4.491666666666667e-06, "loss": 0.0579, "num_input_tokens_seen": 3749200, "step": 539 }, { "epoch": 3.4754625905068384, "grad_norm": 4.415483474731445, "learning_rate": 4.5e-06, "loss": 0.0255, "num_input_tokens_seen": 3755856, "step": 540 }, { "epoch": 3.4818986323411103, "grad_norm": 3.651423692703247, "learning_rate": 4.508333333333333e-06, "loss": 0.0301, "num_input_tokens_seen": 3762528, "step": 541 }, { "epoch": 3.4883346741753822, "grad_norm": 2.318000078201294, "learning_rate": 4.516666666666667e-06, "loss": 0.0589, "num_input_tokens_seen": 3769632, "step": 542 }, { "epoch": 3.494770716009654, "grad_norm": 4.982158660888672, "learning_rate": 4.525000000000001e-06, "loss": 0.0442, "num_input_tokens_seen": 3776592, "step": 543 }, { "epoch": 3.501206757843926, "grad_norm": 3.0872108936309814, "learning_rate": 4.533333333333334e-06, "loss": 0.0366, "num_input_tokens_seen": 3783824, "step": 544 }, { "epoch": 3.507642799678198, "grad_norm": 5.150477886199951, "learning_rate": 4.541666666666667e-06, "loss": 0.0643, "num_input_tokens_seen": 3790864, "step": 545 }, { "epoch": 3.51407884151247, "grad_norm": 3.0513834953308105, "learning_rate": 4.5500000000000005e-06, "loss": 0.0213, "num_input_tokens_seen": 3797664, "step": 546 }, { "epoch": 3.520514883346742, "grad_norm": 1.5530712604522705, "learning_rate": 4.5583333333333335e-06, "loss": 0.0154, "num_input_tokens_seen": 3804576, "step": 547 }, { "epoch": 3.526950925181014, "grad_norm": 2.6350319385528564, "learning_rate": 4.566666666666667e-06, "loss": 0.0252, "num_input_tokens_seen": 3811440, "step": 548 }, { "epoch": 3.5333869670152858, "grad_norm": 2.8993167877197266, "learning_rate": 4.575e-06, "loss": 0.038, "num_input_tokens_seen": 3818352, "step": 549 }, { "epoch": 3.5398230088495577, "grad_norm": 2.0168752670288086, "learning_rate": 4.583333333333333e-06, "loss": 0.0169, "num_input_tokens_seen": 3825360, "step": 550 }, { "epoch": 3.5462590506838296, "grad_norm": 2.4160525798797607, "learning_rate": 4.591666666666667e-06, "loss": 0.0253, "num_input_tokens_seen": 3832416, "step": 551 }, { "epoch": 3.5526950925181016, "grad_norm": 1.543545126914978, "learning_rate": 4.600000000000001e-06, "loss": 0.0164, "num_input_tokens_seen": 3839344, "step": 552 }, { "epoch": 3.5591311343523735, "grad_norm": 2.355316400527954, "learning_rate": 4.608333333333334e-06, "loss": 0.0269, "num_input_tokens_seen": 3846688, "step": 553 }, { "epoch": 3.5655671761866454, "grad_norm": 1.4751020669937134, "learning_rate": 4.616666666666667e-06, "loss": 0.0192, "num_input_tokens_seen": 3853696, "step": 554 }, { "epoch": 3.5720032180209174, "grad_norm": 0.9673195481300354, "learning_rate": 4.625000000000001e-06, "loss": 0.0132, "num_input_tokens_seen": 3860832, "step": 555 }, { "epoch": 3.5784392598551893, "grad_norm": 1.1592040061950684, "learning_rate": 4.633333333333334e-06, "loss": 0.0156, "num_input_tokens_seen": 3868000, "step": 556 }, { "epoch": 3.5848753016894612, "grad_norm": 1.01143217086792, "learning_rate": 4.641666666666667e-06, "loss": 0.0081, "num_input_tokens_seen": 3874672, "step": 557 }, { "epoch": 3.591311343523733, "grad_norm": 2.855041980743408, "learning_rate": 4.65e-06, "loss": 0.0351, "num_input_tokens_seen": 3881744, "step": 558 }, { "epoch": 3.597747385358005, "grad_norm": 2.0597968101501465, "learning_rate": 4.658333333333333e-06, "loss": 0.0288, "num_input_tokens_seen": 3888256, "step": 559 }, { "epoch": 3.604183427192277, "grad_norm": 2.9965226650238037, "learning_rate": 4.666666666666667e-06, "loss": 0.0335, "num_input_tokens_seen": 3895104, "step": 560 }, { "epoch": 3.6106194690265485, "grad_norm": 3.625206708908081, "learning_rate": 4.675000000000001e-06, "loss": 0.0492, "num_input_tokens_seen": 3902208, "step": 561 }, { "epoch": 3.6170555108608204, "grad_norm": 2.021160840988159, "learning_rate": 4.683333333333334e-06, "loss": 0.0082, "num_input_tokens_seen": 3909040, "step": 562 }, { "epoch": 3.6234915526950924, "grad_norm": 3.4565329551696777, "learning_rate": 4.691666666666667e-06, "loss": 0.0491, "num_input_tokens_seen": 3916304, "step": 563 }, { "epoch": 3.6299275945293643, "grad_norm": 3.2362654209136963, "learning_rate": 4.7e-06, "loss": 0.0568, "num_input_tokens_seen": 3923216, "step": 564 }, { "epoch": 3.6363636363636362, "grad_norm": 3.234666347503662, "learning_rate": 4.708333333333334e-06, "loss": 0.0414, "num_input_tokens_seen": 3930448, "step": 565 }, { "epoch": 3.642799678197908, "grad_norm": 2.1742103099823, "learning_rate": 4.7166666666666675e-06, "loss": 0.034, "num_input_tokens_seen": 3937424, "step": 566 }, { "epoch": 3.64923572003218, "grad_norm": 2.9156923294067383, "learning_rate": 4.7250000000000005e-06, "loss": 0.0392, "num_input_tokens_seen": 3944112, "step": 567 }, { "epoch": 3.655671761866452, "grad_norm": 4.092429161071777, "learning_rate": 4.7333333333333335e-06, "loss": 0.051, "num_input_tokens_seen": 3951504, "step": 568 }, { "epoch": 3.662107803700724, "grad_norm": 3.9395768642425537, "learning_rate": 4.741666666666667e-06, "loss": 0.034, "num_input_tokens_seen": 3958352, "step": 569 }, { "epoch": 3.668543845534996, "grad_norm": 1.9961844682693481, "learning_rate": 4.75e-06, "loss": 0.014, "num_input_tokens_seen": 3965552, "step": 570 }, { "epoch": 3.674979887369268, "grad_norm": 1.8078194856643677, "learning_rate": 4.758333333333334e-06, "loss": 0.0406, "num_input_tokens_seen": 3972544, "step": 571 }, { "epoch": 3.6814159292035398, "grad_norm": 2.048532485961914, "learning_rate": 4.766666666666667e-06, "loss": 0.0407, "num_input_tokens_seen": 3979264, "step": 572 }, { "epoch": 3.6878519710378117, "grad_norm": 1.9979974031448364, "learning_rate": 4.775e-06, "loss": 0.0282, "num_input_tokens_seen": 3986240, "step": 573 }, { "epoch": 3.6942880128720836, "grad_norm": 3.6126463413238525, "learning_rate": 4.783333333333334e-06, "loss": 0.0326, "num_input_tokens_seen": 3993232, "step": 574 }, { "epoch": 3.7007240547063556, "grad_norm": 3.131657838821411, "learning_rate": 4.791666666666668e-06, "loss": 0.0348, "num_input_tokens_seen": 3999952, "step": 575 }, { "epoch": 3.7071600965406275, "grad_norm": 2.2662060260772705, "learning_rate": 4.800000000000001e-06, "loss": 0.0256, "num_input_tokens_seen": 4007456, "step": 576 }, { "epoch": 3.7135961383748994, "grad_norm": 4.874523639678955, "learning_rate": 4.808333333333334e-06, "loss": 0.0765, "num_input_tokens_seen": 4015024, "step": 577 }, { "epoch": 3.7200321802091714, "grad_norm": 0.882166862487793, "learning_rate": 4.816666666666667e-06, "loss": 0.0099, "num_input_tokens_seen": 4021920, "step": 578 }, { "epoch": 3.7264682220434433, "grad_norm": 3.1239066123962402, "learning_rate": 4.825e-06, "loss": 0.0173, "num_input_tokens_seen": 4028720, "step": 579 }, { "epoch": 3.7329042638777152, "grad_norm": 1.5819370746612549, "learning_rate": 4.833333333333333e-06, "loss": 0.0084, "num_input_tokens_seen": 4035584, "step": 580 }, { "epoch": 3.739340305711987, "grad_norm": 2.6252429485321045, "learning_rate": 4.841666666666667e-06, "loss": 0.0251, "num_input_tokens_seen": 4042464, "step": 581 }, { "epoch": 3.745776347546259, "grad_norm": 2.0619590282440186, "learning_rate": 4.85e-06, "loss": 0.0909, "num_input_tokens_seen": 4049600, "step": 582 }, { "epoch": 3.752212389380531, "grad_norm": 2.547422409057617, "learning_rate": 4.858333333333334e-06, "loss": 0.039, "num_input_tokens_seen": 4056320, "step": 583 }, { "epoch": 3.758648431214803, "grad_norm": 1.3179091215133667, "learning_rate": 4.866666666666667e-06, "loss": 0.0079, "num_input_tokens_seen": 4063200, "step": 584 }, { "epoch": 3.765084473049075, "grad_norm": 3.090376377105713, "learning_rate": 4.875e-06, "loss": 0.0242, "num_input_tokens_seen": 4070112, "step": 585 }, { "epoch": 3.771520514883347, "grad_norm": 2.50468111038208, "learning_rate": 4.883333333333334e-06, "loss": 0.0138, "num_input_tokens_seen": 4076928, "step": 586 }, { "epoch": 3.7779565567176188, "grad_norm": 3.921415090560913, "learning_rate": 4.8916666666666675e-06, "loss": 0.0467, "num_input_tokens_seen": 4083792, "step": 587 }, { "epoch": 3.7843925985518907, "grad_norm": 1.2243348360061646, "learning_rate": 4.9000000000000005e-06, "loss": 0.0241, "num_input_tokens_seen": 4090672, "step": 588 }, { "epoch": 3.7908286403861626, "grad_norm": 1.4968576431274414, "learning_rate": 4.9083333333333335e-06, "loss": 0.0404, "num_input_tokens_seen": 4097472, "step": 589 }, { "epoch": 3.7972646822204346, "grad_norm": 1.235217809677124, "learning_rate": 4.9166666666666665e-06, "loss": 0.0094, "num_input_tokens_seen": 4104016, "step": 590 }, { "epoch": 3.8037007240547065, "grad_norm": 1.3862783908843994, "learning_rate": 4.925e-06, "loss": 0.0196, "num_input_tokens_seen": 4110784, "step": 591 }, { "epoch": 3.8101367658889784, "grad_norm": 3.560793399810791, "learning_rate": 4.933333333333334e-06, "loss": 0.0514, "num_input_tokens_seen": 4117984, "step": 592 }, { "epoch": 3.8165728077232504, "grad_norm": 2.008575677871704, "learning_rate": 4.941666666666667e-06, "loss": 0.0286, "num_input_tokens_seen": 4125072, "step": 593 }, { "epoch": 3.823008849557522, "grad_norm": 2.3213093280792236, "learning_rate": 4.95e-06, "loss": 0.0417, "num_input_tokens_seen": 4132160, "step": 594 }, { "epoch": 3.829444891391794, "grad_norm": 1.3540257215499878, "learning_rate": 4.958333333333334e-06, "loss": 0.0347, "num_input_tokens_seen": 4139136, "step": 595 }, { "epoch": 3.8358809332260657, "grad_norm": 1.289825677871704, "learning_rate": 4.966666666666667e-06, "loss": 0.0229, "num_input_tokens_seen": 4146240, "step": 596 }, { "epoch": 3.8423169750603376, "grad_norm": 2.4050135612487793, "learning_rate": 4.975000000000001e-06, "loss": 0.0176, "num_input_tokens_seen": 4153152, "step": 597 }, { "epoch": 3.8487530168946096, "grad_norm": 1.523977518081665, "learning_rate": 4.983333333333334e-06, "loss": 0.0274, "num_input_tokens_seen": 4160080, "step": 598 }, { "epoch": 3.8551890587288815, "grad_norm": 1.1898863315582275, "learning_rate": 4.991666666666667e-06, "loss": 0.0253, "num_input_tokens_seen": 4167008, "step": 599 }, { "epoch": 3.8616251005631534, "grad_norm": 1.992311954498291, "learning_rate": 5e-06, "loss": 0.0429, "num_input_tokens_seen": 4174080, "step": 600 }, { "epoch": 3.8680611423974254, "grad_norm": 0.9558950066566467, "learning_rate": 4.999597169822646e-06, "loss": 0.0142, "num_input_tokens_seen": 4181104, "step": 601 }, { "epoch": 3.8744971842316973, "grad_norm": 0.9275301694869995, "learning_rate": 4.998388809108304e-06, "loss": 0.0148, "num_input_tokens_seen": 4188096, "step": 602 }, { "epoch": 3.8809332260659692, "grad_norm": 1.6707432270050049, "learning_rate": 4.996375307268303e-06, "loss": 0.0166, "num_input_tokens_seen": 4195152, "step": 603 }, { "epoch": 3.887369267900241, "grad_norm": 5.857227325439453, "learning_rate": 4.993557313182086e-06, "loss": 0.0224, "num_input_tokens_seen": 4201952, "step": 604 }, { "epoch": 3.893805309734513, "grad_norm": 5.273613452911377, "learning_rate": 4.989935734988098e-06, "loss": 0.0227, "num_input_tokens_seen": 4209104, "step": 605 }, { "epoch": 3.900241351568785, "grad_norm": 6.268670082092285, "learning_rate": 4.985511739791129e-06, "loss": 0.0597, "num_input_tokens_seen": 4216496, "step": 606 }, { "epoch": 3.906677393403057, "grad_norm": 3.373368501663208, "learning_rate": 4.980286753286196e-06, "loss": 0.0339, "num_input_tokens_seen": 4223840, "step": 607 }, { "epoch": 3.913113435237329, "grad_norm": 1.3991198539733887, "learning_rate": 4.974262459299088e-06, "loss": 0.0192, "num_input_tokens_seen": 4230752, "step": 608 }, { "epoch": 3.919549477071601, "grad_norm": 0.7424534559249878, "learning_rate": 4.967440799243739e-06, "loss": 0.007, "num_input_tokens_seen": 4237360, "step": 609 }, { "epoch": 3.9259855189058728, "grad_norm": 3.0347440242767334, "learning_rate": 4.959823971496575e-06, "loss": 0.017, "num_input_tokens_seen": 4244128, "step": 610 }, { "epoch": 3.9324215607401447, "grad_norm": 2.929175853729248, "learning_rate": 4.9514144306880506e-06, "loss": 0.0296, "num_input_tokens_seen": 4251264, "step": 611 }, { "epoch": 3.9388576025744166, "grad_norm": 4.076401710510254, "learning_rate": 4.942214886911619e-06, "loss": 0.0429, "num_input_tokens_seen": 4258256, "step": 612 }, { "epoch": 3.9452936444086886, "grad_norm": 0.7720851302146912, "learning_rate": 4.932228304850363e-06, "loss": 0.0027, "num_input_tokens_seen": 4265280, "step": 613 }, { "epoch": 3.9517296862429605, "grad_norm": 1.500545859336853, "learning_rate": 4.921457902821578e-06, "loss": 0.0395, "num_input_tokens_seen": 4271968, "step": 614 }, { "epoch": 3.9581657280772324, "grad_norm": 3.0767860412597656, "learning_rate": 4.909907151739634e-06, "loss": 0.03, "num_input_tokens_seen": 4278848, "step": 615 }, { "epoch": 3.9646017699115044, "grad_norm": 1.5455620288848877, "learning_rate": 4.897579773997415e-06, "loss": 0.0178, "num_input_tokens_seen": 4285808, "step": 616 }, { "epoch": 3.9710378117457763, "grad_norm": 1.1472654342651367, "learning_rate": 4.884479742266731e-06, "loss": 0.0139, "num_input_tokens_seen": 4292912, "step": 617 }, { "epoch": 3.9774738535800482, "grad_norm": 1.3290921449661255, "learning_rate": 4.870611278218066e-06, "loss": 0.0076, "num_input_tokens_seen": 4300176, "step": 618 }, { "epoch": 3.98390989541432, "grad_norm": 4.543910026550293, "learning_rate": 4.855978851160088e-06, "loss": 0.0683, "num_input_tokens_seen": 4307776, "step": 619 }, { "epoch": 3.990345937248592, "grad_norm": 3.424959421157837, "learning_rate": 4.8405871765993435e-06, "loss": 0.0367, "num_input_tokens_seen": 4314688, "step": 620 }, { "epoch": 3.996781979082864, "grad_norm": 1.5345810651779175, "learning_rate": 4.824441214720629e-06, "loss": 0.0497, "num_input_tokens_seen": 4321840, "step": 621 }, { "epoch": 4.003218020917136, "grad_norm": 0.5405219793319702, "learning_rate": 4.8075461687884935e-06, "loss": 0.0054, "num_input_tokens_seen": 4328736, "step": 622 }, { "epoch": 4.009654062751408, "grad_norm": 2.3540198802948, "learning_rate": 4.7899074834704165e-06, "loss": 0.0259, "num_input_tokens_seen": 4335952, "step": 623 }, { "epoch": 4.01609010458568, "grad_norm": 0.7733599543571472, "learning_rate": 4.771530843082187e-06, "loss": 0.0082, "num_input_tokens_seen": 4342816, "step": 624 }, { "epoch": 4.022526146419952, "grad_norm": 3.051017999649048, "learning_rate": 4.752422169756048e-06, "loss": 0.0359, "num_input_tokens_seen": 4349456, "step": 625 }, { "epoch": 4.028962188254224, "grad_norm": 0.4645274579524994, "learning_rate": 4.732587621532214e-06, "loss": 0.0081, "num_input_tokens_seen": 4356032, "step": 626 }, { "epoch": 4.035398230088496, "grad_norm": 1.9294419288635254, "learning_rate": 4.712033590374346e-06, "loss": 0.0118, "num_input_tokens_seen": 4362928, "step": 627 }, { "epoch": 4.041834271922768, "grad_norm": 2.5432851314544678, "learning_rate": 4.690766700109659e-06, "loss": 0.0235, "num_input_tokens_seen": 4369616, "step": 628 }, { "epoch": 4.0482703137570395, "grad_norm": 1.8334590196609497, "learning_rate": 4.668793804294294e-06, "loss": 0.0145, "num_input_tokens_seen": 4376656, "step": 629 }, { "epoch": 4.054706355591311, "grad_norm": 0.6473208069801331, "learning_rate": 4.646121984004666e-06, "loss": 0.006, "num_input_tokens_seen": 4383696, "step": 630 }, { "epoch": 4.061142397425583, "grad_norm": 2.0988128185272217, "learning_rate": 4.622758545555485e-06, "loss": 0.0191, "num_input_tokens_seen": 4390880, "step": 631 }, { "epoch": 4.067578439259855, "grad_norm": 1.8957973718643188, "learning_rate": 4.598711018145193e-06, "loss": 0.0075, "num_input_tokens_seen": 4398000, "step": 632 }, { "epoch": 4.074014481094127, "grad_norm": 1.117255449295044, "learning_rate": 4.573987151429579e-06, "loss": 0.0253, "num_input_tokens_seen": 4404640, "step": 633 }, { "epoch": 4.080450522928399, "grad_norm": 2.326129198074341, "learning_rate": 4.54859491302433e-06, "loss": 0.0317, "num_input_tokens_seen": 4411760, "step": 634 }, { "epoch": 4.086886564762671, "grad_norm": 1.6843276023864746, "learning_rate": 4.522542485937369e-06, "loss": 0.0082, "num_input_tokens_seen": 4418896, "step": 635 }, { "epoch": 4.093322606596943, "grad_norm": 2.301496744155884, "learning_rate": 4.495838265931754e-06, "loss": 0.0101, "num_input_tokens_seen": 4425776, "step": 636 }, { "epoch": 4.099758648431215, "grad_norm": 1.434444546699524, "learning_rate": 4.4684908588200305e-06, "loss": 0.0112, "num_input_tokens_seen": 4432656, "step": 637 }, { "epoch": 4.106194690265487, "grad_norm": 1.3446779251098633, "learning_rate": 4.440509077690883e-06, "loss": 0.0034, "num_input_tokens_seen": 4439424, "step": 638 }, { "epoch": 4.112630732099759, "grad_norm": 0.6733867526054382, "learning_rate": 4.411901940068997e-06, "loss": 0.0037, "num_input_tokens_seen": 4446160, "step": 639 }, { "epoch": 4.119066773934031, "grad_norm": 1.339034080505371, "learning_rate": 4.382678665009028e-06, "loss": 0.0085, "num_input_tokens_seen": 4453376, "step": 640 }, { "epoch": 4.125502815768303, "grad_norm": 3.2036638259887695, "learning_rate": 4.352848670124637e-06, "loss": 0.0328, "num_input_tokens_seen": 4459952, "step": 641 }, { "epoch": 4.131938857602575, "grad_norm": 1.1791878938674927, "learning_rate": 4.322421568553529e-06, "loss": 0.0098, "num_input_tokens_seen": 4466880, "step": 642 }, { "epoch": 4.1383748994368466, "grad_norm": 1.8526674509048462, "learning_rate": 4.291407165859481e-06, "loss": 0.0051, "num_input_tokens_seen": 4474064, "step": 643 }, { "epoch": 4.1448109412711185, "grad_norm": 0.4795032739639282, "learning_rate": 4.259815456872363e-06, "loss": 0.0047, "num_input_tokens_seen": 4480864, "step": 644 }, { "epoch": 4.15124698310539, "grad_norm": 1.4392155408859253, "learning_rate": 4.227656622467162e-06, "loss": 0.0111, "num_input_tokens_seen": 4487504, "step": 645 }, { "epoch": 4.157683024939662, "grad_norm": 3.185128688812256, "learning_rate": 4.194941026283053e-06, "loss": 0.0334, "num_input_tokens_seen": 4494512, "step": 646 }, { "epoch": 4.164119066773934, "grad_norm": 1.7285927534103394, "learning_rate": 4.161679211383565e-06, "loss": 0.013, "num_input_tokens_seen": 4501296, "step": 647 }, { "epoch": 4.170555108608206, "grad_norm": 4.266958713531494, "learning_rate": 4.127881896858934e-06, "loss": 0.0305, "num_input_tokens_seen": 4508128, "step": 648 }, { "epoch": 4.176991150442478, "grad_norm": 1.000532627105713, "learning_rate": 4.093559974371725e-06, "loss": 0.0092, "num_input_tokens_seen": 4515008, "step": 649 }, { "epoch": 4.18342719227675, "grad_norm": 1.1824270486831665, "learning_rate": 4.058724504646834e-06, "loss": 0.0223, "num_input_tokens_seen": 4521920, "step": 650 }, { "epoch": 4.189863234111022, "grad_norm": 2.444427728652954, "learning_rate": 4.023386713907021e-06, "loss": 0.0234, "num_input_tokens_seen": 4528912, "step": 651 }, { "epoch": 4.196299275945294, "grad_norm": 1.421184778213501, "learning_rate": 3.987557990255093e-06, "loss": 0.0185, "num_input_tokens_seen": 4535664, "step": 652 }, { "epoch": 4.202735317779566, "grad_norm": 0.9019869565963745, "learning_rate": 3.951249880003934e-06, "loss": 0.0075, "num_input_tokens_seen": 4542832, "step": 653 }, { "epoch": 4.209171359613838, "grad_norm": 1.7373372316360474, "learning_rate": 3.914474083955537e-06, "loss": 0.0217, "num_input_tokens_seen": 4549552, "step": 654 }, { "epoch": 4.21560740144811, "grad_norm": 0.31386592984199524, "learning_rate": 3.8772424536302565e-06, "loss": 0.0027, "num_input_tokens_seen": 4556192, "step": 655 }, { "epoch": 4.222043443282382, "grad_norm": 1.8379613161087036, "learning_rate": 3.839566987447492e-06, "loss": 0.0153, "num_input_tokens_seen": 4563168, "step": 656 }, { "epoch": 4.228479485116654, "grad_norm": 1.221056342124939, "learning_rate": 3.801459826859022e-06, "loss": 0.0092, "num_input_tokens_seen": 4570704, "step": 657 }, { "epoch": 4.2349155269509255, "grad_norm": 0.7823006510734558, "learning_rate": 3.7629332524362532e-06, "loss": 0.0082, "num_input_tokens_seen": 4578016, "step": 658 }, { "epoch": 4.2413515687851975, "grad_norm": 1.149715781211853, "learning_rate": 3.7239996799126315e-06, "loss": 0.0163, "num_input_tokens_seen": 4584896, "step": 659 }, { "epoch": 4.247787610619469, "grad_norm": 0.6069539189338684, "learning_rate": 3.684671656182497e-06, "loss": 0.0099, "num_input_tokens_seen": 4591984, "step": 660 }, { "epoch": 4.254223652453741, "grad_norm": 2.427281141281128, "learning_rate": 3.644961855257669e-06, "loss": 0.0269, "num_input_tokens_seen": 4598656, "step": 661 }, { "epoch": 4.260659694288013, "grad_norm": 1.0770633220672607, "learning_rate": 3.6048830741830678e-06, "loss": 0.007, "num_input_tokens_seen": 4606032, "step": 662 }, { "epoch": 4.267095736122285, "grad_norm": 2.4310688972473145, "learning_rate": 3.564448228912682e-06, "loss": 0.0427, "num_input_tokens_seen": 4613056, "step": 663 }, { "epoch": 4.273531777956556, "grad_norm": 1.2328161001205444, "learning_rate": 3.523670350147227e-06, "loss": 0.0122, "num_input_tokens_seen": 4619776, "step": 664 }, { "epoch": 4.279967819790828, "grad_norm": 1.519998550415039, "learning_rate": 3.4825625791348093e-06, "loss": 0.0137, "num_input_tokens_seen": 4626240, "step": 665 }, { "epoch": 4.2864038616251, "grad_norm": 1.4114880561828613, "learning_rate": 3.44113816343598e-06, "loss": 0.02, "num_input_tokens_seen": 4633216, "step": 666 }, { "epoch": 4.292839903459372, "grad_norm": 1.4585809707641602, "learning_rate": 3.399410452654518e-06, "loss": 0.006, "num_input_tokens_seen": 4639856, "step": 667 }, { "epoch": 4.299275945293644, "grad_norm": 1.594936490058899, "learning_rate": 3.357392894135329e-06, "loss": 0.0085, "num_input_tokens_seen": 4646832, "step": 668 }, { "epoch": 4.305711987127916, "grad_norm": 2.5802690982818604, "learning_rate": 3.315099028630855e-06, "loss": 0.0112, "num_input_tokens_seen": 4653648, "step": 669 }, { "epoch": 4.312148028962188, "grad_norm": 1.3826483488082886, "learning_rate": 3.272542485937369e-06, "loss": 0.0131, "num_input_tokens_seen": 4660672, "step": 670 }, { "epoch": 4.31858407079646, "grad_norm": 2.1874148845672607, "learning_rate": 3.229736980502584e-06, "loss": 0.0124, "num_input_tokens_seen": 4667888, "step": 671 }, { "epoch": 4.325020112630732, "grad_norm": 1.61604642868042, "learning_rate": 3.186696307005976e-06, "loss": 0.0042, "num_input_tokens_seen": 4675072, "step": 672 }, { "epoch": 4.331456154465004, "grad_norm": 0.40999871492385864, "learning_rate": 3.1434343359132565e-06, "loss": 0.0011, "num_input_tokens_seen": 4682016, "step": 673 }, { "epoch": 4.337892196299276, "grad_norm": 0.1305094212293625, "learning_rate": 3.099965009006415e-06, "loss": 0.0008, "num_input_tokens_seen": 4688912, "step": 674 }, { "epoch": 4.3443282381335475, "grad_norm": 1.6623185873031616, "learning_rate": 3.056302334890786e-06, "loss": 0.0056, "num_input_tokens_seen": 4695936, "step": 675 }, { "epoch": 4.3507642799678194, "grad_norm": 1.034837007522583, "learning_rate": 3.0124603844805767e-06, "loss": 0.0079, "num_input_tokens_seen": 4703184, "step": 676 }, { "epoch": 4.357200321802091, "grad_norm": 2.2049107551574707, "learning_rate": 2.9684532864643123e-06, "loss": 0.0216, "num_input_tokens_seen": 4710064, "step": 677 }, { "epoch": 4.363636363636363, "grad_norm": 4.32258939743042, "learning_rate": 2.9242952227516726e-06, "loss": 0.0258, "num_input_tokens_seen": 4716336, "step": 678 }, { "epoch": 4.370072405470635, "grad_norm": 1.0949031114578247, "learning_rate": 2.8800004239031687e-06, "loss": 0.0049, "num_input_tokens_seen": 4723360, "step": 679 }, { "epoch": 4.376508447304907, "grad_norm": 1.563004493713379, "learning_rate": 2.835583164544139e-06, "loss": 0.0034, "num_input_tokens_seen": 4730464, "step": 680 }, { "epoch": 4.382944489139179, "grad_norm": 2.775270938873291, "learning_rate": 2.791057758764557e-06, "loss": 0.0341, "num_input_tokens_seen": 4737056, "step": 681 }, { "epoch": 4.389380530973451, "grad_norm": 3.1517560482025146, "learning_rate": 2.7464385555061092e-06, "loss": 0.0074, "num_input_tokens_seen": 4743936, "step": 682 }, { "epoch": 4.395816572807723, "grad_norm": 1.2521913051605225, "learning_rate": 2.7017399339380435e-06, "loss": 0.0272, "num_input_tokens_seen": 4751024, "step": 683 }, { "epoch": 4.402252614641995, "grad_norm": 3.4706435203552246, "learning_rate": 2.6569762988232838e-06, "loss": 0.0168, "num_input_tokens_seen": 4758000, "step": 684 }, { "epoch": 4.408688656476267, "grad_norm": 0.8021034598350525, "learning_rate": 2.6121620758762877e-06, "loss": 0.0047, "num_input_tokens_seen": 4764816, "step": 685 }, { "epoch": 4.415124698310539, "grad_norm": 4.709753036499023, "learning_rate": 2.5673117071141574e-06, "loss": 0.0198, "num_input_tokens_seen": 4772144, "step": 686 }, { "epoch": 4.421560740144811, "grad_norm": 0.40973323583602905, "learning_rate": 2.522439646202495e-06, "loss": 0.0012, "num_input_tokens_seen": 4778960, "step": 687 }, { "epoch": 4.427996781979083, "grad_norm": 3.179236888885498, "learning_rate": 2.4775603537975055e-06, "loss": 0.0256, "num_input_tokens_seen": 4785952, "step": 688 }, { "epoch": 4.434432823813355, "grad_norm": 2.5204341411590576, "learning_rate": 2.4326882928858435e-06, "loss": 0.0187, "num_input_tokens_seen": 4792608, "step": 689 }, { "epoch": 4.4408688656476265, "grad_norm": 3.6536998748779297, "learning_rate": 2.3878379241237136e-06, "loss": 0.0135, "num_input_tokens_seen": 4799232, "step": 690 }, { "epoch": 4.447304907481898, "grad_norm": 1.0689839124679565, "learning_rate": 2.3430237011767166e-06, "loss": 0.0036, "num_input_tokens_seen": 4806080, "step": 691 }, { "epoch": 4.45374094931617, "grad_norm": 2.071629762649536, "learning_rate": 2.2982600660619574e-06, "loss": 0.0135, "num_input_tokens_seen": 4813728, "step": 692 }, { "epoch": 4.460176991150442, "grad_norm": 3.4168224334716797, "learning_rate": 2.253561444493891e-06, "loss": 0.0046, "num_input_tokens_seen": 4820608, "step": 693 }, { "epoch": 4.466613032984714, "grad_norm": 0.3058677017688751, "learning_rate": 2.2089422412354434e-06, "loss": 0.0019, "num_input_tokens_seen": 4827056, "step": 694 }, { "epoch": 4.473049074818986, "grad_norm": 0.4175882935523987, "learning_rate": 2.1644168354558623e-06, "loss": 0.0022, "num_input_tokens_seen": 4834080, "step": 695 }, { "epoch": 4.479485116653258, "grad_norm": 0.7226863503456116, "learning_rate": 2.119999576096832e-06, "loss": 0.0093, "num_input_tokens_seen": 4840912, "step": 696 }, { "epoch": 4.48592115848753, "grad_norm": 0.1190720871090889, "learning_rate": 2.0757047772483278e-06, "loss": 0.0012, "num_input_tokens_seen": 4848112, "step": 697 }, { "epoch": 4.492357200321802, "grad_norm": 1.0061287879943848, "learning_rate": 2.031546713535688e-06, "loss": 0.0036, "num_input_tokens_seen": 4855072, "step": 698 }, { "epoch": 4.498793242156074, "grad_norm": 0.9472126364707947, "learning_rate": 1.987539615519424e-06, "loss": 0.0071, "num_input_tokens_seen": 4862064, "step": 699 }, { "epoch": 4.505229283990346, "grad_norm": 0.8338857889175415, "learning_rate": 1.9436976651092143e-06, "loss": 0.0055, "num_input_tokens_seen": 4869104, "step": 700 }, { "epoch": 4.511665325824618, "grad_norm": 3.2061474323272705, "learning_rate": 1.9000349909935852e-06, "loss": 0.0291, "num_input_tokens_seen": 4876112, "step": 701 }, { "epoch": 4.51810136765889, "grad_norm": 3.644125461578369, "learning_rate": 1.8565656640867448e-06, "loss": 0.0407, "num_input_tokens_seen": 4883264, "step": 702 }, { "epoch": 4.524537409493162, "grad_norm": 2.2370316982269287, "learning_rate": 1.813303692994025e-06, "loss": 0.0245, "num_input_tokens_seen": 4890192, "step": 703 }, { "epoch": 4.530973451327434, "grad_norm": 3.3120510578155518, "learning_rate": 1.770263019497417e-06, "loss": 0.0207, "num_input_tokens_seen": 4897200, "step": 704 }, { "epoch": 4.5374094931617055, "grad_norm": 1.256335973739624, "learning_rate": 1.7274575140626318e-06, "loss": 0.0269, "num_input_tokens_seen": 4904016, "step": 705 }, { "epoch": 4.543845534995977, "grad_norm": 0.10977872461080551, "learning_rate": 1.6849009713691456e-06, "loss": 0.001, "num_input_tokens_seen": 4910944, "step": 706 }, { "epoch": 4.550281576830249, "grad_norm": 1.9825077056884766, "learning_rate": 1.6426071058646718e-06, "loss": 0.0205, "num_input_tokens_seen": 4917424, "step": 707 }, { "epoch": 4.556717618664521, "grad_norm": 0.7529383897781372, "learning_rate": 1.6005895473454836e-06, "loss": 0.0148, "num_input_tokens_seen": 4924288, "step": 708 }, { "epoch": 4.563153660498793, "grad_norm": 2.29215145111084, "learning_rate": 1.55886183656402e-06, "loss": 0.0239, "num_input_tokens_seen": 4931040, "step": 709 }, { "epoch": 4.569589702333065, "grad_norm": 1.639636754989624, "learning_rate": 1.5174374208651913e-06, "loss": 0.0165, "num_input_tokens_seen": 4937968, "step": 710 }, { "epoch": 4.576025744167337, "grad_norm": 1.8043317794799805, "learning_rate": 1.4763296498527744e-06, "loss": 0.0079, "num_input_tokens_seen": 4945456, "step": 711 }, { "epoch": 4.582461786001609, "grad_norm": 1.8007737398147583, "learning_rate": 1.4355517710873184e-06, "loss": 0.0338, "num_input_tokens_seen": 4952080, "step": 712 }, { "epoch": 4.588897827835881, "grad_norm": 0.6810876131057739, "learning_rate": 1.395116925816934e-06, "loss": 0.0136, "num_input_tokens_seen": 4958944, "step": 713 }, { "epoch": 4.595333869670153, "grad_norm": 1.0080180168151855, "learning_rate": 1.3550381447423317e-06, "loss": 0.0126, "num_input_tokens_seen": 4966320, "step": 714 }, { "epoch": 4.601769911504425, "grad_norm": 1.1210750341415405, "learning_rate": 1.3153283438175036e-06, "loss": 0.0174, "num_input_tokens_seen": 4973344, "step": 715 }, { "epoch": 4.608205953338697, "grad_norm": 2.2793147563934326, "learning_rate": 1.27600032008737e-06, "loss": 0.0155, "num_input_tokens_seen": 4980304, "step": 716 }, { "epoch": 4.614641995172969, "grad_norm": 2.0746471881866455, "learning_rate": 1.2370667475637474e-06, "loss": 0.0349, "num_input_tokens_seen": 4987616, "step": 717 }, { "epoch": 4.621078037007241, "grad_norm": 1.9974377155303955, "learning_rate": 1.1985401731409793e-06, "loss": 0.0082, "num_input_tokens_seen": 4994656, "step": 718 }, { "epoch": 4.627514078841513, "grad_norm": 0.9225305914878845, "learning_rate": 1.160433012552508e-06, "loss": 0.0204, "num_input_tokens_seen": 5001776, "step": 719 }, { "epoch": 4.6339501206757845, "grad_norm": 0.6030845642089844, "learning_rate": 1.122757546369744e-06, "loss": 0.0074, "num_input_tokens_seen": 5008688, "step": 720 }, { "epoch": 4.640386162510056, "grad_norm": 1.1969950199127197, "learning_rate": 1.085525916044464e-06, "loss": 0.0154, "num_input_tokens_seen": 5015680, "step": 721 }, { "epoch": 4.646822204344328, "grad_norm": 1.7312675714492798, "learning_rate": 1.048750119996066e-06, "loss": 0.0101, "num_input_tokens_seen": 5022336, "step": 722 }, { "epoch": 4.6532582461786, "grad_norm": 0.9403418898582458, "learning_rate": 1.0124420097449077e-06, "loss": 0.0107, "num_input_tokens_seen": 5029184, "step": 723 }, { "epoch": 4.659694288012872, "grad_norm": 2.2545931339263916, "learning_rate": 9.7661328609298e-07, "loss": 0.0279, "num_input_tokens_seen": 5036000, "step": 724 }, { "epoch": 4.666130329847144, "grad_norm": 0.5637010931968689, "learning_rate": 9.412754953531664e-07, "loss": 0.0044, "num_input_tokens_seen": 5042944, "step": 725 }, { "epoch": 4.672566371681416, "grad_norm": 0.24136967957019806, "learning_rate": 9.064400256282757e-07, "loss": 0.0021, "num_input_tokens_seen": 5049840, "step": 726 }, { "epoch": 4.679002413515688, "grad_norm": 1.0340116024017334, "learning_rate": 8.721181031410661e-07, "loss": 0.0086, "num_input_tokens_seen": 5057296, "step": 727 }, { "epoch": 4.68543845534996, "grad_norm": 0.548861026763916, "learning_rate": 8.383207886164366e-07, "loss": 0.005, "num_input_tokens_seen": 5064560, "step": 728 }, { "epoch": 4.691874497184232, "grad_norm": 1.089135766029358, "learning_rate": 8.050589737169485e-07, "loss": 0.0096, "num_input_tokens_seen": 5071472, "step": 729 }, { "epoch": 4.698310539018504, "grad_norm": 0.3106631636619568, "learning_rate": 7.723433775328385e-07, "loss": 0.0029, "num_input_tokens_seen": 5078512, "step": 730 }, { "epoch": 4.704746580852776, "grad_norm": 1.3499066829681396, "learning_rate": 7.401845431276378e-07, "loss": 0.0082, "num_input_tokens_seen": 5085248, "step": 731 }, { "epoch": 4.711182622687048, "grad_norm": 0.30332618951797485, "learning_rate": 7.085928341405193e-07, "loss": 0.0033, "num_input_tokens_seen": 5092160, "step": 732 }, { "epoch": 4.71761866452132, "grad_norm": 0.7549375295639038, "learning_rate": 6.775784314464717e-07, "loss": 0.0253, "num_input_tokens_seen": 5099360, "step": 733 }, { "epoch": 4.7240547063555915, "grad_norm": 1.567395567893982, "learning_rate": 6.471513298753634e-07, "loss": 0.0117, "num_input_tokens_seen": 5106160, "step": 734 }, { "epoch": 4.7304907481898635, "grad_norm": 1.192610502243042, "learning_rate": 6.17321334990973e-07, "loss": 0.0052, "num_input_tokens_seen": 5113264, "step": 735 }, { "epoch": 4.736926790024135, "grad_norm": 3.9402077198028564, "learning_rate": 5.880980599310041e-07, "loss": 0.0305, "num_input_tokens_seen": 5120032, "step": 736 }, { "epoch": 4.743362831858407, "grad_norm": 0.3623356223106384, "learning_rate": 5.59490922309118e-07, "loss": 0.0018, "num_input_tokens_seen": 5127280, "step": 737 }, { "epoch": 4.749798873692679, "grad_norm": 0.815592885017395, "learning_rate": 5.3150914117997e-07, "loss": 0.0066, "num_input_tokens_seen": 5134400, "step": 738 }, { "epoch": 4.756234915526951, "grad_norm": 0.4423564076423645, "learning_rate": 5.041617340682467e-07, "loss": 0.0032, "num_input_tokens_seen": 5141488, "step": 739 }, { "epoch": 4.762670957361223, "grad_norm": 0.5768114924430847, "learning_rate": 4.774575140626317e-07, "loss": 0.0089, "num_input_tokens_seen": 5148432, "step": 740 }, { "epoch": 4.769106999195495, "grad_norm": 1.2286343574523926, "learning_rate": 4.514050869756703e-07, "loss": 0.0124, "num_input_tokens_seen": 5155328, "step": 741 }, { "epoch": 4.775543041029767, "grad_norm": 0.552872359752655, "learning_rate": 4.2601284857042263e-07, "loss": 0.0022, "num_input_tokens_seen": 5163008, "step": 742 }, { "epoch": 4.781979082864039, "grad_norm": 0.6165493726730347, "learning_rate": 4.012889818548069e-07, "loss": 0.0063, "num_input_tokens_seen": 5170096, "step": 743 }, { "epoch": 4.788415124698311, "grad_norm": 1.1403653621673584, "learning_rate": 3.772414544445163e-07, "loss": 0.0149, "num_input_tokens_seen": 5177536, "step": 744 }, { "epoch": 4.794851166532583, "grad_norm": 0.1795167326927185, "learning_rate": 3.538780159953348e-07, "loss": 0.0012, "num_input_tokens_seen": 5184608, "step": 745 }, { "epoch": 4.801287208366855, "grad_norm": 0.9326004981994629, "learning_rate": 3.312061957057061e-07, "loss": 0.0127, "num_input_tokens_seen": 5191344, "step": 746 }, { "epoch": 4.807723250201127, "grad_norm": 0.41363996267318726, "learning_rate": 3.092332998903416e-07, "loss": 0.0018, "num_input_tokens_seen": 5198416, "step": 747 }, { "epoch": 4.814159292035399, "grad_norm": 0.538027286529541, "learning_rate": 2.8796640962565374e-07, "loss": 0.0034, "num_input_tokens_seen": 5205392, "step": 748 }, { "epoch": 4.8205953338696705, "grad_norm": 1.531555414199829, "learning_rate": 2.674123784677868e-07, "loss": 0.0137, "num_input_tokens_seen": 5213216, "step": 749 }, { "epoch": 4.8270313757039425, "grad_norm": 1.671035647392273, "learning_rate": 2.4757783024395244e-07, "loss": 0.0219, "num_input_tokens_seen": 5220032, "step": 750 }, { "epoch": 4.833467417538214, "grad_norm": 0.30722492933273315, "learning_rate": 2.284691569178138e-07, "loss": 0.0014, "num_input_tokens_seen": 5226816, "step": 751 }, { "epoch": 4.839903459372486, "grad_norm": 1.3107943534851074, "learning_rate": 2.100925165295839e-07, "loss": 0.019, "num_input_tokens_seen": 5233920, "step": 752 }, { "epoch": 4.846339501206758, "grad_norm": 2.1163885593414307, "learning_rate": 1.9245383121150678e-07, "loss": 0.0075, "num_input_tokens_seen": 5241344, "step": 753 }, { "epoch": 4.85277554304103, "grad_norm": 1.2636387348175049, "learning_rate": 1.7555878527937164e-07, "loss": 0.0078, "num_input_tokens_seen": 5248256, "step": 754 }, { "epoch": 4.859211584875302, "grad_norm": 4.166254997253418, "learning_rate": 1.59412823400657e-07, "loss": 0.0244, "num_input_tokens_seen": 5255248, "step": 755 }, { "epoch": 4.865647626709574, "grad_norm": 1.078273892402649, "learning_rate": 1.4402114883991318e-07, "loss": 0.0218, "num_input_tokens_seen": 5262048, "step": 756 }, { "epoch": 4.872083668543846, "grad_norm": 2.091312885284424, "learning_rate": 1.2938872178193395e-07, "loss": 0.0044, "num_input_tokens_seen": 5268848, "step": 757 }, { "epoch": 4.878519710378118, "grad_norm": 1.7236751317977905, "learning_rate": 1.1552025773327008e-07, "loss": 0.0122, "num_input_tokens_seen": 5275664, "step": 758 }, { "epoch": 4.88495575221239, "grad_norm": 0.9874201416969299, "learning_rate": 1.0242022600258611e-07, "loss": 0.007, "num_input_tokens_seen": 5282112, "step": 759 }, { "epoch": 4.891391794046662, "grad_norm": 0.6303602457046509, "learning_rate": 9.00928482603669e-08, "loss": 0.0019, "num_input_tokens_seen": 5288912, "step": 760 }, { "epoch": 4.897827835880933, "grad_norm": 0.7971038818359375, "learning_rate": 7.854209717842231e-08, "loss": 0.0147, "num_input_tokens_seen": 5295920, "step": 761 }, { "epoch": 4.904263877715205, "grad_norm": 1.0757670402526855, "learning_rate": 6.777169514963766e-08, "loss": 0.0087, "num_input_tokens_seen": 5302816, "step": 762 }, { "epoch": 4.910699919549477, "grad_norm": 1.8044992685317993, "learning_rate": 5.778511308838108e-08, "loss": 0.0085, "num_input_tokens_seen": 5309680, "step": 763 }, { "epoch": 4.917135961383749, "grad_norm": 0.3801545202732086, "learning_rate": 4.8585569311949966e-08, "loss": 0.0026, "num_input_tokens_seen": 5316848, "step": 764 }, { "epoch": 4.923572003218021, "grad_norm": 0.20918627083301544, "learning_rate": 4.017602850342584e-08, "loss": 0.0018, "num_input_tokens_seen": 5323760, "step": 765 }, { "epoch": 4.9300080450522925, "grad_norm": 2.037950277328491, "learning_rate": 3.2559200756260845e-08, "loss": 0.0072, "num_input_tokens_seen": 5330336, "step": 766 }, { "epoch": 4.936444086886564, "grad_norm": 0.8903030753135681, "learning_rate": 2.5737540700912777e-08, "loss": 0.0079, "num_input_tokens_seen": 5336816, "step": 767 }, { "epoch": 4.942880128720836, "grad_norm": 1.0508862733840942, "learning_rate": 1.9713246713805588e-08, "loss": 0.0275, "num_input_tokens_seen": 5344064, "step": 768 }, { "epoch": 4.949316170555108, "grad_norm": 1.0068142414093018, "learning_rate": 1.4488260208871397e-08, "loss": 0.0036, "num_input_tokens_seen": 5351328, "step": 769 }, { "epoch": 4.95575221238938, "grad_norm": 1.5033273696899414, "learning_rate": 1.006426501190233e-08, "loss": 0.0501, "num_input_tokens_seen": 5358672, "step": 770 }, { "epoch": 4.962188254223652, "grad_norm": 0.667352557182312, "learning_rate": 6.442686817914878e-09, "loss": 0.0082, "num_input_tokens_seen": 5365648, "step": 771 }, { "epoch": 4.968624296057924, "grad_norm": 0.9037322998046875, "learning_rate": 3.6246927316976875e-09, "loss": 0.0032, "num_input_tokens_seen": 5372432, "step": 772 }, { "epoch": 4.975060337892196, "grad_norm": 0.3071233630180359, "learning_rate": 1.6111908916965902e-09, "loss": 0.0017, "num_input_tokens_seen": 5379648, "step": 773 }, { "epoch": 4.981496379726468, "grad_norm": 0.7171315550804138, "learning_rate": 4.0283017735454066e-10, "loss": 0.0042, "num_input_tokens_seen": 5386864, "step": 774 }, { "epoch": 4.98793242156074, "grad_norm": 2.855295181274414, "learning_rate": 0.0, "loss": 0.0176, "num_input_tokens_seen": 5393616, "step": 775 }, { "epoch": 4.98793242156074, "num_input_tokens_seen": 5393616, "step": 775, "total_flos": 2.1382484588285133e+17, "train_loss": 0.5414434323177463, "train_runtime": 8640.8816, "train_samples_per_second": 11.503, "train_steps_per_second": 0.09 } ], "logging_steps": 1, "max_steps": 775, "num_input_tokens_seen": 5393616, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1382484588285133e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }