|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.98793242156074, |
|
"eval_steps": 500, |
|
"global_step": 775, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006436041834271922, |
|
"grad_norm": 181.50096130371094, |
|
"learning_rate": 8.333333333333335e-09, |
|
"loss": 8.4196, |
|
"num_input_tokens_seen": 6848, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012872083668543845, |
|
"grad_norm": 187.05642700195312, |
|
"learning_rate": 1.666666666666667e-08, |
|
"loss": 8.44, |
|
"num_input_tokens_seen": 14000, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.019308125502815767, |
|
"grad_norm": 182.92320251464844, |
|
"learning_rate": 2.5000000000000002e-08, |
|
"loss": 8.3839, |
|
"num_input_tokens_seen": 21152, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.02574416733708769, |
|
"grad_norm": 186.71311950683594, |
|
"learning_rate": 3.333333333333334e-08, |
|
"loss": 8.4024, |
|
"num_input_tokens_seen": 28224, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.032180209171359615, |
|
"grad_norm": 180.32656860351562, |
|
"learning_rate": 4.166666666666667e-08, |
|
"loss": 8.4594, |
|
"num_input_tokens_seen": 35360, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.038616251005631534, |
|
"grad_norm": 189.87557983398438, |
|
"learning_rate": 5.0000000000000004e-08, |
|
"loss": 8.4107, |
|
"num_input_tokens_seen": 42192, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04505229283990346, |
|
"grad_norm": 185.89984130859375, |
|
"learning_rate": 5.833333333333334e-08, |
|
"loss": 8.4551, |
|
"num_input_tokens_seen": 49088, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05148833467417538, |
|
"grad_norm": 188.8160400390625, |
|
"learning_rate": 6.666666666666668e-08, |
|
"loss": 8.4415, |
|
"num_input_tokens_seen": 55856, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.057924376508447305, |
|
"grad_norm": 190.1417236328125, |
|
"learning_rate": 7.500000000000001e-08, |
|
"loss": 8.4965, |
|
"num_input_tokens_seen": 63120, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06436041834271923, |
|
"grad_norm": 185.3598175048828, |
|
"learning_rate": 8.333333333333334e-08, |
|
"loss": 8.4251, |
|
"num_input_tokens_seen": 69968, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07079646017699115, |
|
"grad_norm": 183.81944274902344, |
|
"learning_rate": 9.166666666666668e-08, |
|
"loss": 8.4291, |
|
"num_input_tokens_seen": 77168, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07723250201126307, |
|
"grad_norm": 196.39779663085938, |
|
"learning_rate": 1.0000000000000001e-07, |
|
"loss": 8.4463, |
|
"num_input_tokens_seen": 84272, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.083668543845535, |
|
"grad_norm": 181.4925994873047, |
|
"learning_rate": 1.0833333333333335e-07, |
|
"loss": 8.5116, |
|
"num_input_tokens_seen": 91232, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09010458567980692, |
|
"grad_norm": 190.0314178466797, |
|
"learning_rate": 1.1666666666666668e-07, |
|
"loss": 8.4749, |
|
"num_input_tokens_seen": 97968, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09654062751407884, |
|
"grad_norm": 188.7615203857422, |
|
"learning_rate": 1.2500000000000002e-07, |
|
"loss": 8.3311, |
|
"num_input_tokens_seen": 104864, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10297666934835076, |
|
"grad_norm": 184.1820526123047, |
|
"learning_rate": 1.3333333333333336e-07, |
|
"loss": 8.3729, |
|
"num_input_tokens_seen": 111488, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10941271118262269, |
|
"grad_norm": 181.39308166503906, |
|
"learning_rate": 1.4166666666666668e-07, |
|
"loss": 8.4261, |
|
"num_input_tokens_seen": 118384, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11584875301689461, |
|
"grad_norm": 181.79583740234375, |
|
"learning_rate": 1.5000000000000002e-07, |
|
"loss": 8.3051, |
|
"num_input_tokens_seen": 125360, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12228479485116653, |
|
"grad_norm": 181.36965942382812, |
|
"learning_rate": 1.5833333333333336e-07, |
|
"loss": 8.2461, |
|
"num_input_tokens_seen": 132320, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12872083668543846, |
|
"grad_norm": 182.36839294433594, |
|
"learning_rate": 1.6666666666666668e-07, |
|
"loss": 8.2894, |
|
"num_input_tokens_seen": 139376, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13515687851971037, |
|
"grad_norm": 189.7889404296875, |
|
"learning_rate": 1.7500000000000002e-07, |
|
"loss": 8.2484, |
|
"num_input_tokens_seen": 146544, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1415929203539823, |
|
"grad_norm": 190.1185302734375, |
|
"learning_rate": 1.8333333333333336e-07, |
|
"loss": 8.3034, |
|
"num_input_tokens_seen": 153472, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14802896218825423, |
|
"grad_norm": 183.1331024169922, |
|
"learning_rate": 1.9166666666666668e-07, |
|
"loss": 8.054, |
|
"num_input_tokens_seen": 159856, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.15446500402252614, |
|
"grad_norm": 168.13046264648438, |
|
"learning_rate": 2.0000000000000002e-07, |
|
"loss": 7.9583, |
|
"num_input_tokens_seen": 166528, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16090104585679807, |
|
"grad_norm": 167.57830810546875, |
|
"learning_rate": 2.0833333333333333e-07, |
|
"loss": 7.9626, |
|
"num_input_tokens_seen": 173056, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16733708769107, |
|
"grad_norm": 170.6557159423828, |
|
"learning_rate": 2.166666666666667e-07, |
|
"loss": 7.8761, |
|
"num_input_tokens_seen": 179616, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1737731295253419, |
|
"grad_norm": 179.7693328857422, |
|
"learning_rate": 2.2500000000000002e-07, |
|
"loss": 7.8896, |
|
"num_input_tokens_seen": 186912, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18020917135961384, |
|
"grad_norm": 180.4197998046875, |
|
"learning_rate": 2.3333333333333336e-07, |
|
"loss": 7.8352, |
|
"num_input_tokens_seen": 193936, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.18664521319388577, |
|
"grad_norm": 164.2944793701172, |
|
"learning_rate": 2.416666666666667e-07, |
|
"loss": 7.691, |
|
"num_input_tokens_seen": 200672, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.19308125502815768, |
|
"grad_norm": 167.71722412109375, |
|
"learning_rate": 2.5000000000000004e-07, |
|
"loss": 7.7851, |
|
"num_input_tokens_seen": 207536, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1995172968624296, |
|
"grad_norm": 169.2217254638672, |
|
"learning_rate": 2.5833333333333333e-07, |
|
"loss": 7.7249, |
|
"num_input_tokens_seen": 214640, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.20595333869670152, |
|
"grad_norm": 155.74537658691406, |
|
"learning_rate": 2.666666666666667e-07, |
|
"loss": 6.8838, |
|
"num_input_tokens_seen": 221744, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.21238938053097345, |
|
"grad_norm": 148.12120056152344, |
|
"learning_rate": 2.75e-07, |
|
"loss": 6.7173, |
|
"num_input_tokens_seen": 228624, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.21882542236524538, |
|
"grad_norm": 150.97012329101562, |
|
"learning_rate": 2.8333333333333336e-07, |
|
"loss": 6.6793, |
|
"num_input_tokens_seen": 235456, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2252614641995173, |
|
"grad_norm": 149.623291015625, |
|
"learning_rate": 2.916666666666667e-07, |
|
"loss": 6.725, |
|
"num_input_tokens_seen": 242768, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23169750603378922, |
|
"grad_norm": 147.1656036376953, |
|
"learning_rate": 3.0000000000000004e-07, |
|
"loss": 6.6905, |
|
"num_input_tokens_seen": 249552, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.23813354786806115, |
|
"grad_norm": 151.0162811279297, |
|
"learning_rate": 3.083333333333334e-07, |
|
"loss": 6.6179, |
|
"num_input_tokens_seen": 256160, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.24456958970233306, |
|
"grad_norm": 150.03030395507812, |
|
"learning_rate": 3.166666666666667e-07, |
|
"loss": 6.501, |
|
"num_input_tokens_seen": 262912, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.251005631536605, |
|
"grad_norm": 145.5784149169922, |
|
"learning_rate": 3.25e-07, |
|
"loss": 6.4588, |
|
"num_input_tokens_seen": 269600, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.2574416733708769, |
|
"grad_norm": 143.5873565673828, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 6.3614, |
|
"num_input_tokens_seen": 276560, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.26387771520514886, |
|
"grad_norm": 144.9624481201172, |
|
"learning_rate": 3.416666666666667e-07, |
|
"loss": 6.2775, |
|
"num_input_tokens_seen": 283696, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.27031375703942073, |
|
"grad_norm": 146.71554565429688, |
|
"learning_rate": 3.5000000000000004e-07, |
|
"loss": 5.9868, |
|
"num_input_tokens_seen": 290832, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.27674979887369267, |
|
"grad_norm": 138.25450134277344, |
|
"learning_rate": 3.583333333333334e-07, |
|
"loss": 5.2286, |
|
"num_input_tokens_seen": 298096, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2831858407079646, |
|
"grad_norm": 156.28713989257812, |
|
"learning_rate": 3.666666666666667e-07, |
|
"loss": 4.5076, |
|
"num_input_tokens_seen": 305120, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.28962188254223653, |
|
"grad_norm": 178.4820556640625, |
|
"learning_rate": 3.75e-07, |
|
"loss": 4.1167, |
|
"num_input_tokens_seen": 312000, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.29605792437650846, |
|
"grad_norm": 317.7680358886719, |
|
"learning_rate": 3.8333333333333335e-07, |
|
"loss": 3.6585, |
|
"num_input_tokens_seen": 319008, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3024939662107804, |
|
"grad_norm": 282.17803955078125, |
|
"learning_rate": 3.9166666666666675e-07, |
|
"loss": 3.3613, |
|
"num_input_tokens_seen": 326192, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3089300080450523, |
|
"grad_norm": 257.7794494628906, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 3.1068, |
|
"num_input_tokens_seen": 333664, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3153660498793242, |
|
"grad_norm": 255.1024169921875, |
|
"learning_rate": 4.083333333333334e-07, |
|
"loss": 2.9368, |
|
"num_input_tokens_seen": 340912, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.32180209171359614, |
|
"grad_norm": 259.47015380859375, |
|
"learning_rate": 4.1666666666666667e-07, |
|
"loss": 2.3466, |
|
"num_input_tokens_seen": 347712, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.32823813354786807, |
|
"grad_norm": 263.3533935546875, |
|
"learning_rate": 4.2500000000000006e-07, |
|
"loss": 2.0645, |
|
"num_input_tokens_seen": 355232, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.33467417538214, |
|
"grad_norm": 239.1399688720703, |
|
"learning_rate": 4.333333333333334e-07, |
|
"loss": 1.7729, |
|
"num_input_tokens_seen": 361968, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3411102172164119, |
|
"grad_norm": 257.4410095214844, |
|
"learning_rate": 4.416666666666667e-07, |
|
"loss": 1.6199, |
|
"num_input_tokens_seen": 369136, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3475462590506838, |
|
"grad_norm": 169.56935119628906, |
|
"learning_rate": 4.5000000000000003e-07, |
|
"loss": 1.1593, |
|
"num_input_tokens_seen": 375904, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.35398230088495575, |
|
"grad_norm": 95.25677490234375, |
|
"learning_rate": 4.583333333333333e-07, |
|
"loss": 0.7199, |
|
"num_input_tokens_seen": 382848, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3604183427192277, |
|
"grad_norm": 48.7137451171875, |
|
"learning_rate": 4.666666666666667e-07, |
|
"loss": 0.4394, |
|
"num_input_tokens_seen": 389680, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3668543845534996, |
|
"grad_norm": 62.34474563598633, |
|
"learning_rate": 4.7500000000000006e-07, |
|
"loss": 0.3806, |
|
"num_input_tokens_seen": 396192, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.37329042638777155, |
|
"grad_norm": 30.711780548095703, |
|
"learning_rate": 4.833333333333334e-07, |
|
"loss": 0.3185, |
|
"num_input_tokens_seen": 403104, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.3797264682220434, |
|
"grad_norm": 34.46913528442383, |
|
"learning_rate": 4.916666666666667e-07, |
|
"loss": 0.3056, |
|
"num_input_tokens_seen": 410176, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.38616251005631536, |
|
"grad_norm": 25.92363166809082, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 0.2981, |
|
"num_input_tokens_seen": 416928, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3925985518905873, |
|
"grad_norm": 11.064619064331055, |
|
"learning_rate": 5.083333333333334e-07, |
|
"loss": 0.2473, |
|
"num_input_tokens_seen": 424128, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.3990345937248592, |
|
"grad_norm": 55.367347717285156, |
|
"learning_rate": 5.166666666666667e-07, |
|
"loss": 0.2924, |
|
"num_input_tokens_seen": 430864, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.40547063555913115, |
|
"grad_norm": 42.00873947143555, |
|
"learning_rate": 5.250000000000001e-07, |
|
"loss": 0.2656, |
|
"num_input_tokens_seen": 437744, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.41190667739340303, |
|
"grad_norm": 13.313591003417969, |
|
"learning_rate": 5.333333333333335e-07, |
|
"loss": 0.2335, |
|
"num_input_tokens_seen": 444624, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.41834271922767496, |
|
"grad_norm": 60.489715576171875, |
|
"learning_rate": 5.416666666666667e-07, |
|
"loss": 0.2647, |
|
"num_input_tokens_seen": 451696, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4247787610619469, |
|
"grad_norm": 77.01821899414062, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.3003, |
|
"num_input_tokens_seen": 458784, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.43121480289621883, |
|
"grad_norm": 58.067596435546875, |
|
"learning_rate": 5.583333333333333e-07, |
|
"loss": 0.2656, |
|
"num_input_tokens_seen": 465920, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.43765084473049076, |
|
"grad_norm": 12.40570068359375, |
|
"learning_rate": 5.666666666666667e-07, |
|
"loss": 0.2212, |
|
"num_input_tokens_seen": 473152, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4440868865647627, |
|
"grad_norm": 35.392276763916016, |
|
"learning_rate": 5.750000000000001e-07, |
|
"loss": 0.2532, |
|
"num_input_tokens_seen": 480544, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.4505229283990346, |
|
"grad_norm": 51.42181396484375, |
|
"learning_rate": 5.833333333333334e-07, |
|
"loss": 0.2799, |
|
"num_input_tokens_seen": 487552, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4569589702333065, |
|
"grad_norm": 45.73934555053711, |
|
"learning_rate": 5.916666666666667e-07, |
|
"loss": 0.2876, |
|
"num_input_tokens_seen": 494256, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.46339501206757844, |
|
"grad_norm": 20.654096603393555, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.2191, |
|
"num_input_tokens_seen": 500768, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.46983105390185037, |
|
"grad_norm": 21.078027725219727, |
|
"learning_rate": 6.083333333333334e-07, |
|
"loss": 0.2344, |
|
"num_input_tokens_seen": 507136, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4762670957361223, |
|
"grad_norm": 36.7335205078125, |
|
"learning_rate": 6.166666666666668e-07, |
|
"loss": 0.2547, |
|
"num_input_tokens_seen": 514208, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4827031375703942, |
|
"grad_norm": 34.47271728515625, |
|
"learning_rate": 6.25e-07, |
|
"loss": 0.2349, |
|
"num_input_tokens_seen": 521120, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4891391794046661, |
|
"grad_norm": 5.103244781494141, |
|
"learning_rate": 6.333333333333334e-07, |
|
"loss": 0.2045, |
|
"num_input_tokens_seen": 527824, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.49557522123893805, |
|
"grad_norm": 22.47526741027832, |
|
"learning_rate": 6.416666666666667e-07, |
|
"loss": 0.2262, |
|
"num_input_tokens_seen": 534832, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.50201126307321, |
|
"grad_norm": 30.610803604125977, |
|
"learning_rate": 6.5e-07, |
|
"loss": 0.2393, |
|
"num_input_tokens_seen": 541696, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5084473049074819, |
|
"grad_norm": 10.922965049743652, |
|
"learning_rate": 6.583333333333333e-07, |
|
"loss": 0.2206, |
|
"num_input_tokens_seen": 548608, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5148833467417538, |
|
"grad_norm": 17.484182357788086, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.2029, |
|
"num_input_tokens_seen": 555456, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5213193885760258, |
|
"grad_norm": 16.49226188659668, |
|
"learning_rate": 6.750000000000001e-07, |
|
"loss": 0.2125, |
|
"num_input_tokens_seen": 562768, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5277554304102977, |
|
"grad_norm": 9.977084159851074, |
|
"learning_rate": 6.833333333333334e-07, |
|
"loss": 0.2023, |
|
"num_input_tokens_seen": 569536, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5341914722445696, |
|
"grad_norm": 17.79197120666504, |
|
"learning_rate": 6.916666666666668e-07, |
|
"loss": 0.2262, |
|
"num_input_tokens_seen": 576096, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5406275140788415, |
|
"grad_norm": 16.699260711669922, |
|
"learning_rate": 7.000000000000001e-07, |
|
"loss": 0.2003, |
|
"num_input_tokens_seen": 583472, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5470635559131134, |
|
"grad_norm": 25.02164077758789, |
|
"learning_rate": 7.083333333333334e-07, |
|
"loss": 0.2351, |
|
"num_input_tokens_seen": 590304, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5534995977473853, |
|
"grad_norm": 3.8612709045410156, |
|
"learning_rate": 7.166666666666668e-07, |
|
"loss": 0.1839, |
|
"num_input_tokens_seen": 597152, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5599356395816573, |
|
"grad_norm": 31.555482864379883, |
|
"learning_rate": 7.25e-07, |
|
"loss": 0.2315, |
|
"num_input_tokens_seen": 604208, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5663716814159292, |
|
"grad_norm": 54.94756317138672, |
|
"learning_rate": 7.333333333333334e-07, |
|
"loss": 0.2732, |
|
"num_input_tokens_seen": 610896, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5728077232502011, |
|
"grad_norm": 30.55241584777832, |
|
"learning_rate": 7.416666666666668e-07, |
|
"loss": 0.2405, |
|
"num_input_tokens_seen": 618112, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5792437650844731, |
|
"grad_norm": 16.687997817993164, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.2005, |
|
"num_input_tokens_seen": 625040, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.585679806918745, |
|
"grad_norm": 10.350790977478027, |
|
"learning_rate": 7.583333333333334e-07, |
|
"loss": 0.2005, |
|
"num_input_tokens_seen": 631840, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5921158487530169, |
|
"grad_norm": 25.88368797302246, |
|
"learning_rate": 7.666666666666667e-07, |
|
"loss": 0.2115, |
|
"num_input_tokens_seen": 638752, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5985518905872889, |
|
"grad_norm": 17.11625099182129, |
|
"learning_rate": 7.750000000000001e-07, |
|
"loss": 0.2141, |
|
"num_input_tokens_seen": 645968, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6049879324215608, |
|
"grad_norm": 12.70864200592041, |
|
"learning_rate": 7.833333333333335e-07, |
|
"loss": 0.1898, |
|
"num_input_tokens_seen": 652752, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6114239742558326, |
|
"grad_norm": 3.674001455307007, |
|
"learning_rate": 7.916666666666667e-07, |
|
"loss": 0.2099, |
|
"num_input_tokens_seen": 660048, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6178600160901045, |
|
"grad_norm": 20.51032066345215, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 0.2014, |
|
"num_input_tokens_seen": 666752, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6242960579243765, |
|
"grad_norm": 47.562381744384766, |
|
"learning_rate": 8.083333333333334e-07, |
|
"loss": 0.2349, |
|
"num_input_tokens_seen": 673856, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6307320997586484, |
|
"grad_norm": 35.69169998168945, |
|
"learning_rate": 8.166666666666668e-07, |
|
"loss": 0.2205, |
|
"num_input_tokens_seen": 681104, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6371681415929203, |
|
"grad_norm": 10.080629348754883, |
|
"learning_rate": 8.250000000000001e-07, |
|
"loss": 0.199, |
|
"num_input_tokens_seen": 688128, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6436041834271923, |
|
"grad_norm": 26.242666244506836, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.236, |
|
"num_input_tokens_seen": 695216, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6500402252614642, |
|
"grad_norm": 22.0434627532959, |
|
"learning_rate": 8.416666666666667e-07, |
|
"loss": 0.2265, |
|
"num_input_tokens_seen": 701968, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6564762670957361, |
|
"grad_norm": 27.378408432006836, |
|
"learning_rate": 8.500000000000001e-07, |
|
"loss": 0.2443, |
|
"num_input_tokens_seen": 708928, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6629123089300081, |
|
"grad_norm": 11.929069519042969, |
|
"learning_rate": 8.583333333333334e-07, |
|
"loss": 0.2086, |
|
"num_input_tokens_seen": 715952, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.66934835076428, |
|
"grad_norm": 6.677243232727051, |
|
"learning_rate": 8.666666666666668e-07, |
|
"loss": 0.1915, |
|
"num_input_tokens_seen": 722928, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6757843925985519, |
|
"grad_norm": 17.033658981323242, |
|
"learning_rate": 8.75e-07, |
|
"loss": 0.1967, |
|
"num_input_tokens_seen": 730160, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6822204344328238, |
|
"grad_norm": 6.806990146636963, |
|
"learning_rate": 8.833333333333334e-07, |
|
"loss": 0.188, |
|
"num_input_tokens_seen": 737088, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6886564762670957, |
|
"grad_norm": 4.871335506439209, |
|
"learning_rate": 8.916666666666668e-07, |
|
"loss": 0.1895, |
|
"num_input_tokens_seen": 743744, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6950925181013676, |
|
"grad_norm": 9.054122924804688, |
|
"learning_rate": 9.000000000000001e-07, |
|
"loss": 0.1667, |
|
"num_input_tokens_seen": 750496, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7015285599356396, |
|
"grad_norm": 15.78903579711914, |
|
"learning_rate": 9.083333333333335e-07, |
|
"loss": 0.1976, |
|
"num_input_tokens_seen": 757792, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7079646017699115, |
|
"grad_norm": 10.51429271697998, |
|
"learning_rate": 9.166666666666666e-07, |
|
"loss": 0.2057, |
|
"num_input_tokens_seen": 764992, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7144006436041834, |
|
"grad_norm": 24.346830368041992, |
|
"learning_rate": 9.25e-07, |
|
"loss": 0.2002, |
|
"num_input_tokens_seen": 771648, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7208366854384554, |
|
"grad_norm": 46.50392532348633, |
|
"learning_rate": 9.333333333333334e-07, |
|
"loss": 0.2173, |
|
"num_input_tokens_seen": 778480, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 22.505762100219727, |
|
"learning_rate": 9.416666666666667e-07, |
|
"loss": 0.1756, |
|
"num_input_tokens_seen": 785328, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7337087691069992, |
|
"grad_norm": 5.675211429595947, |
|
"learning_rate": 9.500000000000001e-07, |
|
"loss": 0.1786, |
|
"num_input_tokens_seen": 792592, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7401448109412712, |
|
"grad_norm": 14.814651489257812, |
|
"learning_rate": 9.583333333333334e-07, |
|
"loss": 0.1879, |
|
"num_input_tokens_seen": 799808, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7465808527755431, |
|
"grad_norm": 13.106173515319824, |
|
"learning_rate": 9.666666666666668e-07, |
|
"loss": 0.173, |
|
"num_input_tokens_seen": 806896, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7530168946098149, |
|
"grad_norm": 24.56918716430664, |
|
"learning_rate": 9.750000000000002e-07, |
|
"loss": 0.1714, |
|
"num_input_tokens_seen": 813536, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7594529364440868, |
|
"grad_norm": 27.256954193115234, |
|
"learning_rate": 9.833333333333334e-07, |
|
"loss": 0.2015, |
|
"num_input_tokens_seen": 820608, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7658889782783588, |
|
"grad_norm": 4.209413051605225, |
|
"learning_rate": 9.916666666666668e-07, |
|
"loss": 0.1847, |
|
"num_input_tokens_seen": 827776, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7723250201126307, |
|
"grad_norm": 18.684349060058594, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.1876, |
|
"num_input_tokens_seen": 834704, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7787610619469026, |
|
"grad_norm": 19.470041275024414, |
|
"learning_rate": 1.0083333333333333e-06, |
|
"loss": 0.1937, |
|
"num_input_tokens_seen": 841568, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7851971037811746, |
|
"grad_norm": 11.242873191833496, |
|
"learning_rate": 1.0166666666666667e-06, |
|
"loss": 0.1974, |
|
"num_input_tokens_seen": 848704, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7916331456154465, |
|
"grad_norm": 26.72730255126953, |
|
"learning_rate": 1.025e-06, |
|
"loss": 0.2099, |
|
"num_input_tokens_seen": 855664, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7980691874497184, |
|
"grad_norm": 41.4288215637207, |
|
"learning_rate": 1.0333333333333333e-06, |
|
"loss": 0.2239, |
|
"num_input_tokens_seen": 862464, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8045052292839904, |
|
"grad_norm": 27.283327102661133, |
|
"learning_rate": 1.0416666666666667e-06, |
|
"loss": 0.1953, |
|
"num_input_tokens_seen": 869376, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8109412711182623, |
|
"grad_norm": 4.882501602172852, |
|
"learning_rate": 1.0500000000000001e-06, |
|
"loss": 0.1906, |
|
"num_input_tokens_seen": 876848, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8173773129525342, |
|
"grad_norm": 8.478296279907227, |
|
"learning_rate": 1.0583333333333335e-06, |
|
"loss": 0.1852, |
|
"num_input_tokens_seen": 883664, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8238133547868061, |
|
"grad_norm": 6.773479461669922, |
|
"learning_rate": 1.066666666666667e-06, |
|
"loss": 0.198, |
|
"num_input_tokens_seen": 890592, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.830249396621078, |
|
"grad_norm": 21.877212524414062, |
|
"learning_rate": 1.075e-06, |
|
"loss": 0.2105, |
|
"num_input_tokens_seen": 898048, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.8366854384553499, |
|
"grad_norm": 12.123941421508789, |
|
"learning_rate": 1.0833333333333335e-06, |
|
"loss": 0.1899, |
|
"num_input_tokens_seen": 905040, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8431214802896219, |
|
"grad_norm": 15.84151554107666, |
|
"learning_rate": 1.0916666666666667e-06, |
|
"loss": 0.1742, |
|
"num_input_tokens_seen": 912080, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8495575221238938, |
|
"grad_norm": 8.174356460571289, |
|
"learning_rate": 1.1e-06, |
|
"loss": 0.1585, |
|
"num_input_tokens_seen": 919424, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8559935639581657, |
|
"grad_norm": 14.87348461151123, |
|
"learning_rate": 1.1083333333333335e-06, |
|
"loss": 0.1878, |
|
"num_input_tokens_seen": 926608, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8624296057924377, |
|
"grad_norm": 11.989315032958984, |
|
"learning_rate": 1.1166666666666666e-06, |
|
"loss": 0.1748, |
|
"num_input_tokens_seen": 933712, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8688656476267096, |
|
"grad_norm": 9.659666061401367, |
|
"learning_rate": 1.125e-06, |
|
"loss": 0.1944, |
|
"num_input_tokens_seen": 940304, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8753016894609815, |
|
"grad_norm": 20.558237075805664, |
|
"learning_rate": 1.1333333333333334e-06, |
|
"loss": 0.1727, |
|
"num_input_tokens_seen": 947008, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8817377312952535, |
|
"grad_norm": 8.66232967376709, |
|
"learning_rate": 1.1416666666666668e-06, |
|
"loss": 0.1748, |
|
"num_input_tokens_seen": 954112, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8881737731295254, |
|
"grad_norm": 16.516559600830078, |
|
"learning_rate": 1.1500000000000002e-06, |
|
"loss": 0.1625, |
|
"num_input_tokens_seen": 961120, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8946098149637972, |
|
"grad_norm": 6.140871047973633, |
|
"learning_rate": 1.1583333333333334e-06, |
|
"loss": 0.1649, |
|
"num_input_tokens_seen": 967792, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9010458567980691, |
|
"grad_norm": 11.593804359436035, |
|
"learning_rate": 1.1666666666666668e-06, |
|
"loss": 0.1738, |
|
"num_input_tokens_seen": 974496, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9074818986323411, |
|
"grad_norm": 26.92620849609375, |
|
"learning_rate": 1.175e-06, |
|
"loss": 0.2221, |
|
"num_input_tokens_seen": 981344, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.913917940466613, |
|
"grad_norm": 26.845230102539062, |
|
"learning_rate": 1.1833333333333334e-06, |
|
"loss": 0.1989, |
|
"num_input_tokens_seen": 988224, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9203539823008849, |
|
"grad_norm": 12.823030471801758, |
|
"learning_rate": 1.1916666666666668e-06, |
|
"loss": 0.1569, |
|
"num_input_tokens_seen": 995552, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9267900241351569, |
|
"grad_norm": 14.508877754211426, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.1594, |
|
"num_input_tokens_seen": 1002224, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9332260659694288, |
|
"grad_norm": 13.097854614257812, |
|
"learning_rate": 1.2083333333333333e-06, |
|
"loss": 0.1609, |
|
"num_input_tokens_seen": 1009312, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9396621078037007, |
|
"grad_norm": 12.183431625366211, |
|
"learning_rate": 1.2166666666666667e-06, |
|
"loss": 0.1649, |
|
"num_input_tokens_seen": 1016256, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9460981496379727, |
|
"grad_norm": 10.628469467163086, |
|
"learning_rate": 1.2250000000000001e-06, |
|
"loss": 0.1412, |
|
"num_input_tokens_seen": 1022880, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9525341914722446, |
|
"grad_norm": 11.713327407836914, |
|
"learning_rate": 1.2333333333333335e-06, |
|
"loss": 0.165, |
|
"num_input_tokens_seen": 1029856, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9589702333065165, |
|
"grad_norm": 10.031126976013184, |
|
"learning_rate": 1.2416666666666667e-06, |
|
"loss": 0.1971, |
|
"num_input_tokens_seen": 1036928, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.9654062751407884, |
|
"grad_norm": 34.122074127197266, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.1843, |
|
"num_input_tokens_seen": 1044000, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9718423169750603, |
|
"grad_norm": 13.707520484924316, |
|
"learning_rate": 1.2583333333333333e-06, |
|
"loss": 0.1628, |
|
"num_input_tokens_seen": 1050928, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9782783588093322, |
|
"grad_norm": 8.588343620300293, |
|
"learning_rate": 1.2666666666666669e-06, |
|
"loss": 0.1878, |
|
"num_input_tokens_seen": 1057920, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9847144006436042, |
|
"grad_norm": 4.411599159240723, |
|
"learning_rate": 1.275e-06, |
|
"loss": 0.1153, |
|
"num_input_tokens_seen": 1064704, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9911504424778761, |
|
"grad_norm": 13.095698356628418, |
|
"learning_rate": 1.2833333333333335e-06, |
|
"loss": 0.1622, |
|
"num_input_tokens_seen": 1071760, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.997586484312148, |
|
"grad_norm": 14.093315124511719, |
|
"learning_rate": 1.2916666666666669e-06, |
|
"loss": 0.1549, |
|
"num_input_tokens_seen": 1078912, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.00402252614642, |
|
"grad_norm": 17.082075119018555, |
|
"learning_rate": 1.3e-06, |
|
"loss": 0.1729, |
|
"num_input_tokens_seen": 1086288, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.010458567980692, |
|
"grad_norm": 4.992012977600098, |
|
"learning_rate": 1.3083333333333334e-06, |
|
"loss": 0.1198, |
|
"num_input_tokens_seen": 1093584, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.0168946098149638, |
|
"grad_norm": 5.45336389541626, |
|
"learning_rate": 1.3166666666666666e-06, |
|
"loss": 0.1723, |
|
"num_input_tokens_seen": 1100432, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.0233306516492358, |
|
"grad_norm": 7.4880757331848145, |
|
"learning_rate": 1.3250000000000002e-06, |
|
"loss": 0.1485, |
|
"num_input_tokens_seen": 1107280, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.0297666934835077, |
|
"grad_norm": 40.28890609741211, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.1757, |
|
"num_input_tokens_seen": 1113968, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0362027353177796, |
|
"grad_norm": 39.24993896484375, |
|
"learning_rate": 1.3416666666666666e-06, |
|
"loss": 0.1907, |
|
"num_input_tokens_seen": 1120752, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.0426387771520516, |
|
"grad_norm": 5.63855504989624, |
|
"learning_rate": 1.3500000000000002e-06, |
|
"loss": 0.1842, |
|
"num_input_tokens_seen": 1127712, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0490748189863235, |
|
"grad_norm": 5.1802754402160645, |
|
"learning_rate": 1.3583333333333334e-06, |
|
"loss": 0.1549, |
|
"num_input_tokens_seen": 1134592, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.0555108608205954, |
|
"grad_norm": 4.200067043304443, |
|
"learning_rate": 1.3666666666666668e-06, |
|
"loss": 0.153, |
|
"num_input_tokens_seen": 1141888, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0619469026548674, |
|
"grad_norm": 6.892277240753174, |
|
"learning_rate": 1.3750000000000002e-06, |
|
"loss": 0.1532, |
|
"num_input_tokens_seen": 1148688, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.068382944489139, |
|
"grad_norm": 11.852892875671387, |
|
"learning_rate": 1.3833333333333336e-06, |
|
"loss": 0.1629, |
|
"num_input_tokens_seen": 1155552, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.074818986323411, |
|
"grad_norm": 8.346076011657715, |
|
"learning_rate": 1.3916666666666668e-06, |
|
"loss": 0.1708, |
|
"num_input_tokens_seen": 1162624, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.081255028157683, |
|
"grad_norm": 7.836976528167725, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 0.1461, |
|
"num_input_tokens_seen": 1169904, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0876910699919549, |
|
"grad_norm": 15.59913158416748, |
|
"learning_rate": 1.4083333333333335e-06, |
|
"loss": 0.1402, |
|
"num_input_tokens_seen": 1176928, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.0941271118262268, |
|
"grad_norm": 8.46536636352539, |
|
"learning_rate": 1.4166666666666667e-06, |
|
"loss": 0.143, |
|
"num_input_tokens_seen": 1184160, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.1005631536604987, |
|
"grad_norm": 7.491546154022217, |
|
"learning_rate": 1.425e-06, |
|
"loss": 0.1454, |
|
"num_input_tokens_seen": 1191120, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.1069991954947707, |
|
"grad_norm": 16.70829200744629, |
|
"learning_rate": 1.4333333333333335e-06, |
|
"loss": 0.1286, |
|
"num_input_tokens_seen": 1197920, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.1134352373290426, |
|
"grad_norm": 16.273927688598633, |
|
"learning_rate": 1.4416666666666667e-06, |
|
"loss": 0.1523, |
|
"num_input_tokens_seen": 1204576, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.1198712791633145, |
|
"grad_norm": 8.122928619384766, |
|
"learning_rate": 1.45e-06, |
|
"loss": 0.1345, |
|
"num_input_tokens_seen": 1211344, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.1263073209975865, |
|
"grad_norm": 27.850522994995117, |
|
"learning_rate": 1.4583333333333335e-06, |
|
"loss": 0.1749, |
|
"num_input_tokens_seen": 1218432, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.1327433628318584, |
|
"grad_norm": 30.498666763305664, |
|
"learning_rate": 1.4666666666666669e-06, |
|
"loss": 0.166, |
|
"num_input_tokens_seen": 1225728, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.1391794046661303, |
|
"grad_norm": 26.916791915893555, |
|
"learning_rate": 1.475e-06, |
|
"loss": 0.1708, |
|
"num_input_tokens_seen": 1232784, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.1456154465004023, |
|
"grad_norm": 13.593954086303711, |
|
"learning_rate": 1.4833333333333337e-06, |
|
"loss": 0.1363, |
|
"num_input_tokens_seen": 1239472, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1520514883346742, |
|
"grad_norm": 17.63590431213379, |
|
"learning_rate": 1.4916666666666669e-06, |
|
"loss": 0.1369, |
|
"num_input_tokens_seen": 1246864, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.1584875301689461, |
|
"grad_norm": 12.465302467346191, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.1632, |
|
"num_input_tokens_seen": 1253936, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.164923572003218, |
|
"grad_norm": 18.099266052246094, |
|
"learning_rate": 1.5083333333333336e-06, |
|
"loss": 0.1734, |
|
"num_input_tokens_seen": 1261120, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.17135961383749, |
|
"grad_norm": 12.134090423583984, |
|
"learning_rate": 1.5166666666666668e-06, |
|
"loss": 0.135, |
|
"num_input_tokens_seen": 1268208, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.177795655671762, |
|
"grad_norm": 5.747508525848389, |
|
"learning_rate": 1.525e-06, |
|
"loss": 0.1355, |
|
"num_input_tokens_seen": 1275296, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.1842316975060339, |
|
"grad_norm": 16.193449020385742, |
|
"learning_rate": 1.5333333333333334e-06, |
|
"loss": 0.1324, |
|
"num_input_tokens_seen": 1282320, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.1906677393403058, |
|
"grad_norm": 23.576427459716797, |
|
"learning_rate": 1.5416666666666668e-06, |
|
"loss": 0.1754, |
|
"num_input_tokens_seen": 1289008, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.1971037811745777, |
|
"grad_norm": 4.542221546173096, |
|
"learning_rate": 1.5500000000000002e-06, |
|
"loss": 0.1484, |
|
"num_input_tokens_seen": 1296208, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.2035398230088497, |
|
"grad_norm": 6.084584712982178, |
|
"learning_rate": 1.5583333333333334e-06, |
|
"loss": 0.1315, |
|
"num_input_tokens_seen": 1303072, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.2099758648431216, |
|
"grad_norm": 18.8467960357666, |
|
"learning_rate": 1.566666666666667e-06, |
|
"loss": 0.1665, |
|
"num_input_tokens_seen": 1310320, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.2164119066773935, |
|
"grad_norm": 6.79512882232666, |
|
"learning_rate": 1.5750000000000002e-06, |
|
"loss": 0.1406, |
|
"num_input_tokens_seen": 1317728, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.2228479485116655, |
|
"grad_norm": 11.130036354064941, |
|
"learning_rate": 1.5833333333333333e-06, |
|
"loss": 0.1391, |
|
"num_input_tokens_seen": 1325216, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2292839903459372, |
|
"grad_norm": 17.00998306274414, |
|
"learning_rate": 1.591666666666667e-06, |
|
"loss": 0.1339, |
|
"num_input_tokens_seen": 1332272, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.235720032180209, |
|
"grad_norm": 16.623762130737305, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.1613, |
|
"num_input_tokens_seen": 1339008, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.242156074014481, |
|
"grad_norm": 15.660219192504883, |
|
"learning_rate": 1.6083333333333333e-06, |
|
"loss": 0.1274, |
|
"num_input_tokens_seen": 1345664, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.248592115848753, |
|
"grad_norm": 21.379770278930664, |
|
"learning_rate": 1.6166666666666667e-06, |
|
"loss": 0.1882, |
|
"num_input_tokens_seen": 1352720, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.255028157683025, |
|
"grad_norm": 8.196439743041992, |
|
"learning_rate": 1.6250000000000001e-06, |
|
"loss": 0.1106, |
|
"num_input_tokens_seen": 1359616, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2614641995172968, |
|
"grad_norm": 4.444194793701172, |
|
"learning_rate": 1.6333333333333335e-06, |
|
"loss": 0.1249, |
|
"num_input_tokens_seen": 1366656, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.2679002413515688, |
|
"grad_norm": 10.585016250610352, |
|
"learning_rate": 1.6416666666666667e-06, |
|
"loss": 0.1499, |
|
"num_input_tokens_seen": 1373904, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.2743362831858407, |
|
"grad_norm": 18.406293869018555, |
|
"learning_rate": 1.6500000000000003e-06, |
|
"loss": 0.1512, |
|
"num_input_tokens_seen": 1380528, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.2807723250201126, |
|
"grad_norm": 5.323694229125977, |
|
"learning_rate": 1.6583333333333335e-06, |
|
"loss": 0.1166, |
|
"num_input_tokens_seen": 1386912, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.2872083668543846, |
|
"grad_norm": 20.726289749145508, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.21, |
|
"num_input_tokens_seen": 1393648, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2936444086886565, |
|
"grad_norm": 24.05786895751953, |
|
"learning_rate": 1.6750000000000003e-06, |
|
"loss": 0.1915, |
|
"num_input_tokens_seen": 1400640, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.3000804505229284, |
|
"grad_norm": 19.30237579345703, |
|
"learning_rate": 1.6833333333333335e-06, |
|
"loss": 0.1911, |
|
"num_input_tokens_seen": 1407984, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.3065164923572004, |
|
"grad_norm": 6.517977714538574, |
|
"learning_rate": 1.6916666666666666e-06, |
|
"loss": 0.1487, |
|
"num_input_tokens_seen": 1414672, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.3129525341914723, |
|
"grad_norm": 30.81540870666504, |
|
"learning_rate": 1.7000000000000002e-06, |
|
"loss": 0.2154, |
|
"num_input_tokens_seen": 1421872, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.3193885760257442, |
|
"grad_norm": 44.00107955932617, |
|
"learning_rate": 1.7083333333333334e-06, |
|
"loss": 0.2909, |
|
"num_input_tokens_seen": 1428640, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.3258246178600162, |
|
"grad_norm": 41.464210510253906, |
|
"learning_rate": 1.7166666666666668e-06, |
|
"loss": 0.271, |
|
"num_input_tokens_seen": 1435456, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.332260659694288, |
|
"grad_norm": 12.14904499053955, |
|
"learning_rate": 1.725e-06, |
|
"loss": 0.1616, |
|
"num_input_tokens_seen": 1442592, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.33869670152856, |
|
"grad_norm": 8.393083572387695, |
|
"learning_rate": 1.7333333333333336e-06, |
|
"loss": 0.1427, |
|
"num_input_tokens_seen": 1449200, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.3451327433628317, |
|
"grad_norm": 11.04562759399414, |
|
"learning_rate": 1.7416666666666668e-06, |
|
"loss": 0.1602, |
|
"num_input_tokens_seen": 1455920, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.3515687851971037, |
|
"grad_norm": 12.494465827941895, |
|
"learning_rate": 1.75e-06, |
|
"loss": 0.169, |
|
"num_input_tokens_seen": 1462624, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3580048270313756, |
|
"grad_norm": 5.395782470703125, |
|
"learning_rate": 1.7583333333333336e-06, |
|
"loss": 0.1285, |
|
"num_input_tokens_seen": 1469520, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.3644408688656475, |
|
"grad_norm": 19.773469924926758, |
|
"learning_rate": 1.7666666666666668e-06, |
|
"loss": 0.1636, |
|
"num_input_tokens_seen": 1476592, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.3708769106999195, |
|
"grad_norm": 28.318584442138672, |
|
"learning_rate": 1.7750000000000002e-06, |
|
"loss": 0.1702, |
|
"num_input_tokens_seen": 1483632, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.3773129525341914, |
|
"grad_norm": 20.225502014160156, |
|
"learning_rate": 1.7833333333333336e-06, |
|
"loss": 0.1562, |
|
"num_input_tokens_seen": 1490528, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.3837489943684633, |
|
"grad_norm": 5.386298179626465, |
|
"learning_rate": 1.7916666666666667e-06, |
|
"loss": 0.1537, |
|
"num_input_tokens_seen": 1497648, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.3901850362027353, |
|
"grad_norm": 6.181918144226074, |
|
"learning_rate": 1.8000000000000001e-06, |
|
"loss": 0.1114, |
|
"num_input_tokens_seen": 1504800, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.3966210780370072, |
|
"grad_norm": 5.554294109344482, |
|
"learning_rate": 1.8083333333333335e-06, |
|
"loss": 0.1017, |
|
"num_input_tokens_seen": 1512240, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.4030571198712791, |
|
"grad_norm": 5.2657880783081055, |
|
"learning_rate": 1.816666666666667e-06, |
|
"loss": 0.1184, |
|
"num_input_tokens_seen": 1519200, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.409493161705551, |
|
"grad_norm": 8.627300262451172, |
|
"learning_rate": 1.825e-06, |
|
"loss": 0.1343, |
|
"num_input_tokens_seen": 1526272, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.415929203539823, |
|
"grad_norm": 7.965896129608154, |
|
"learning_rate": 1.8333333333333333e-06, |
|
"loss": 0.1271, |
|
"num_input_tokens_seen": 1533440, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.422365245374095, |
|
"grad_norm": 7.089397430419922, |
|
"learning_rate": 1.8416666666666669e-06, |
|
"loss": 0.1383, |
|
"num_input_tokens_seen": 1540272, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.4288012872083669, |
|
"grad_norm": 4.354486465454102, |
|
"learning_rate": 1.85e-06, |
|
"loss": 0.1558, |
|
"num_input_tokens_seen": 1547632, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.4352373290426388, |
|
"grad_norm": 7.841838836669922, |
|
"learning_rate": 1.8583333333333335e-06, |
|
"loss": 0.1312, |
|
"num_input_tokens_seen": 1554608, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.4416733708769107, |
|
"grad_norm": 6.812905311584473, |
|
"learning_rate": 1.8666666666666669e-06, |
|
"loss": 0.1212, |
|
"num_input_tokens_seen": 1561472, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.4481094127111827, |
|
"grad_norm": 5.038280963897705, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 0.1342, |
|
"num_input_tokens_seen": 1568496, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"grad_norm": 4.255394458770752, |
|
"learning_rate": 1.8833333333333334e-06, |
|
"loss": 0.096, |
|
"num_input_tokens_seen": 1575184, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.4609814963797265, |
|
"grad_norm": 3.311915397644043, |
|
"learning_rate": 1.8916666666666668e-06, |
|
"loss": 0.0982, |
|
"num_input_tokens_seen": 1582080, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.4674175382139985, |
|
"grad_norm": 4.303693771362305, |
|
"learning_rate": 1.9000000000000002e-06, |
|
"loss": 0.1099, |
|
"num_input_tokens_seen": 1588688, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.4738535800482704, |
|
"grad_norm": 14.854019165039062, |
|
"learning_rate": 1.9083333333333334e-06, |
|
"loss": 0.1265, |
|
"num_input_tokens_seen": 1595216, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.4802896218825423, |
|
"grad_norm": 10.509958267211914, |
|
"learning_rate": 1.916666666666667e-06, |
|
"loss": 0.1066, |
|
"num_input_tokens_seen": 1602336, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4867256637168142, |
|
"grad_norm": 9.096975326538086, |
|
"learning_rate": 1.925e-06, |
|
"loss": 0.1593, |
|
"num_input_tokens_seen": 1609024, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.4931617055510862, |
|
"grad_norm": 18.944650650024414, |
|
"learning_rate": 1.9333333333333336e-06, |
|
"loss": 0.1891, |
|
"num_input_tokens_seen": 1615712, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.4995977473853581, |
|
"grad_norm": 6.735738754272461, |
|
"learning_rate": 1.9416666666666666e-06, |
|
"loss": 0.0867, |
|
"num_input_tokens_seen": 1622608, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.50603378921963, |
|
"grad_norm": 12.395522117614746, |
|
"learning_rate": 1.9500000000000004e-06, |
|
"loss": 0.1286, |
|
"num_input_tokens_seen": 1629520, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.512469831053902, |
|
"grad_norm": 13.864114761352539, |
|
"learning_rate": 1.9583333333333334e-06, |
|
"loss": 0.1262, |
|
"num_input_tokens_seen": 1636320, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.518905872888174, |
|
"grad_norm": 4.206810474395752, |
|
"learning_rate": 1.9666666666666668e-06, |
|
"loss": 0.0878, |
|
"num_input_tokens_seen": 1643216, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.5253419147224458, |
|
"grad_norm": 9.294787406921387, |
|
"learning_rate": 1.975e-06, |
|
"loss": 0.1532, |
|
"num_input_tokens_seen": 1650256, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.5317779565567178, |
|
"grad_norm": 5.397519111633301, |
|
"learning_rate": 1.9833333333333335e-06, |
|
"loss": 0.1232, |
|
"num_input_tokens_seen": 1657328, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.5382139983909895, |
|
"grad_norm": 4.74614953994751, |
|
"learning_rate": 1.991666666666667e-06, |
|
"loss": 0.1119, |
|
"num_input_tokens_seen": 1664192, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.5446500402252614, |
|
"grad_norm": 8.80385971069336, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.1334, |
|
"num_input_tokens_seen": 1670944, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.5510860820595334, |
|
"grad_norm": 12.17174243927002, |
|
"learning_rate": 2.0083333333333337e-06, |
|
"loss": 0.1224, |
|
"num_input_tokens_seen": 1677792, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.5575221238938053, |
|
"grad_norm": 6.9399800300598145, |
|
"learning_rate": 2.0166666666666667e-06, |
|
"loss": 0.106, |
|
"num_input_tokens_seen": 1684640, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.5639581657280772, |
|
"grad_norm": 5.804976463317871, |
|
"learning_rate": 2.025e-06, |
|
"loss": 0.1237, |
|
"num_input_tokens_seen": 1691664, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.5703942075623492, |
|
"grad_norm": 5.245293617248535, |
|
"learning_rate": 2.0333333333333335e-06, |
|
"loss": 0.095, |
|
"num_input_tokens_seen": 1698528, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.576830249396621, |
|
"grad_norm": 2.9305763244628906, |
|
"learning_rate": 2.041666666666667e-06, |
|
"loss": 0.0741, |
|
"num_input_tokens_seen": 1705600, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.583266291230893, |
|
"grad_norm": 10.269381523132324, |
|
"learning_rate": 2.05e-06, |
|
"loss": 0.1239, |
|
"num_input_tokens_seen": 1712704, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.589702333065165, |
|
"grad_norm": 4.453558921813965, |
|
"learning_rate": 2.0583333333333337e-06, |
|
"loss": 0.091, |
|
"num_input_tokens_seen": 1719568, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.5961383748994369, |
|
"grad_norm": 16.549911499023438, |
|
"learning_rate": 2.0666666666666666e-06, |
|
"loss": 0.1403, |
|
"num_input_tokens_seen": 1726480, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.6025744167337088, |
|
"grad_norm": 17.650426864624023, |
|
"learning_rate": 2.075e-06, |
|
"loss": 0.1638, |
|
"num_input_tokens_seen": 1733936, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.6090104585679805, |
|
"grad_norm": 5.322378158569336, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 0.1343, |
|
"num_input_tokens_seen": 1741008, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6154465004022525, |
|
"grad_norm": 11.570721626281738, |
|
"learning_rate": 2.091666666666667e-06, |
|
"loss": 0.1558, |
|
"num_input_tokens_seen": 1748240, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.6218825422365244, |
|
"grad_norm": 2.901578426361084, |
|
"learning_rate": 2.1000000000000002e-06, |
|
"loss": 0.0809, |
|
"num_input_tokens_seen": 1755072, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.6283185840707963, |
|
"grad_norm": 8.972208023071289, |
|
"learning_rate": 2.1083333333333336e-06, |
|
"loss": 0.1435, |
|
"num_input_tokens_seen": 1762048, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.6347546259050683, |
|
"grad_norm": 2.364783525466919, |
|
"learning_rate": 2.116666666666667e-06, |
|
"loss": 0.0887, |
|
"num_input_tokens_seen": 1769200, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.6411906677393402, |
|
"grad_norm": 3.7692675590515137, |
|
"learning_rate": 2.125e-06, |
|
"loss": 0.1038, |
|
"num_input_tokens_seen": 1776112, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.6476267095736121, |
|
"grad_norm": 3.0572264194488525, |
|
"learning_rate": 2.133333333333334e-06, |
|
"loss": 0.0889, |
|
"num_input_tokens_seen": 1783664, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.654062751407884, |
|
"grad_norm": 3.8316140174865723, |
|
"learning_rate": 2.1416666666666668e-06, |
|
"loss": 0.0751, |
|
"num_input_tokens_seen": 1790096, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.660498793242156, |
|
"grad_norm": 5.133974552154541, |
|
"learning_rate": 2.15e-06, |
|
"loss": 0.0921, |
|
"num_input_tokens_seen": 1796912, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.666934835076428, |
|
"grad_norm": 5.002286911010742, |
|
"learning_rate": 2.1583333333333336e-06, |
|
"loss": 0.1102, |
|
"num_input_tokens_seen": 1804144, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.6733708769106999, |
|
"grad_norm": 8.221644401550293, |
|
"learning_rate": 2.166666666666667e-06, |
|
"loss": 0.1036, |
|
"num_input_tokens_seen": 1811040, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6798069187449718, |
|
"grad_norm": 6.029963493347168, |
|
"learning_rate": 2.1750000000000004e-06, |
|
"loss": 0.1093, |
|
"num_input_tokens_seen": 1818064, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.6862429605792437, |
|
"grad_norm": 6.715224742889404, |
|
"learning_rate": 2.1833333333333333e-06, |
|
"loss": 0.1714, |
|
"num_input_tokens_seen": 1825056, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.6926790024135157, |
|
"grad_norm": 6.136181354522705, |
|
"learning_rate": 2.191666666666667e-06, |
|
"loss": 0.1007, |
|
"num_input_tokens_seen": 1831968, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.6991150442477876, |
|
"grad_norm": 5.392821788787842, |
|
"learning_rate": 2.2e-06, |
|
"loss": 0.109, |
|
"num_input_tokens_seen": 1838656, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.7055510860820595, |
|
"grad_norm": 3.0743072032928467, |
|
"learning_rate": 2.2083333333333335e-06, |
|
"loss": 0.0574, |
|
"num_input_tokens_seen": 1845760, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.7119871279163315, |
|
"grad_norm": 4.986932277679443, |
|
"learning_rate": 2.216666666666667e-06, |
|
"loss": 0.0697, |
|
"num_input_tokens_seen": 1852480, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.7184231697506034, |
|
"grad_norm": 3.588496685028076, |
|
"learning_rate": 2.2250000000000003e-06, |
|
"loss": 0.1188, |
|
"num_input_tokens_seen": 1859312, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.7248592115848753, |
|
"grad_norm": 3.850637912750244, |
|
"learning_rate": 2.2333333333333333e-06, |
|
"loss": 0.0998, |
|
"num_input_tokens_seen": 1866256, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.7312952534191473, |
|
"grad_norm": 10.427441596984863, |
|
"learning_rate": 2.2416666666666667e-06, |
|
"loss": 0.1083, |
|
"num_input_tokens_seen": 1873104, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.7377312952534192, |
|
"grad_norm": 6.516834259033203, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.0749, |
|
"num_input_tokens_seen": 1880192, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7441673370876911, |
|
"grad_norm": 5.243050575256348, |
|
"learning_rate": 2.2583333333333335e-06, |
|
"loss": 0.0771, |
|
"num_input_tokens_seen": 1887008, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.750603378921963, |
|
"grad_norm": 3.874545097351074, |
|
"learning_rate": 2.266666666666667e-06, |
|
"loss": 0.0646, |
|
"num_input_tokens_seen": 1894096, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.757039420756235, |
|
"grad_norm": 4.2995476722717285, |
|
"learning_rate": 2.2750000000000002e-06, |
|
"loss": 0.1147, |
|
"num_input_tokens_seen": 1901216, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.763475462590507, |
|
"grad_norm": 9.720036506652832, |
|
"learning_rate": 2.2833333333333336e-06, |
|
"loss": 0.0917, |
|
"num_input_tokens_seen": 1908160, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.7699115044247788, |
|
"grad_norm": 7.985558986663818, |
|
"learning_rate": 2.2916666666666666e-06, |
|
"loss": 0.106, |
|
"num_input_tokens_seen": 1915104, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.7763475462590508, |
|
"grad_norm": 4.0768327713012695, |
|
"learning_rate": 2.3000000000000004e-06, |
|
"loss": 0.0849, |
|
"num_input_tokens_seen": 1922128, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.7827835880933227, |
|
"grad_norm": 5.870975017547607, |
|
"learning_rate": 2.3083333333333334e-06, |
|
"loss": 0.1074, |
|
"num_input_tokens_seen": 1929200, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.7892196299275946, |
|
"grad_norm": 3.490455389022827, |
|
"learning_rate": 2.316666666666667e-06, |
|
"loss": 0.0981, |
|
"num_input_tokens_seen": 1936144, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.7956556717618666, |
|
"grad_norm": 4.1171183586120605, |
|
"learning_rate": 2.325e-06, |
|
"loss": 0.1008, |
|
"num_input_tokens_seen": 1943136, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.8020917135961385, |
|
"grad_norm": 7.664264678955078, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 0.1032, |
|
"num_input_tokens_seen": 1950208, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.8085277554304104, |
|
"grad_norm": 4.865798473358154, |
|
"learning_rate": 2.341666666666667e-06, |
|
"loss": 0.0711, |
|
"num_input_tokens_seen": 1957056, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.8149637972646824, |
|
"grad_norm": 2.5436036586761475, |
|
"learning_rate": 2.35e-06, |
|
"loss": 0.0901, |
|
"num_input_tokens_seen": 1964176, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.8213998390989543, |
|
"grad_norm": 6.305140972137451, |
|
"learning_rate": 2.3583333333333338e-06, |
|
"loss": 0.0847, |
|
"num_input_tokens_seen": 1970736, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.827835880933226, |
|
"grad_norm": 2.6688449382781982, |
|
"learning_rate": 2.3666666666666667e-06, |
|
"loss": 0.0752, |
|
"num_input_tokens_seen": 1977440, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.834271922767498, |
|
"grad_norm": 2.5124077796936035, |
|
"learning_rate": 2.375e-06, |
|
"loss": 0.068, |
|
"num_input_tokens_seen": 1984464, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.8407079646017699, |
|
"grad_norm": 6.168980121612549, |
|
"learning_rate": 2.3833333333333335e-06, |
|
"loss": 0.1088, |
|
"num_input_tokens_seen": 1991248, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.8471440064360418, |
|
"grad_norm": 5.883851051330566, |
|
"learning_rate": 2.391666666666667e-06, |
|
"loss": 0.1017, |
|
"num_input_tokens_seen": 1998496, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.8535800482703138, |
|
"grad_norm": 9.373373985290527, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.13, |
|
"num_input_tokens_seen": 2005552, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.8600160901045857, |
|
"grad_norm": 9.111586570739746, |
|
"learning_rate": 2.4083333333333337e-06, |
|
"loss": 0.0998, |
|
"num_input_tokens_seen": 2012272, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.8664521319388576, |
|
"grad_norm": 5.353252410888672, |
|
"learning_rate": 2.4166666666666667e-06, |
|
"loss": 0.0779, |
|
"num_input_tokens_seen": 2019056, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.8728881737731295, |
|
"grad_norm": 6.586206436157227, |
|
"learning_rate": 2.425e-06, |
|
"loss": 0.0907, |
|
"num_input_tokens_seen": 2025760, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.8793242156074015, |
|
"grad_norm": 5.485732555389404, |
|
"learning_rate": 2.4333333333333335e-06, |
|
"loss": 0.0911, |
|
"num_input_tokens_seen": 2032928, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.8857602574416734, |
|
"grad_norm": 3.5151724815368652, |
|
"learning_rate": 2.441666666666667e-06, |
|
"loss": 0.0987, |
|
"num_input_tokens_seen": 2039856, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.8921962992759453, |
|
"grad_norm": 3.680494546890259, |
|
"learning_rate": 2.4500000000000003e-06, |
|
"loss": 0.1254, |
|
"num_input_tokens_seen": 2046896, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.898632341110217, |
|
"grad_norm": 3.302248001098633, |
|
"learning_rate": 2.4583333333333332e-06, |
|
"loss": 0.0494, |
|
"num_input_tokens_seen": 2053600, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.905068382944489, |
|
"grad_norm": 3.605039119720459, |
|
"learning_rate": 2.466666666666667e-06, |
|
"loss": 0.1082, |
|
"num_input_tokens_seen": 2060240, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.911504424778761, |
|
"grad_norm": 2.6599857807159424, |
|
"learning_rate": 2.475e-06, |
|
"loss": 0.0785, |
|
"num_input_tokens_seen": 2067936, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.9179404666130329, |
|
"grad_norm": 7.149720191955566, |
|
"learning_rate": 2.4833333333333334e-06, |
|
"loss": 0.1026, |
|
"num_input_tokens_seen": 2074656, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.9243765084473048, |
|
"grad_norm": 4.549108982086182, |
|
"learning_rate": 2.491666666666667e-06, |
|
"loss": 0.0617, |
|
"num_input_tokens_seen": 2081568, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.9308125502815767, |
|
"grad_norm": 2.900601625442505, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0659, |
|
"num_input_tokens_seen": 2088368, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9372485921158487, |
|
"grad_norm": 6.378200531005859, |
|
"learning_rate": 2.5083333333333336e-06, |
|
"loss": 0.088, |
|
"num_input_tokens_seen": 2095728, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.9436846339501206, |
|
"grad_norm": 6.718885898590088, |
|
"learning_rate": 2.5166666666666666e-06, |
|
"loss": 0.0771, |
|
"num_input_tokens_seen": 2103104, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.9501206757843925, |
|
"grad_norm": 3.587820291519165, |
|
"learning_rate": 2.5250000000000004e-06, |
|
"loss": 0.0642, |
|
"num_input_tokens_seen": 2110032, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.9565567176186645, |
|
"grad_norm": 7.106460094451904, |
|
"learning_rate": 2.5333333333333338e-06, |
|
"loss": 0.0947, |
|
"num_input_tokens_seen": 2117056, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.9629927594529364, |
|
"grad_norm": 3.480973243713379, |
|
"learning_rate": 2.5416666666666668e-06, |
|
"loss": 0.0975, |
|
"num_input_tokens_seen": 2123552, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.9694288012872083, |
|
"grad_norm": 2.709892511367798, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.0527, |
|
"num_input_tokens_seen": 2130128, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.9758648431214803, |
|
"grad_norm": 3.3756306171417236, |
|
"learning_rate": 2.558333333333334e-06, |
|
"loss": 0.0869, |
|
"num_input_tokens_seen": 2137232, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.9823008849557522, |
|
"grad_norm": 6.785555839538574, |
|
"learning_rate": 2.566666666666667e-06, |
|
"loss": 0.0605, |
|
"num_input_tokens_seen": 2143776, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.9887369267900241, |
|
"grad_norm": 3.4628372192382812, |
|
"learning_rate": 2.5750000000000003e-06, |
|
"loss": 0.0684, |
|
"num_input_tokens_seen": 2150976, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.995172968624296, |
|
"grad_norm": 3.56925892829895, |
|
"learning_rate": 2.5833333333333337e-06, |
|
"loss": 0.0701, |
|
"num_input_tokens_seen": 2158080, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.001609010458568, |
|
"grad_norm": 4.06324577331543, |
|
"learning_rate": 2.5916666666666667e-06, |
|
"loss": 0.0699, |
|
"num_input_tokens_seen": 2164992, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.00804505229284, |
|
"grad_norm": 7.733395576477051, |
|
"learning_rate": 2.6e-06, |
|
"loss": 0.0949, |
|
"num_input_tokens_seen": 2171952, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.014481094127112, |
|
"grad_norm": 7.6149139404296875, |
|
"learning_rate": 2.608333333333333e-06, |
|
"loss": 0.0911, |
|
"num_input_tokens_seen": 2179072, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.020917135961384, |
|
"grad_norm": 2.538379192352295, |
|
"learning_rate": 2.616666666666667e-06, |
|
"loss": 0.0615, |
|
"num_input_tokens_seen": 2185872, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.0273531777956557, |
|
"grad_norm": 2.5334603786468506, |
|
"learning_rate": 2.6250000000000003e-06, |
|
"loss": 0.0448, |
|
"num_input_tokens_seen": 2192656, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.0337892196299276, |
|
"grad_norm": 4.8344340324401855, |
|
"learning_rate": 2.6333333333333332e-06, |
|
"loss": 0.0619, |
|
"num_input_tokens_seen": 2199728, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.0402252614641996, |
|
"grad_norm": 4.393861770629883, |
|
"learning_rate": 2.6416666666666666e-06, |
|
"loss": 0.0475, |
|
"num_input_tokens_seen": 2206608, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.0466613032984715, |
|
"grad_norm": 2.7922892570495605, |
|
"learning_rate": 2.6500000000000005e-06, |
|
"loss": 0.0438, |
|
"num_input_tokens_seen": 2213856, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.0530973451327434, |
|
"grad_norm": 1.5408401489257812, |
|
"learning_rate": 2.6583333333333334e-06, |
|
"loss": 0.0245, |
|
"num_input_tokens_seen": 2220528, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.0595333869670154, |
|
"grad_norm": 5.6088433265686035, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.0716, |
|
"num_input_tokens_seen": 2227616, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.0659694288012873, |
|
"grad_norm": 9.311470985412598, |
|
"learning_rate": 2.6750000000000002e-06, |
|
"loss": 0.1015, |
|
"num_input_tokens_seen": 2234304, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.0724054706355592, |
|
"grad_norm": 5.244096279144287, |
|
"learning_rate": 2.683333333333333e-06, |
|
"loss": 0.0753, |
|
"num_input_tokens_seen": 2241088, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.078841512469831, |
|
"grad_norm": 3.443998098373413, |
|
"learning_rate": 2.691666666666667e-06, |
|
"loss": 0.0521, |
|
"num_input_tokens_seen": 2247632, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.085277554304103, |
|
"grad_norm": 2.4997072219848633, |
|
"learning_rate": 2.7000000000000004e-06, |
|
"loss": 0.0287, |
|
"num_input_tokens_seen": 2254448, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.091713596138375, |
|
"grad_norm": 4.817678928375244, |
|
"learning_rate": 2.7083333333333334e-06, |
|
"loss": 0.0471, |
|
"num_input_tokens_seen": 2261424, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.098149637972647, |
|
"grad_norm": 6.326369285583496, |
|
"learning_rate": 2.7166666666666668e-06, |
|
"loss": 0.0697, |
|
"num_input_tokens_seen": 2268528, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.104585679806919, |
|
"grad_norm": 3.599905490875244, |
|
"learning_rate": 2.7250000000000006e-06, |
|
"loss": 0.0438, |
|
"num_input_tokens_seen": 2275328, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.111021721641191, |
|
"grad_norm": 2.8037264347076416, |
|
"learning_rate": 2.7333333333333336e-06, |
|
"loss": 0.0475, |
|
"num_input_tokens_seen": 2282400, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.1174577634754628, |
|
"grad_norm": 2.7425622940063477, |
|
"learning_rate": 2.741666666666667e-06, |
|
"loss": 0.0601, |
|
"num_input_tokens_seen": 2289312, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.1238938053097347, |
|
"grad_norm": 2.064824342727661, |
|
"learning_rate": 2.7500000000000004e-06, |
|
"loss": 0.0355, |
|
"num_input_tokens_seen": 2295824, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.1303298471440066, |
|
"grad_norm": 3.695521593093872, |
|
"learning_rate": 2.7583333333333333e-06, |
|
"loss": 0.0515, |
|
"num_input_tokens_seen": 2303024, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.136765888978278, |
|
"grad_norm": 3.3290112018585205, |
|
"learning_rate": 2.766666666666667e-06, |
|
"loss": 0.0601, |
|
"num_input_tokens_seen": 2309904, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.14320193081255, |
|
"grad_norm": 2.751953363418579, |
|
"learning_rate": 2.7750000000000005e-06, |
|
"loss": 0.0288, |
|
"num_input_tokens_seen": 2316416, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.149637972646822, |
|
"grad_norm": 4.679827690124512, |
|
"learning_rate": 2.7833333333333335e-06, |
|
"loss": 0.0563, |
|
"num_input_tokens_seen": 2323088, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.156074014481094, |
|
"grad_norm": 9.301896095275879, |
|
"learning_rate": 2.791666666666667e-06, |
|
"loss": 0.1176, |
|
"num_input_tokens_seen": 2329968, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.162510056315366, |
|
"grad_norm": 6.16165828704834, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.0965, |
|
"num_input_tokens_seen": 2336656, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.168946098149638, |
|
"grad_norm": 2.442518711090088, |
|
"learning_rate": 2.8083333333333333e-06, |
|
"loss": 0.0359, |
|
"num_input_tokens_seen": 2343984, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.1753821399839097, |
|
"grad_norm": 3.537282943725586, |
|
"learning_rate": 2.816666666666667e-06, |
|
"loss": 0.0609, |
|
"num_input_tokens_seen": 2350912, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.1818181818181817, |
|
"grad_norm": 5.1499223709106445, |
|
"learning_rate": 2.825e-06, |
|
"loss": 0.0768, |
|
"num_input_tokens_seen": 2357680, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.1882542236524536, |
|
"grad_norm": 8.193970680236816, |
|
"learning_rate": 2.8333333333333335e-06, |
|
"loss": 0.0849, |
|
"num_input_tokens_seen": 2364736, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.1946902654867255, |
|
"grad_norm": 2.2035670280456543, |
|
"learning_rate": 2.841666666666667e-06, |
|
"loss": 0.0581, |
|
"num_input_tokens_seen": 2371568, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.2011263073209975, |
|
"grad_norm": 2.7924435138702393, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.046, |
|
"num_input_tokens_seen": 2378384, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.2075623491552694, |
|
"grad_norm": 4.6174445152282715, |
|
"learning_rate": 2.8583333333333336e-06, |
|
"loss": 0.0674, |
|
"num_input_tokens_seen": 2385584, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.2139983909895413, |
|
"grad_norm": 2.4459989070892334, |
|
"learning_rate": 2.866666666666667e-06, |
|
"loss": 0.0563, |
|
"num_input_tokens_seen": 2392640, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.2204344328238133, |
|
"grad_norm": 2.3443846702575684, |
|
"learning_rate": 2.875e-06, |
|
"loss": 0.0621, |
|
"num_input_tokens_seen": 2399936, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.226870474658085, |
|
"grad_norm": 2.865879774093628, |
|
"learning_rate": 2.8833333333333334e-06, |
|
"loss": 0.0659, |
|
"num_input_tokens_seen": 2406928, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.233306516492357, |
|
"grad_norm": 4.03169059753418, |
|
"learning_rate": 2.8916666666666672e-06, |
|
"loss": 0.039, |
|
"num_input_tokens_seen": 2413888, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.239742558326629, |
|
"grad_norm": 1.693605899810791, |
|
"learning_rate": 2.9e-06, |
|
"loss": 0.0239, |
|
"num_input_tokens_seen": 2421104, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.246178600160901, |
|
"grad_norm": 2.7058444023132324, |
|
"learning_rate": 2.9083333333333336e-06, |
|
"loss": 0.0521, |
|
"num_input_tokens_seen": 2428128, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.252614641995173, |
|
"grad_norm": 3.9503567218780518, |
|
"learning_rate": 2.916666666666667e-06, |
|
"loss": 0.0561, |
|
"num_input_tokens_seen": 2434880, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.259050683829445, |
|
"grad_norm": 4.444098472595215, |
|
"learning_rate": 2.925e-06, |
|
"loss": 0.0622, |
|
"num_input_tokens_seen": 2441824, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.265486725663717, |
|
"grad_norm": 3.7014055252075195, |
|
"learning_rate": 2.9333333333333338e-06, |
|
"loss": 0.0875, |
|
"num_input_tokens_seen": 2448688, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.2719227674979887, |
|
"grad_norm": 4.078037261962891, |
|
"learning_rate": 2.941666666666667e-06, |
|
"loss": 0.0307, |
|
"num_input_tokens_seen": 2455488, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.2783588093322606, |
|
"grad_norm": 3.753711700439453, |
|
"learning_rate": 2.95e-06, |
|
"loss": 0.063, |
|
"num_input_tokens_seen": 2462240, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.2847948511665326, |
|
"grad_norm": 2.9653706550598145, |
|
"learning_rate": 2.9583333333333335e-06, |
|
"loss": 0.0404, |
|
"num_input_tokens_seen": 2469408, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.2912308930008045, |
|
"grad_norm": 3.8090925216674805, |
|
"learning_rate": 2.9666666666666673e-06, |
|
"loss": 0.0759, |
|
"num_input_tokens_seen": 2476240, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.2976669348350764, |
|
"grad_norm": 2.4684033393859863, |
|
"learning_rate": 2.9750000000000003e-06, |
|
"loss": 0.0488, |
|
"num_input_tokens_seen": 2482864, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.3041029766693484, |
|
"grad_norm": 2.0687243938446045, |
|
"learning_rate": 2.9833333333333337e-06, |
|
"loss": 0.0499, |
|
"num_input_tokens_seen": 2489664, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.3105390185036203, |
|
"grad_norm": 3.223965883255005, |
|
"learning_rate": 2.991666666666667e-06, |
|
"loss": 0.0441, |
|
"num_input_tokens_seen": 2496704, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.3169750603378922, |
|
"grad_norm": 2.1407270431518555, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0485, |
|
"num_input_tokens_seen": 2503920, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.323411102172164, |
|
"grad_norm": 2.632885217666626, |
|
"learning_rate": 3.0083333333333335e-06, |
|
"loss": 0.0674, |
|
"num_input_tokens_seen": 2510544, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.329847144006436, |
|
"grad_norm": 3.258030652999878, |
|
"learning_rate": 3.0166666666666673e-06, |
|
"loss": 0.0689, |
|
"num_input_tokens_seen": 2517408, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.336283185840708, |
|
"grad_norm": 6.024159908294678, |
|
"learning_rate": 3.0250000000000003e-06, |
|
"loss": 0.0618, |
|
"num_input_tokens_seen": 2524160, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.34271922767498, |
|
"grad_norm": 4.7281999588012695, |
|
"learning_rate": 3.0333333333333337e-06, |
|
"loss": 0.0629, |
|
"num_input_tokens_seen": 2531072, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.349155269509252, |
|
"grad_norm": 4.178661823272705, |
|
"learning_rate": 3.0416666666666666e-06, |
|
"loss": 0.0499, |
|
"num_input_tokens_seen": 2537920, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.355591311343524, |
|
"grad_norm": 1.5715197324752808, |
|
"learning_rate": 3.05e-06, |
|
"loss": 0.0361, |
|
"num_input_tokens_seen": 2544736, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.3620273531777958, |
|
"grad_norm": 2.835855722427368, |
|
"learning_rate": 3.058333333333334e-06, |
|
"loss": 0.0471, |
|
"num_input_tokens_seen": 2552016, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.3684633950120677, |
|
"grad_norm": 2.870889902114868, |
|
"learning_rate": 3.066666666666667e-06, |
|
"loss": 0.0622, |
|
"num_input_tokens_seen": 2559616, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.3748994368463396, |
|
"grad_norm": 1.7411049604415894, |
|
"learning_rate": 3.075e-06, |
|
"loss": 0.0328, |
|
"num_input_tokens_seen": 2566240, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.3813354786806116, |
|
"grad_norm": 3.0499918460845947, |
|
"learning_rate": 3.0833333333333336e-06, |
|
"loss": 0.0437, |
|
"num_input_tokens_seen": 2573392, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.3877715205148835, |
|
"grad_norm": 4.242414474487305, |
|
"learning_rate": 3.0916666666666666e-06, |
|
"loss": 0.0644, |
|
"num_input_tokens_seen": 2580544, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.3942075623491554, |
|
"grad_norm": 2.962906837463379, |
|
"learning_rate": 3.1000000000000004e-06, |
|
"loss": 0.0553, |
|
"num_input_tokens_seen": 2587344, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.4006436041834274, |
|
"grad_norm": 4.431301116943359, |
|
"learning_rate": 3.1083333333333338e-06, |
|
"loss": 0.061, |
|
"num_input_tokens_seen": 2594560, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.4070796460176993, |
|
"grad_norm": 5.075587272644043, |
|
"learning_rate": 3.1166666666666668e-06, |
|
"loss": 0.0866, |
|
"num_input_tokens_seen": 2601408, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.4135156878519712, |
|
"grad_norm": 3.877520799636841, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.0632, |
|
"num_input_tokens_seen": 2608624, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.419951729686243, |
|
"grad_norm": 2.9902503490448, |
|
"learning_rate": 3.133333333333334e-06, |
|
"loss": 0.0395, |
|
"num_input_tokens_seen": 2615456, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.426387771520515, |
|
"grad_norm": 3.7800397872924805, |
|
"learning_rate": 3.141666666666667e-06, |
|
"loss": 0.0819, |
|
"num_input_tokens_seen": 2622672, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.432823813354787, |
|
"grad_norm": 2.4674911499023438, |
|
"learning_rate": 3.1500000000000003e-06, |
|
"loss": 0.064, |
|
"num_input_tokens_seen": 2629952, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.439259855189059, |
|
"grad_norm": 5.3331146240234375, |
|
"learning_rate": 3.1583333333333337e-06, |
|
"loss": 0.0803, |
|
"num_input_tokens_seen": 2637168, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.445695897023331, |
|
"grad_norm": 9.950706481933594, |
|
"learning_rate": 3.1666666666666667e-06, |
|
"loss": 0.0798, |
|
"num_input_tokens_seen": 2644144, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.4521319388576024, |
|
"grad_norm": 5.1734442710876465, |
|
"learning_rate": 3.175e-06, |
|
"loss": 0.0544, |
|
"num_input_tokens_seen": 2651376, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.4585679806918743, |
|
"grad_norm": 2.5671188831329346, |
|
"learning_rate": 3.183333333333334e-06, |
|
"loss": 0.0629, |
|
"num_input_tokens_seen": 2658336, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.4650040225261463, |
|
"grad_norm": 4.357182025909424, |
|
"learning_rate": 3.191666666666667e-06, |
|
"loss": 0.0471, |
|
"num_input_tokens_seen": 2665360, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.471440064360418, |
|
"grad_norm": 4.694338321685791, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.0533, |
|
"num_input_tokens_seen": 2672704, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.47787610619469, |
|
"grad_norm": 2.391195774078369, |
|
"learning_rate": 3.2083333333333337e-06, |
|
"loss": 0.0542, |
|
"num_input_tokens_seen": 2679872, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.484312148028962, |
|
"grad_norm": 3.859102249145508, |
|
"learning_rate": 3.2166666666666666e-06, |
|
"loss": 0.034, |
|
"num_input_tokens_seen": 2686672, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.490748189863234, |
|
"grad_norm": 2.4710166454315186, |
|
"learning_rate": 3.2250000000000005e-06, |
|
"loss": 0.0517, |
|
"num_input_tokens_seen": 2693520, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.497184231697506, |
|
"grad_norm": 3.309068202972412, |
|
"learning_rate": 3.2333333333333334e-06, |
|
"loss": 0.0698, |
|
"num_input_tokens_seen": 2700432, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.503620273531778, |
|
"grad_norm": 4.21011209487915, |
|
"learning_rate": 3.241666666666667e-06, |
|
"loss": 0.0573, |
|
"num_input_tokens_seen": 2707184, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.51005631536605, |
|
"grad_norm": 4.34623908996582, |
|
"learning_rate": 3.2500000000000002e-06, |
|
"loss": 0.0568, |
|
"num_input_tokens_seen": 2713936, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.5164923572003217, |
|
"grad_norm": 3.361445188522339, |
|
"learning_rate": 3.258333333333333e-06, |
|
"loss": 0.0669, |
|
"num_input_tokens_seen": 2721216, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.5229283990345936, |
|
"grad_norm": 2.091728925704956, |
|
"learning_rate": 3.266666666666667e-06, |
|
"loss": 0.027, |
|
"num_input_tokens_seen": 2727968, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.5293644408688656, |
|
"grad_norm": 2.1977951526641846, |
|
"learning_rate": 3.2750000000000004e-06, |
|
"loss": 0.0303, |
|
"num_input_tokens_seen": 2734816, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.5358004827031375, |
|
"grad_norm": 2.7409942150115967, |
|
"learning_rate": 3.2833333333333334e-06, |
|
"loss": 0.0392, |
|
"num_input_tokens_seen": 2741744, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.5422365245374094, |
|
"grad_norm": 3.695770740509033, |
|
"learning_rate": 3.2916666666666668e-06, |
|
"loss": 0.0813, |
|
"num_input_tokens_seen": 2748640, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.5486725663716814, |
|
"grad_norm": 3.674891471862793, |
|
"learning_rate": 3.3000000000000006e-06, |
|
"loss": 0.0403, |
|
"num_input_tokens_seen": 2755888, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.5551086082059533, |
|
"grad_norm": 1.716131567955017, |
|
"learning_rate": 3.3083333333333336e-06, |
|
"loss": 0.0222, |
|
"num_input_tokens_seen": 2762464, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.5615446500402252, |
|
"grad_norm": 2.5081095695495605, |
|
"learning_rate": 3.316666666666667e-06, |
|
"loss": 0.0611, |
|
"num_input_tokens_seen": 2769712, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.567980691874497, |
|
"grad_norm": 1.9974850416183472, |
|
"learning_rate": 3.3250000000000004e-06, |
|
"loss": 0.035, |
|
"num_input_tokens_seen": 2776736, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.574416733708769, |
|
"grad_norm": 4.233558177947998, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.068, |
|
"num_input_tokens_seen": 2783376, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.580852775543041, |
|
"grad_norm": 3.359081983566284, |
|
"learning_rate": 3.341666666666667e-06, |
|
"loss": 0.0543, |
|
"num_input_tokens_seen": 2790528, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.587288817377313, |
|
"grad_norm": 2.669712543487549, |
|
"learning_rate": 3.3500000000000005e-06, |
|
"loss": 0.0466, |
|
"num_input_tokens_seen": 2797312, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.593724859211585, |
|
"grad_norm": 3.1529603004455566, |
|
"learning_rate": 3.3583333333333335e-06, |
|
"loss": 0.0626, |
|
"num_input_tokens_seen": 2804288, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.600160901045857, |
|
"grad_norm": 3.069842576980591, |
|
"learning_rate": 3.366666666666667e-06, |
|
"loss": 0.0589, |
|
"num_input_tokens_seen": 2811456, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.6065969428801288, |
|
"grad_norm": 1.881988525390625, |
|
"learning_rate": 3.3750000000000003e-06, |
|
"loss": 0.0415, |
|
"num_input_tokens_seen": 2818080, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.6130329847144007, |
|
"grad_norm": 1.862747073173523, |
|
"learning_rate": 3.3833333333333333e-06, |
|
"loss": 0.0344, |
|
"num_input_tokens_seen": 2825136, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.6194690265486726, |
|
"grad_norm": 2.6847071647644043, |
|
"learning_rate": 3.391666666666667e-06, |
|
"loss": 0.0423, |
|
"num_input_tokens_seen": 2832400, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.6259050683829446, |
|
"grad_norm": 3.631681203842163, |
|
"learning_rate": 3.4000000000000005e-06, |
|
"loss": 0.0838, |
|
"num_input_tokens_seen": 2839712, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.6323411102172165, |
|
"grad_norm": 3.7878201007843018, |
|
"learning_rate": 3.4083333333333335e-06, |
|
"loss": 0.0732, |
|
"num_input_tokens_seen": 2846160, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.6387771520514884, |
|
"grad_norm": 2.826582431793213, |
|
"learning_rate": 3.416666666666667e-06, |
|
"loss": 0.0464, |
|
"num_input_tokens_seen": 2853520, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.6452131938857604, |
|
"grad_norm": 2.330638885498047, |
|
"learning_rate": 3.4250000000000007e-06, |
|
"loss": 0.0387, |
|
"num_input_tokens_seen": 2860384, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.6516492357200323, |
|
"grad_norm": 2.330439567565918, |
|
"learning_rate": 3.4333333333333336e-06, |
|
"loss": 0.0507, |
|
"num_input_tokens_seen": 2867360, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.6580852775543042, |
|
"grad_norm": 3.929145336151123, |
|
"learning_rate": 3.441666666666667e-06, |
|
"loss": 0.0549, |
|
"num_input_tokens_seen": 2873648, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.664521319388576, |
|
"grad_norm": 3.001359224319458, |
|
"learning_rate": 3.45e-06, |
|
"loss": 0.0285, |
|
"num_input_tokens_seen": 2880848, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.670957361222848, |
|
"grad_norm": 2.7936651706695557, |
|
"learning_rate": 3.4583333333333334e-06, |
|
"loss": 0.0668, |
|
"num_input_tokens_seen": 2888256, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.67739340305712, |
|
"grad_norm": 4.050117015838623, |
|
"learning_rate": 3.4666666666666672e-06, |
|
"loss": 0.0691, |
|
"num_input_tokens_seen": 2895040, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.6838294448913915, |
|
"grad_norm": 5.509685516357422, |
|
"learning_rate": 3.475e-06, |
|
"loss": 0.066, |
|
"num_input_tokens_seen": 2902320, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.6902654867256635, |
|
"grad_norm": 3.968433380126953, |
|
"learning_rate": 3.4833333333333336e-06, |
|
"loss": 0.0495, |
|
"num_input_tokens_seen": 2908960, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.6967015285599354, |
|
"grad_norm": 2.082157611846924, |
|
"learning_rate": 3.491666666666667e-06, |
|
"loss": 0.034, |
|
"num_input_tokens_seen": 2915808, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.7031375703942073, |
|
"grad_norm": 2.403968334197998, |
|
"learning_rate": 3.5e-06, |
|
"loss": 0.0604, |
|
"num_input_tokens_seen": 2922608, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.7095736122284793, |
|
"grad_norm": 4.667454719543457, |
|
"learning_rate": 3.5083333333333338e-06, |
|
"loss": 0.0535, |
|
"num_input_tokens_seen": 2929728, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.716009654062751, |
|
"grad_norm": 2.5968987941741943, |
|
"learning_rate": 3.516666666666667e-06, |
|
"loss": 0.0369, |
|
"num_input_tokens_seen": 2937024, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.722445695897023, |
|
"grad_norm": 3.4746780395507812, |
|
"learning_rate": 3.525e-06, |
|
"loss": 0.045, |
|
"num_input_tokens_seen": 2943760, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.728881737731295, |
|
"grad_norm": 1.9599398374557495, |
|
"learning_rate": 3.5333333333333335e-06, |
|
"loss": 0.0314, |
|
"num_input_tokens_seen": 2950848, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.735317779565567, |
|
"grad_norm": 2.971634864807129, |
|
"learning_rate": 3.5416666666666673e-06, |
|
"loss": 0.0611, |
|
"num_input_tokens_seen": 2957408, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.741753821399839, |
|
"grad_norm": 3.1944162845611572, |
|
"learning_rate": 3.5500000000000003e-06, |
|
"loss": 0.0478, |
|
"num_input_tokens_seen": 2964288, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.748189863234111, |
|
"grad_norm": 3.3659610748291016, |
|
"learning_rate": 3.5583333333333337e-06, |
|
"loss": 0.038, |
|
"num_input_tokens_seen": 2970912, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.754625905068383, |
|
"grad_norm": 2.965097188949585, |
|
"learning_rate": 3.566666666666667e-06, |
|
"loss": 0.043, |
|
"num_input_tokens_seen": 2978032, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.7610619469026547, |
|
"grad_norm": 2.4006049633026123, |
|
"learning_rate": 3.575e-06, |
|
"loss": 0.0478, |
|
"num_input_tokens_seen": 2985232, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.7674979887369267, |
|
"grad_norm": 3.7348554134368896, |
|
"learning_rate": 3.5833333333333335e-06, |
|
"loss": 0.0977, |
|
"num_input_tokens_seen": 2992240, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.7739340305711986, |
|
"grad_norm": 3.1373274326324463, |
|
"learning_rate": 3.5916666666666673e-06, |
|
"loss": 0.0835, |
|
"num_input_tokens_seen": 2999008, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.7803700724054705, |
|
"grad_norm": 1.9444302320480347, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 0.0406, |
|
"num_input_tokens_seen": 3005648, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.7868061142397424, |
|
"grad_norm": 1.8665870428085327, |
|
"learning_rate": 3.6083333333333337e-06, |
|
"loss": 0.0661, |
|
"num_input_tokens_seen": 3012224, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.7932421560740144, |
|
"grad_norm": 1.9893403053283691, |
|
"learning_rate": 3.616666666666667e-06, |
|
"loss": 0.0647, |
|
"num_input_tokens_seen": 3019104, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.7996781979082863, |
|
"grad_norm": 2.656529426574707, |
|
"learning_rate": 3.625e-06, |
|
"loss": 0.0499, |
|
"num_input_tokens_seen": 3026096, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.8061142397425582, |
|
"grad_norm": 1.7047683000564575, |
|
"learning_rate": 3.633333333333334e-06, |
|
"loss": 0.0422, |
|
"num_input_tokens_seen": 3032784, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.81255028157683, |
|
"grad_norm": 1.6727882623672485, |
|
"learning_rate": 3.6416666666666672e-06, |
|
"loss": 0.048, |
|
"num_input_tokens_seen": 3040096, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.818986323411102, |
|
"grad_norm": 4.0175251960754395, |
|
"learning_rate": 3.65e-06, |
|
"loss": 0.0474, |
|
"num_input_tokens_seen": 3046720, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.825422365245374, |
|
"grad_norm": 8.139860153198242, |
|
"learning_rate": 3.6583333333333336e-06, |
|
"loss": 0.0801, |
|
"num_input_tokens_seen": 3053712, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.831858407079646, |
|
"grad_norm": 3.832087278366089, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 0.0528, |
|
"num_input_tokens_seen": 3060528, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.838294448913918, |
|
"grad_norm": 2.881619930267334, |
|
"learning_rate": 3.6750000000000004e-06, |
|
"loss": 0.0461, |
|
"num_input_tokens_seen": 3067440, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.84473049074819, |
|
"grad_norm": 4.456245422363281, |
|
"learning_rate": 3.6833333333333338e-06, |
|
"loss": 0.0646, |
|
"num_input_tokens_seen": 3074208, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.8511665325824618, |
|
"grad_norm": 5.1570820808410645, |
|
"learning_rate": 3.6916666666666668e-06, |
|
"loss": 0.049, |
|
"num_input_tokens_seen": 3081072, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.8576025744167337, |
|
"grad_norm": 2.944526433944702, |
|
"learning_rate": 3.7e-06, |
|
"loss": 0.0531, |
|
"num_input_tokens_seen": 3088240, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.8640386162510056, |
|
"grad_norm": 2.021688222885132, |
|
"learning_rate": 3.708333333333334e-06, |
|
"loss": 0.0521, |
|
"num_input_tokens_seen": 3095504, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.8704746580852776, |
|
"grad_norm": 6.054248809814453, |
|
"learning_rate": 3.716666666666667e-06, |
|
"loss": 0.0927, |
|
"num_input_tokens_seen": 3102688, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.8769106999195495, |
|
"grad_norm": 3.5824503898620605, |
|
"learning_rate": 3.7250000000000003e-06, |
|
"loss": 0.0491, |
|
"num_input_tokens_seen": 3109440, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.8833467417538214, |
|
"grad_norm": 2.0240774154663086, |
|
"learning_rate": 3.7333333333333337e-06, |
|
"loss": 0.0399, |
|
"num_input_tokens_seen": 3116720, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.8897827835880934, |
|
"grad_norm": 4.0125579833984375, |
|
"learning_rate": 3.7416666666666667e-06, |
|
"loss": 0.0499, |
|
"num_input_tokens_seen": 3123568, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.8962188254223653, |
|
"grad_norm": 3.733275890350342, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.0569, |
|
"num_input_tokens_seen": 3130768, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.9026548672566372, |
|
"grad_norm": 4.261077880859375, |
|
"learning_rate": 3.758333333333334e-06, |
|
"loss": 0.0608, |
|
"num_input_tokens_seen": 3138128, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.909090909090909, |
|
"grad_norm": 1.4142907857894897, |
|
"learning_rate": 3.766666666666667e-06, |
|
"loss": 0.0325, |
|
"num_input_tokens_seen": 3145008, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.915526950925181, |
|
"grad_norm": 2.610344171524048, |
|
"learning_rate": 3.7750000000000003e-06, |
|
"loss": 0.0643, |
|
"num_input_tokens_seen": 3151792, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.921962992759453, |
|
"grad_norm": 2.9687604904174805, |
|
"learning_rate": 3.7833333333333337e-06, |
|
"loss": 0.0479, |
|
"num_input_tokens_seen": 3158800, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.928399034593725, |
|
"grad_norm": 2.2706518173217773, |
|
"learning_rate": 3.7916666666666666e-06, |
|
"loss": 0.0549, |
|
"num_input_tokens_seen": 3165744, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.934835076427997, |
|
"grad_norm": 3.606792449951172, |
|
"learning_rate": 3.8000000000000005e-06, |
|
"loss": 0.0789, |
|
"num_input_tokens_seen": 3172896, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.941271118262269, |
|
"grad_norm": 1.8851637840270996, |
|
"learning_rate": 3.808333333333334e-06, |
|
"loss": 0.0319, |
|
"num_input_tokens_seen": 3179888, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.9477071600965408, |
|
"grad_norm": 2.6292834281921387, |
|
"learning_rate": 3.816666666666667e-06, |
|
"loss": 0.05, |
|
"num_input_tokens_seen": 3186960, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.9541432019308127, |
|
"grad_norm": 2.099109172821045, |
|
"learning_rate": 3.825000000000001e-06, |
|
"loss": 0.0677, |
|
"num_input_tokens_seen": 3194208, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.9605792437650846, |
|
"grad_norm": 2.5214834213256836, |
|
"learning_rate": 3.833333333333334e-06, |
|
"loss": 0.0512, |
|
"num_input_tokens_seen": 3201120, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.9670152855993566, |
|
"grad_norm": 6.318456649780273, |
|
"learning_rate": 3.841666666666667e-06, |
|
"loss": 0.0681, |
|
"num_input_tokens_seen": 3208160, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.9734513274336285, |
|
"grad_norm": 4.119838714599609, |
|
"learning_rate": 3.85e-06, |
|
"loss": 0.0651, |
|
"num_input_tokens_seen": 3214992, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.9798873692679004, |
|
"grad_norm": 3.248420238494873, |
|
"learning_rate": 3.858333333333333e-06, |
|
"loss": 0.0498, |
|
"num_input_tokens_seen": 3222192, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.9863234111021724, |
|
"grad_norm": 1.6198488473892212, |
|
"learning_rate": 3.866666666666667e-06, |
|
"loss": 0.0496, |
|
"num_input_tokens_seen": 3229504, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.9927594529364443, |
|
"grad_norm": 2.6008763313293457, |
|
"learning_rate": 3.875e-06, |
|
"loss": 0.0446, |
|
"num_input_tokens_seen": 3236400, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.9991954947707162, |
|
"grad_norm": 2.349928379058838, |
|
"learning_rate": 3.883333333333333e-06, |
|
"loss": 0.0543, |
|
"num_input_tokens_seen": 3243600, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.0056315366049877, |
|
"grad_norm": 0.8590204119682312, |
|
"learning_rate": 3.891666666666667e-06, |
|
"loss": 0.0137, |
|
"num_input_tokens_seen": 3249808, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.0120675784392597, |
|
"grad_norm": 1.2689623832702637, |
|
"learning_rate": 3.900000000000001e-06, |
|
"loss": 0.0201, |
|
"num_input_tokens_seen": 3257168, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.0185036202735316, |
|
"grad_norm": 1.329512596130371, |
|
"learning_rate": 3.908333333333334e-06, |
|
"loss": 0.0119, |
|
"num_input_tokens_seen": 3264064, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.0249396621078035, |
|
"grad_norm": 2.423644781112671, |
|
"learning_rate": 3.916666666666667e-06, |
|
"loss": 0.0305, |
|
"num_input_tokens_seen": 3270688, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.0313757039420755, |
|
"grad_norm": 3.6647322177886963, |
|
"learning_rate": 3.9250000000000005e-06, |
|
"loss": 0.0213, |
|
"num_input_tokens_seen": 3277664, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.0378117457763474, |
|
"grad_norm": 3.736281156539917, |
|
"learning_rate": 3.9333333333333335e-06, |
|
"loss": 0.035, |
|
"num_input_tokens_seen": 3284352, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.0442477876106193, |
|
"grad_norm": 2.274883270263672, |
|
"learning_rate": 3.941666666666667e-06, |
|
"loss": 0.0438, |
|
"num_input_tokens_seen": 3290864, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.0506838294448912, |
|
"grad_norm": 3.032172203063965, |
|
"learning_rate": 3.95e-06, |
|
"loss": 0.0464, |
|
"num_input_tokens_seen": 3297856, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.057119871279163, |
|
"grad_norm": 2.258751392364502, |
|
"learning_rate": 3.958333333333333e-06, |
|
"loss": 0.0172, |
|
"num_input_tokens_seen": 3305120, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.063555913113435, |
|
"grad_norm": 2.925736427307129, |
|
"learning_rate": 3.966666666666667e-06, |
|
"loss": 0.0287, |
|
"num_input_tokens_seen": 3312032, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.069991954947707, |
|
"grad_norm": 3.100857734680176, |
|
"learning_rate": 3.975000000000001e-06, |
|
"loss": 0.0579, |
|
"num_input_tokens_seen": 3319424, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.076427996781979, |
|
"grad_norm": 1.753515601158142, |
|
"learning_rate": 3.983333333333334e-06, |
|
"loss": 0.0095, |
|
"num_input_tokens_seen": 3326304, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.082864038616251, |
|
"grad_norm": 2.3217740058898926, |
|
"learning_rate": 3.991666666666667e-06, |
|
"loss": 0.0238, |
|
"num_input_tokens_seen": 3333184, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.089300080450523, |
|
"grad_norm": 2.512751579284668, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0313, |
|
"num_input_tokens_seen": 3340384, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.0957361222847948, |
|
"grad_norm": 1.2185322046279907, |
|
"learning_rate": 4.008333333333334e-06, |
|
"loss": 0.0146, |
|
"num_input_tokens_seen": 3347344, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.1021721641190667, |
|
"grad_norm": 1.1303057670593262, |
|
"learning_rate": 4.0166666666666675e-06, |
|
"loss": 0.0347, |
|
"num_input_tokens_seen": 3354080, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.1086082059533386, |
|
"grad_norm": 2.4247186183929443, |
|
"learning_rate": 4.0250000000000004e-06, |
|
"loss": 0.024, |
|
"num_input_tokens_seen": 3360848, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.1150442477876106, |
|
"grad_norm": 1.4767001867294312, |
|
"learning_rate": 4.033333333333333e-06, |
|
"loss": 0.0128, |
|
"num_input_tokens_seen": 3367616, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.1214802896218825, |
|
"grad_norm": 2.458953857421875, |
|
"learning_rate": 4.041666666666667e-06, |
|
"loss": 0.0311, |
|
"num_input_tokens_seen": 3374880, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.1279163314561544, |
|
"grad_norm": 0.5494964718818665, |
|
"learning_rate": 4.05e-06, |
|
"loss": 0.0178, |
|
"num_input_tokens_seen": 3381696, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.1343523732904264, |
|
"grad_norm": 1.5969914197921753, |
|
"learning_rate": 4.058333333333333e-06, |
|
"loss": 0.0379, |
|
"num_input_tokens_seen": 3388880, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.1407884151246983, |
|
"grad_norm": 1.7003910541534424, |
|
"learning_rate": 4.066666666666667e-06, |
|
"loss": 0.0299, |
|
"num_input_tokens_seen": 3395984, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.1472244569589702, |
|
"grad_norm": 2.297182083129883, |
|
"learning_rate": 4.075e-06, |
|
"loss": 0.0261, |
|
"num_input_tokens_seen": 3402896, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.153660498793242, |
|
"grad_norm": 2.3937814235687256, |
|
"learning_rate": 4.083333333333334e-06, |
|
"loss": 0.0347, |
|
"num_input_tokens_seen": 3409888, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.160096540627514, |
|
"grad_norm": 1.349425196647644, |
|
"learning_rate": 4.091666666666667e-06, |
|
"loss": 0.011, |
|
"num_input_tokens_seen": 3416928, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.166532582461786, |
|
"grad_norm": 3.0355069637298584, |
|
"learning_rate": 4.1e-06, |
|
"loss": 0.0541, |
|
"num_input_tokens_seen": 3423968, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.172968624296058, |
|
"grad_norm": 2.680206537246704, |
|
"learning_rate": 4.1083333333333335e-06, |
|
"loss": 0.0465, |
|
"num_input_tokens_seen": 3431120, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.17940466613033, |
|
"grad_norm": 1.5906095504760742, |
|
"learning_rate": 4.116666666666667e-06, |
|
"loss": 0.0187, |
|
"num_input_tokens_seen": 3437776, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.185840707964602, |
|
"grad_norm": 0.8296425938606262, |
|
"learning_rate": 4.125e-06, |
|
"loss": 0.0089, |
|
"num_input_tokens_seen": 3444480, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.1922767497988738, |
|
"grad_norm": 2.857689142227173, |
|
"learning_rate": 4.133333333333333e-06, |
|
"loss": 0.0289, |
|
"num_input_tokens_seen": 3451232, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.1987127916331457, |
|
"grad_norm": 1.0910203456878662, |
|
"learning_rate": 4.141666666666667e-06, |
|
"loss": 0.0103, |
|
"num_input_tokens_seen": 3457776, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.2051488334674176, |
|
"grad_norm": 1.3560919761657715, |
|
"learning_rate": 4.15e-06, |
|
"loss": 0.0132, |
|
"num_input_tokens_seen": 3465056, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.2115848753016896, |
|
"grad_norm": 4.861215591430664, |
|
"learning_rate": 4.158333333333334e-06, |
|
"loss": 0.0375, |
|
"num_input_tokens_seen": 3471968, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.2180209171359615, |
|
"grad_norm": 1.8714208602905273, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.0143, |
|
"num_input_tokens_seen": 3479648, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.2244569589702334, |
|
"grad_norm": 1.6230028867721558, |
|
"learning_rate": 4.175e-06, |
|
"loss": 0.0159, |
|
"num_input_tokens_seen": 3486272, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 3.2308930008045054, |
|
"grad_norm": 0.7852226495742798, |
|
"learning_rate": 4.183333333333334e-06, |
|
"loss": 0.0073, |
|
"num_input_tokens_seen": 3493360, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.2373290426387773, |
|
"grad_norm": 2.3990976810455322, |
|
"learning_rate": 4.1916666666666675e-06, |
|
"loss": 0.0186, |
|
"num_input_tokens_seen": 3500336, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 3.2437650844730492, |
|
"grad_norm": 0.796851634979248, |
|
"learning_rate": 4.2000000000000004e-06, |
|
"loss": 0.0035, |
|
"num_input_tokens_seen": 3507232, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.250201126307321, |
|
"grad_norm": 2.7951748371124268, |
|
"learning_rate": 4.208333333333333e-06, |
|
"loss": 0.0416, |
|
"num_input_tokens_seen": 3514144, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.256637168141593, |
|
"grad_norm": 2.40897274017334, |
|
"learning_rate": 4.216666666666667e-06, |
|
"loss": 0.0266, |
|
"num_input_tokens_seen": 3520976, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.263073209975865, |
|
"grad_norm": 2.3974061012268066, |
|
"learning_rate": 4.225e-06, |
|
"loss": 0.0351, |
|
"num_input_tokens_seen": 3527920, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 3.2695092518101365, |
|
"grad_norm": 2.30100154876709, |
|
"learning_rate": 4.233333333333334e-06, |
|
"loss": 0.0209, |
|
"num_input_tokens_seen": 3534864, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.2759452936444085, |
|
"grad_norm": 2.1172518730163574, |
|
"learning_rate": 4.241666666666667e-06, |
|
"loss": 0.0434, |
|
"num_input_tokens_seen": 3541872, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 3.2823813354786804, |
|
"grad_norm": 3.7030341625213623, |
|
"learning_rate": 4.25e-06, |
|
"loss": 0.0174, |
|
"num_input_tokens_seen": 3548384, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.2888173773129523, |
|
"grad_norm": 2.152125597000122, |
|
"learning_rate": 4.258333333333334e-06, |
|
"loss": 0.0529, |
|
"num_input_tokens_seen": 3555792, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 3.2952534191472242, |
|
"grad_norm": 0.6081152558326721, |
|
"learning_rate": 4.266666666666668e-06, |
|
"loss": 0.0033, |
|
"num_input_tokens_seen": 3562608, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.301689460981496, |
|
"grad_norm": 1.7042624950408936, |
|
"learning_rate": 4.2750000000000006e-06, |
|
"loss": 0.0196, |
|
"num_input_tokens_seen": 3569184, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 3.308125502815768, |
|
"grad_norm": 1.3502767086029053, |
|
"learning_rate": 4.2833333333333335e-06, |
|
"loss": 0.0242, |
|
"num_input_tokens_seen": 3576224, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.31456154465004, |
|
"grad_norm": 4.480360984802246, |
|
"learning_rate": 4.2916666666666665e-06, |
|
"loss": 0.0316, |
|
"num_input_tokens_seen": 3583328, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.320997586484312, |
|
"grad_norm": 2.2217299938201904, |
|
"learning_rate": 4.3e-06, |
|
"loss": 0.0268, |
|
"num_input_tokens_seen": 3590256, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.327433628318584, |
|
"grad_norm": 1.5919010639190674, |
|
"learning_rate": 4.308333333333334e-06, |
|
"loss": 0.0248, |
|
"num_input_tokens_seen": 3597328, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 3.333869670152856, |
|
"grad_norm": 2.425961971282959, |
|
"learning_rate": 4.316666666666667e-06, |
|
"loss": 0.032, |
|
"num_input_tokens_seen": 3604576, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.340305711987128, |
|
"grad_norm": 2.987424612045288, |
|
"learning_rate": 4.325e-06, |
|
"loss": 0.0202, |
|
"num_input_tokens_seen": 3611520, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 3.3467417538213997, |
|
"grad_norm": 2.633897304534912, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 0.0329, |
|
"num_input_tokens_seen": 3618288, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.3531777956556716, |
|
"grad_norm": 1.0696384906768799, |
|
"learning_rate": 4.341666666666667e-06, |
|
"loss": 0.019, |
|
"num_input_tokens_seen": 3625216, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.3596138374899436, |
|
"grad_norm": 2.400972604751587, |
|
"learning_rate": 4.350000000000001e-06, |
|
"loss": 0.0182, |
|
"num_input_tokens_seen": 3631888, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.3660498793242155, |
|
"grad_norm": 1.3744821548461914, |
|
"learning_rate": 4.358333333333334e-06, |
|
"loss": 0.0124, |
|
"num_input_tokens_seen": 3638848, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 3.3724859211584874, |
|
"grad_norm": 1.613145112991333, |
|
"learning_rate": 4.366666666666667e-06, |
|
"loss": 0.0122, |
|
"num_input_tokens_seen": 3646112, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.3789219629927594, |
|
"grad_norm": 2.450824499130249, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.0388, |
|
"num_input_tokens_seen": 3652928, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.3853580048270313, |
|
"grad_norm": 1.6122058629989624, |
|
"learning_rate": 4.383333333333334e-06, |
|
"loss": 0.0106, |
|
"num_input_tokens_seen": 3659632, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.3917940466613032, |
|
"grad_norm": 1.53513765335083, |
|
"learning_rate": 4.391666666666667e-06, |
|
"loss": 0.0305, |
|
"num_input_tokens_seen": 3666480, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 3.398230088495575, |
|
"grad_norm": 2.103663444519043, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.0512, |
|
"num_input_tokens_seen": 3673136, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.404666130329847, |
|
"grad_norm": 0.41373467445373535, |
|
"learning_rate": 4.408333333333334e-06, |
|
"loss": 0.0031, |
|
"num_input_tokens_seen": 3679760, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 3.411102172164119, |
|
"grad_norm": 2.9610488414764404, |
|
"learning_rate": 4.416666666666667e-06, |
|
"loss": 0.0309, |
|
"num_input_tokens_seen": 3686576, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.417538213998391, |
|
"grad_norm": 2.415531873703003, |
|
"learning_rate": 4.425e-06, |
|
"loss": 0.0472, |
|
"num_input_tokens_seen": 3693312, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 3.423974255832663, |
|
"grad_norm": 2.175546407699585, |
|
"learning_rate": 4.433333333333334e-06, |
|
"loss": 0.0222, |
|
"num_input_tokens_seen": 3700000, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.430410297666935, |
|
"grad_norm": 1.0903018712997437, |
|
"learning_rate": 4.441666666666667e-06, |
|
"loss": 0.0077, |
|
"num_input_tokens_seen": 3706736, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 3.4368463395012068, |
|
"grad_norm": 0.8305991888046265, |
|
"learning_rate": 4.450000000000001e-06, |
|
"loss": 0.0064, |
|
"num_input_tokens_seen": 3714192, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.4432823813354787, |
|
"grad_norm": 0.9347790479660034, |
|
"learning_rate": 4.4583333333333336e-06, |
|
"loss": 0.0104, |
|
"num_input_tokens_seen": 3721408, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.4497184231697506, |
|
"grad_norm": 1.7669559717178345, |
|
"learning_rate": 4.4666666666666665e-06, |
|
"loss": 0.0121, |
|
"num_input_tokens_seen": 3728144, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 3.4561544650040226, |
|
"grad_norm": 3.121467351913452, |
|
"learning_rate": 4.475e-06, |
|
"loss": 0.0386, |
|
"num_input_tokens_seen": 3734960, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 3.4625905068382945, |
|
"grad_norm": 2.683410882949829, |
|
"learning_rate": 4.483333333333333e-06, |
|
"loss": 0.0319, |
|
"num_input_tokens_seen": 3741728, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 3.4690265486725664, |
|
"grad_norm": 9.728205680847168, |
|
"learning_rate": 4.491666666666667e-06, |
|
"loss": 0.0579, |
|
"num_input_tokens_seen": 3749200, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 3.4754625905068384, |
|
"grad_norm": 4.415483474731445, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.0255, |
|
"num_input_tokens_seen": 3755856, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.4818986323411103, |
|
"grad_norm": 3.651423692703247, |
|
"learning_rate": 4.508333333333333e-06, |
|
"loss": 0.0301, |
|
"num_input_tokens_seen": 3762528, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 3.4883346741753822, |
|
"grad_norm": 2.318000078201294, |
|
"learning_rate": 4.516666666666667e-06, |
|
"loss": 0.0589, |
|
"num_input_tokens_seen": 3769632, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 3.494770716009654, |
|
"grad_norm": 4.982158660888672, |
|
"learning_rate": 4.525000000000001e-06, |
|
"loss": 0.0442, |
|
"num_input_tokens_seen": 3776592, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 3.501206757843926, |
|
"grad_norm": 3.0872108936309814, |
|
"learning_rate": 4.533333333333334e-06, |
|
"loss": 0.0366, |
|
"num_input_tokens_seen": 3783824, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 3.507642799678198, |
|
"grad_norm": 5.150477886199951, |
|
"learning_rate": 4.541666666666667e-06, |
|
"loss": 0.0643, |
|
"num_input_tokens_seen": 3790864, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.51407884151247, |
|
"grad_norm": 3.0513834953308105, |
|
"learning_rate": 4.5500000000000005e-06, |
|
"loss": 0.0213, |
|
"num_input_tokens_seen": 3797664, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 3.520514883346742, |
|
"grad_norm": 1.5530712604522705, |
|
"learning_rate": 4.5583333333333335e-06, |
|
"loss": 0.0154, |
|
"num_input_tokens_seen": 3804576, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 3.526950925181014, |
|
"grad_norm": 2.6350319385528564, |
|
"learning_rate": 4.566666666666667e-06, |
|
"loss": 0.0252, |
|
"num_input_tokens_seen": 3811440, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.5333869670152858, |
|
"grad_norm": 2.8993167877197266, |
|
"learning_rate": 4.575e-06, |
|
"loss": 0.038, |
|
"num_input_tokens_seen": 3818352, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.5398230088495577, |
|
"grad_norm": 2.0168752670288086, |
|
"learning_rate": 4.583333333333333e-06, |
|
"loss": 0.0169, |
|
"num_input_tokens_seen": 3825360, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.5462590506838296, |
|
"grad_norm": 2.4160525798797607, |
|
"learning_rate": 4.591666666666667e-06, |
|
"loss": 0.0253, |
|
"num_input_tokens_seen": 3832416, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 3.5526950925181016, |
|
"grad_norm": 1.543545126914978, |
|
"learning_rate": 4.600000000000001e-06, |
|
"loss": 0.0164, |
|
"num_input_tokens_seen": 3839344, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.5591311343523735, |
|
"grad_norm": 2.355316400527954, |
|
"learning_rate": 4.608333333333334e-06, |
|
"loss": 0.0269, |
|
"num_input_tokens_seen": 3846688, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 3.5655671761866454, |
|
"grad_norm": 1.4751020669937134, |
|
"learning_rate": 4.616666666666667e-06, |
|
"loss": 0.0192, |
|
"num_input_tokens_seen": 3853696, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.5720032180209174, |
|
"grad_norm": 0.9673195481300354, |
|
"learning_rate": 4.625000000000001e-06, |
|
"loss": 0.0132, |
|
"num_input_tokens_seen": 3860832, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.5784392598551893, |
|
"grad_norm": 1.1592040061950684, |
|
"learning_rate": 4.633333333333334e-06, |
|
"loss": 0.0156, |
|
"num_input_tokens_seen": 3868000, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.5848753016894612, |
|
"grad_norm": 1.01143217086792, |
|
"learning_rate": 4.641666666666667e-06, |
|
"loss": 0.0081, |
|
"num_input_tokens_seen": 3874672, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 3.591311343523733, |
|
"grad_norm": 2.855041980743408, |
|
"learning_rate": 4.65e-06, |
|
"loss": 0.0351, |
|
"num_input_tokens_seen": 3881744, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.597747385358005, |
|
"grad_norm": 2.0597968101501465, |
|
"learning_rate": 4.658333333333333e-06, |
|
"loss": 0.0288, |
|
"num_input_tokens_seen": 3888256, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 3.604183427192277, |
|
"grad_norm": 2.9965226650238037, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.0335, |
|
"num_input_tokens_seen": 3895104, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.6106194690265485, |
|
"grad_norm": 3.625206708908081, |
|
"learning_rate": 4.675000000000001e-06, |
|
"loss": 0.0492, |
|
"num_input_tokens_seen": 3902208, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 3.6170555108608204, |
|
"grad_norm": 2.021160840988159, |
|
"learning_rate": 4.683333333333334e-06, |
|
"loss": 0.0082, |
|
"num_input_tokens_seen": 3909040, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.6234915526950924, |
|
"grad_norm": 3.4565329551696777, |
|
"learning_rate": 4.691666666666667e-06, |
|
"loss": 0.0491, |
|
"num_input_tokens_seen": 3916304, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 3.6299275945293643, |
|
"grad_norm": 3.2362654209136963, |
|
"learning_rate": 4.7e-06, |
|
"loss": 0.0568, |
|
"num_input_tokens_seen": 3923216, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 3.234666347503662, |
|
"learning_rate": 4.708333333333334e-06, |
|
"loss": 0.0414, |
|
"num_input_tokens_seen": 3930448, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.642799678197908, |
|
"grad_norm": 2.1742103099823, |
|
"learning_rate": 4.7166666666666675e-06, |
|
"loss": 0.034, |
|
"num_input_tokens_seen": 3937424, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 3.64923572003218, |
|
"grad_norm": 2.9156923294067383, |
|
"learning_rate": 4.7250000000000005e-06, |
|
"loss": 0.0392, |
|
"num_input_tokens_seen": 3944112, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 3.655671761866452, |
|
"grad_norm": 4.092429161071777, |
|
"learning_rate": 4.7333333333333335e-06, |
|
"loss": 0.051, |
|
"num_input_tokens_seen": 3951504, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 3.662107803700724, |
|
"grad_norm": 3.9395768642425537, |
|
"learning_rate": 4.741666666666667e-06, |
|
"loss": 0.034, |
|
"num_input_tokens_seen": 3958352, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 3.668543845534996, |
|
"grad_norm": 1.9961844682693481, |
|
"learning_rate": 4.75e-06, |
|
"loss": 0.014, |
|
"num_input_tokens_seen": 3965552, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.674979887369268, |
|
"grad_norm": 1.8078194856643677, |
|
"learning_rate": 4.758333333333334e-06, |
|
"loss": 0.0406, |
|
"num_input_tokens_seen": 3972544, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 3.6814159292035398, |
|
"grad_norm": 2.048532485961914, |
|
"learning_rate": 4.766666666666667e-06, |
|
"loss": 0.0407, |
|
"num_input_tokens_seen": 3979264, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 3.6878519710378117, |
|
"grad_norm": 1.9979974031448364, |
|
"learning_rate": 4.775e-06, |
|
"loss": 0.0282, |
|
"num_input_tokens_seen": 3986240, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 3.6942880128720836, |
|
"grad_norm": 3.6126463413238525, |
|
"learning_rate": 4.783333333333334e-06, |
|
"loss": 0.0326, |
|
"num_input_tokens_seen": 3993232, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 3.7007240547063556, |
|
"grad_norm": 3.131657838821411, |
|
"learning_rate": 4.791666666666668e-06, |
|
"loss": 0.0348, |
|
"num_input_tokens_seen": 3999952, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.7071600965406275, |
|
"grad_norm": 2.2662060260772705, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.0256, |
|
"num_input_tokens_seen": 4007456, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 3.7135961383748994, |
|
"grad_norm": 4.874523639678955, |
|
"learning_rate": 4.808333333333334e-06, |
|
"loss": 0.0765, |
|
"num_input_tokens_seen": 4015024, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 3.7200321802091714, |
|
"grad_norm": 0.882166862487793, |
|
"learning_rate": 4.816666666666667e-06, |
|
"loss": 0.0099, |
|
"num_input_tokens_seen": 4021920, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 3.7264682220434433, |
|
"grad_norm": 3.1239066123962402, |
|
"learning_rate": 4.825e-06, |
|
"loss": 0.0173, |
|
"num_input_tokens_seen": 4028720, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 3.7329042638777152, |
|
"grad_norm": 1.5819370746612549, |
|
"learning_rate": 4.833333333333333e-06, |
|
"loss": 0.0084, |
|
"num_input_tokens_seen": 4035584, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.739340305711987, |
|
"grad_norm": 2.6252429485321045, |
|
"learning_rate": 4.841666666666667e-06, |
|
"loss": 0.0251, |
|
"num_input_tokens_seen": 4042464, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 3.745776347546259, |
|
"grad_norm": 2.0619590282440186, |
|
"learning_rate": 4.85e-06, |
|
"loss": 0.0909, |
|
"num_input_tokens_seen": 4049600, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 3.752212389380531, |
|
"grad_norm": 2.547422409057617, |
|
"learning_rate": 4.858333333333334e-06, |
|
"loss": 0.039, |
|
"num_input_tokens_seen": 4056320, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 3.758648431214803, |
|
"grad_norm": 1.3179091215133667, |
|
"learning_rate": 4.866666666666667e-06, |
|
"loss": 0.0079, |
|
"num_input_tokens_seen": 4063200, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 3.765084473049075, |
|
"grad_norm": 3.090376377105713, |
|
"learning_rate": 4.875e-06, |
|
"loss": 0.0242, |
|
"num_input_tokens_seen": 4070112, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.771520514883347, |
|
"grad_norm": 2.50468111038208, |
|
"learning_rate": 4.883333333333334e-06, |
|
"loss": 0.0138, |
|
"num_input_tokens_seen": 4076928, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 3.7779565567176188, |
|
"grad_norm": 3.921415090560913, |
|
"learning_rate": 4.8916666666666675e-06, |
|
"loss": 0.0467, |
|
"num_input_tokens_seen": 4083792, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 3.7843925985518907, |
|
"grad_norm": 1.2243348360061646, |
|
"learning_rate": 4.9000000000000005e-06, |
|
"loss": 0.0241, |
|
"num_input_tokens_seen": 4090672, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 3.7908286403861626, |
|
"grad_norm": 1.4968576431274414, |
|
"learning_rate": 4.9083333333333335e-06, |
|
"loss": 0.0404, |
|
"num_input_tokens_seen": 4097472, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 3.7972646822204346, |
|
"grad_norm": 1.235217809677124, |
|
"learning_rate": 4.9166666666666665e-06, |
|
"loss": 0.0094, |
|
"num_input_tokens_seen": 4104016, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.8037007240547065, |
|
"grad_norm": 1.3862783908843994, |
|
"learning_rate": 4.925e-06, |
|
"loss": 0.0196, |
|
"num_input_tokens_seen": 4110784, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 3.8101367658889784, |
|
"grad_norm": 3.560793399810791, |
|
"learning_rate": 4.933333333333334e-06, |
|
"loss": 0.0514, |
|
"num_input_tokens_seen": 4117984, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 3.8165728077232504, |
|
"grad_norm": 2.008575677871704, |
|
"learning_rate": 4.941666666666667e-06, |
|
"loss": 0.0286, |
|
"num_input_tokens_seen": 4125072, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 3.823008849557522, |
|
"grad_norm": 2.3213093280792236, |
|
"learning_rate": 4.95e-06, |
|
"loss": 0.0417, |
|
"num_input_tokens_seen": 4132160, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 3.829444891391794, |
|
"grad_norm": 1.3540257215499878, |
|
"learning_rate": 4.958333333333334e-06, |
|
"loss": 0.0347, |
|
"num_input_tokens_seen": 4139136, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.8358809332260657, |
|
"grad_norm": 1.289825677871704, |
|
"learning_rate": 4.966666666666667e-06, |
|
"loss": 0.0229, |
|
"num_input_tokens_seen": 4146240, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 3.8423169750603376, |
|
"grad_norm": 2.4050135612487793, |
|
"learning_rate": 4.975000000000001e-06, |
|
"loss": 0.0176, |
|
"num_input_tokens_seen": 4153152, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 3.8487530168946096, |
|
"grad_norm": 1.523977518081665, |
|
"learning_rate": 4.983333333333334e-06, |
|
"loss": 0.0274, |
|
"num_input_tokens_seen": 4160080, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 3.8551890587288815, |
|
"grad_norm": 1.1898863315582275, |
|
"learning_rate": 4.991666666666667e-06, |
|
"loss": 0.0253, |
|
"num_input_tokens_seen": 4167008, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 3.8616251005631534, |
|
"grad_norm": 1.992311954498291, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0429, |
|
"num_input_tokens_seen": 4174080, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.8680611423974254, |
|
"grad_norm": 0.9558950066566467, |
|
"learning_rate": 4.999597169822646e-06, |
|
"loss": 0.0142, |
|
"num_input_tokens_seen": 4181104, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 3.8744971842316973, |
|
"grad_norm": 0.9275301694869995, |
|
"learning_rate": 4.998388809108304e-06, |
|
"loss": 0.0148, |
|
"num_input_tokens_seen": 4188096, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 3.8809332260659692, |
|
"grad_norm": 1.6707432270050049, |
|
"learning_rate": 4.996375307268303e-06, |
|
"loss": 0.0166, |
|
"num_input_tokens_seen": 4195152, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 3.887369267900241, |
|
"grad_norm": 5.857227325439453, |
|
"learning_rate": 4.993557313182086e-06, |
|
"loss": 0.0224, |
|
"num_input_tokens_seen": 4201952, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 3.893805309734513, |
|
"grad_norm": 5.273613452911377, |
|
"learning_rate": 4.989935734988098e-06, |
|
"loss": 0.0227, |
|
"num_input_tokens_seen": 4209104, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.900241351568785, |
|
"grad_norm": 6.268670082092285, |
|
"learning_rate": 4.985511739791129e-06, |
|
"loss": 0.0597, |
|
"num_input_tokens_seen": 4216496, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 3.906677393403057, |
|
"grad_norm": 3.373368501663208, |
|
"learning_rate": 4.980286753286196e-06, |
|
"loss": 0.0339, |
|
"num_input_tokens_seen": 4223840, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 3.913113435237329, |
|
"grad_norm": 1.3991198539733887, |
|
"learning_rate": 4.974262459299088e-06, |
|
"loss": 0.0192, |
|
"num_input_tokens_seen": 4230752, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 3.919549477071601, |
|
"grad_norm": 0.7424534559249878, |
|
"learning_rate": 4.967440799243739e-06, |
|
"loss": 0.007, |
|
"num_input_tokens_seen": 4237360, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 3.9259855189058728, |
|
"grad_norm": 3.0347440242767334, |
|
"learning_rate": 4.959823971496575e-06, |
|
"loss": 0.017, |
|
"num_input_tokens_seen": 4244128, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.9324215607401447, |
|
"grad_norm": 2.929175853729248, |
|
"learning_rate": 4.9514144306880506e-06, |
|
"loss": 0.0296, |
|
"num_input_tokens_seen": 4251264, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 3.9388576025744166, |
|
"grad_norm": 4.076401710510254, |
|
"learning_rate": 4.942214886911619e-06, |
|
"loss": 0.0429, |
|
"num_input_tokens_seen": 4258256, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 3.9452936444086886, |
|
"grad_norm": 0.7720851302146912, |
|
"learning_rate": 4.932228304850363e-06, |
|
"loss": 0.0027, |
|
"num_input_tokens_seen": 4265280, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 3.9517296862429605, |
|
"grad_norm": 1.500545859336853, |
|
"learning_rate": 4.921457902821578e-06, |
|
"loss": 0.0395, |
|
"num_input_tokens_seen": 4271968, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 3.9581657280772324, |
|
"grad_norm": 3.0767860412597656, |
|
"learning_rate": 4.909907151739634e-06, |
|
"loss": 0.03, |
|
"num_input_tokens_seen": 4278848, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.9646017699115044, |
|
"grad_norm": 1.5455620288848877, |
|
"learning_rate": 4.897579773997415e-06, |
|
"loss": 0.0178, |
|
"num_input_tokens_seen": 4285808, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 3.9710378117457763, |
|
"grad_norm": 1.1472654342651367, |
|
"learning_rate": 4.884479742266731e-06, |
|
"loss": 0.0139, |
|
"num_input_tokens_seen": 4292912, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 3.9774738535800482, |
|
"grad_norm": 1.3290921449661255, |
|
"learning_rate": 4.870611278218066e-06, |
|
"loss": 0.0076, |
|
"num_input_tokens_seen": 4300176, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 3.98390989541432, |
|
"grad_norm": 4.543910026550293, |
|
"learning_rate": 4.855978851160088e-06, |
|
"loss": 0.0683, |
|
"num_input_tokens_seen": 4307776, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 3.990345937248592, |
|
"grad_norm": 3.424959421157837, |
|
"learning_rate": 4.8405871765993435e-06, |
|
"loss": 0.0367, |
|
"num_input_tokens_seen": 4314688, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.996781979082864, |
|
"grad_norm": 1.5345810651779175, |
|
"learning_rate": 4.824441214720629e-06, |
|
"loss": 0.0497, |
|
"num_input_tokens_seen": 4321840, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 4.003218020917136, |
|
"grad_norm": 0.5405219793319702, |
|
"learning_rate": 4.8075461687884935e-06, |
|
"loss": 0.0054, |
|
"num_input_tokens_seen": 4328736, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 4.009654062751408, |
|
"grad_norm": 2.3540198802948, |
|
"learning_rate": 4.7899074834704165e-06, |
|
"loss": 0.0259, |
|
"num_input_tokens_seen": 4335952, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 4.01609010458568, |
|
"grad_norm": 0.7733599543571472, |
|
"learning_rate": 4.771530843082187e-06, |
|
"loss": 0.0082, |
|
"num_input_tokens_seen": 4342816, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 4.022526146419952, |
|
"grad_norm": 3.051017999649048, |
|
"learning_rate": 4.752422169756048e-06, |
|
"loss": 0.0359, |
|
"num_input_tokens_seen": 4349456, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.028962188254224, |
|
"grad_norm": 0.4645274579524994, |
|
"learning_rate": 4.732587621532214e-06, |
|
"loss": 0.0081, |
|
"num_input_tokens_seen": 4356032, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 4.035398230088496, |
|
"grad_norm": 1.9294419288635254, |
|
"learning_rate": 4.712033590374346e-06, |
|
"loss": 0.0118, |
|
"num_input_tokens_seen": 4362928, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 4.041834271922768, |
|
"grad_norm": 2.5432851314544678, |
|
"learning_rate": 4.690766700109659e-06, |
|
"loss": 0.0235, |
|
"num_input_tokens_seen": 4369616, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 4.0482703137570395, |
|
"grad_norm": 1.8334590196609497, |
|
"learning_rate": 4.668793804294294e-06, |
|
"loss": 0.0145, |
|
"num_input_tokens_seen": 4376656, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 4.054706355591311, |
|
"grad_norm": 0.6473208069801331, |
|
"learning_rate": 4.646121984004666e-06, |
|
"loss": 0.006, |
|
"num_input_tokens_seen": 4383696, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.061142397425583, |
|
"grad_norm": 2.0988128185272217, |
|
"learning_rate": 4.622758545555485e-06, |
|
"loss": 0.0191, |
|
"num_input_tokens_seen": 4390880, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 4.067578439259855, |
|
"grad_norm": 1.8957973718643188, |
|
"learning_rate": 4.598711018145193e-06, |
|
"loss": 0.0075, |
|
"num_input_tokens_seen": 4398000, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 4.074014481094127, |
|
"grad_norm": 1.117255449295044, |
|
"learning_rate": 4.573987151429579e-06, |
|
"loss": 0.0253, |
|
"num_input_tokens_seen": 4404640, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 4.080450522928399, |
|
"grad_norm": 2.326129198074341, |
|
"learning_rate": 4.54859491302433e-06, |
|
"loss": 0.0317, |
|
"num_input_tokens_seen": 4411760, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 4.086886564762671, |
|
"grad_norm": 1.6843276023864746, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.0082, |
|
"num_input_tokens_seen": 4418896, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.093322606596943, |
|
"grad_norm": 2.301496744155884, |
|
"learning_rate": 4.495838265931754e-06, |
|
"loss": 0.0101, |
|
"num_input_tokens_seen": 4425776, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 4.099758648431215, |
|
"grad_norm": 1.434444546699524, |
|
"learning_rate": 4.4684908588200305e-06, |
|
"loss": 0.0112, |
|
"num_input_tokens_seen": 4432656, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 4.106194690265487, |
|
"grad_norm": 1.3446779251098633, |
|
"learning_rate": 4.440509077690883e-06, |
|
"loss": 0.0034, |
|
"num_input_tokens_seen": 4439424, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 4.112630732099759, |
|
"grad_norm": 0.6733867526054382, |
|
"learning_rate": 4.411901940068997e-06, |
|
"loss": 0.0037, |
|
"num_input_tokens_seen": 4446160, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 4.119066773934031, |
|
"grad_norm": 1.339034080505371, |
|
"learning_rate": 4.382678665009028e-06, |
|
"loss": 0.0085, |
|
"num_input_tokens_seen": 4453376, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.125502815768303, |
|
"grad_norm": 3.2036638259887695, |
|
"learning_rate": 4.352848670124637e-06, |
|
"loss": 0.0328, |
|
"num_input_tokens_seen": 4459952, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 4.131938857602575, |
|
"grad_norm": 1.1791878938674927, |
|
"learning_rate": 4.322421568553529e-06, |
|
"loss": 0.0098, |
|
"num_input_tokens_seen": 4466880, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 4.1383748994368466, |
|
"grad_norm": 1.8526674509048462, |
|
"learning_rate": 4.291407165859481e-06, |
|
"loss": 0.0051, |
|
"num_input_tokens_seen": 4474064, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 4.1448109412711185, |
|
"grad_norm": 0.4795032739639282, |
|
"learning_rate": 4.259815456872363e-06, |
|
"loss": 0.0047, |
|
"num_input_tokens_seen": 4480864, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 4.15124698310539, |
|
"grad_norm": 1.4392155408859253, |
|
"learning_rate": 4.227656622467162e-06, |
|
"loss": 0.0111, |
|
"num_input_tokens_seen": 4487504, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.157683024939662, |
|
"grad_norm": 3.185128688812256, |
|
"learning_rate": 4.194941026283053e-06, |
|
"loss": 0.0334, |
|
"num_input_tokens_seen": 4494512, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 4.164119066773934, |
|
"grad_norm": 1.7285927534103394, |
|
"learning_rate": 4.161679211383565e-06, |
|
"loss": 0.013, |
|
"num_input_tokens_seen": 4501296, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 4.170555108608206, |
|
"grad_norm": 4.266958713531494, |
|
"learning_rate": 4.127881896858934e-06, |
|
"loss": 0.0305, |
|
"num_input_tokens_seen": 4508128, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 4.176991150442478, |
|
"grad_norm": 1.000532627105713, |
|
"learning_rate": 4.093559974371725e-06, |
|
"loss": 0.0092, |
|
"num_input_tokens_seen": 4515008, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 4.18342719227675, |
|
"grad_norm": 1.1824270486831665, |
|
"learning_rate": 4.058724504646834e-06, |
|
"loss": 0.0223, |
|
"num_input_tokens_seen": 4521920, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.189863234111022, |
|
"grad_norm": 2.444427728652954, |
|
"learning_rate": 4.023386713907021e-06, |
|
"loss": 0.0234, |
|
"num_input_tokens_seen": 4528912, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 4.196299275945294, |
|
"grad_norm": 1.421184778213501, |
|
"learning_rate": 3.987557990255093e-06, |
|
"loss": 0.0185, |
|
"num_input_tokens_seen": 4535664, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 4.202735317779566, |
|
"grad_norm": 0.9019869565963745, |
|
"learning_rate": 3.951249880003934e-06, |
|
"loss": 0.0075, |
|
"num_input_tokens_seen": 4542832, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 4.209171359613838, |
|
"grad_norm": 1.7373372316360474, |
|
"learning_rate": 3.914474083955537e-06, |
|
"loss": 0.0217, |
|
"num_input_tokens_seen": 4549552, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 4.21560740144811, |
|
"grad_norm": 0.31386592984199524, |
|
"learning_rate": 3.8772424536302565e-06, |
|
"loss": 0.0027, |
|
"num_input_tokens_seen": 4556192, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.222043443282382, |
|
"grad_norm": 1.8379613161087036, |
|
"learning_rate": 3.839566987447492e-06, |
|
"loss": 0.0153, |
|
"num_input_tokens_seen": 4563168, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 4.228479485116654, |
|
"grad_norm": 1.221056342124939, |
|
"learning_rate": 3.801459826859022e-06, |
|
"loss": 0.0092, |
|
"num_input_tokens_seen": 4570704, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 4.2349155269509255, |
|
"grad_norm": 0.7823006510734558, |
|
"learning_rate": 3.7629332524362532e-06, |
|
"loss": 0.0082, |
|
"num_input_tokens_seen": 4578016, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 4.2413515687851975, |
|
"grad_norm": 1.149715781211853, |
|
"learning_rate": 3.7239996799126315e-06, |
|
"loss": 0.0163, |
|
"num_input_tokens_seen": 4584896, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 4.247787610619469, |
|
"grad_norm": 0.6069539189338684, |
|
"learning_rate": 3.684671656182497e-06, |
|
"loss": 0.0099, |
|
"num_input_tokens_seen": 4591984, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.254223652453741, |
|
"grad_norm": 2.427281141281128, |
|
"learning_rate": 3.644961855257669e-06, |
|
"loss": 0.0269, |
|
"num_input_tokens_seen": 4598656, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 4.260659694288013, |
|
"grad_norm": 1.0770633220672607, |
|
"learning_rate": 3.6048830741830678e-06, |
|
"loss": 0.007, |
|
"num_input_tokens_seen": 4606032, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 4.267095736122285, |
|
"grad_norm": 2.4310688972473145, |
|
"learning_rate": 3.564448228912682e-06, |
|
"loss": 0.0427, |
|
"num_input_tokens_seen": 4613056, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 4.273531777956556, |
|
"grad_norm": 1.2328161001205444, |
|
"learning_rate": 3.523670350147227e-06, |
|
"loss": 0.0122, |
|
"num_input_tokens_seen": 4619776, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 4.279967819790828, |
|
"grad_norm": 1.519998550415039, |
|
"learning_rate": 3.4825625791348093e-06, |
|
"loss": 0.0137, |
|
"num_input_tokens_seen": 4626240, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.2864038616251, |
|
"grad_norm": 1.4114880561828613, |
|
"learning_rate": 3.44113816343598e-06, |
|
"loss": 0.02, |
|
"num_input_tokens_seen": 4633216, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 4.292839903459372, |
|
"grad_norm": 1.4585809707641602, |
|
"learning_rate": 3.399410452654518e-06, |
|
"loss": 0.006, |
|
"num_input_tokens_seen": 4639856, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 4.299275945293644, |
|
"grad_norm": 1.594936490058899, |
|
"learning_rate": 3.357392894135329e-06, |
|
"loss": 0.0085, |
|
"num_input_tokens_seen": 4646832, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 4.305711987127916, |
|
"grad_norm": 2.5802690982818604, |
|
"learning_rate": 3.315099028630855e-06, |
|
"loss": 0.0112, |
|
"num_input_tokens_seen": 4653648, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 4.312148028962188, |
|
"grad_norm": 1.3826483488082886, |
|
"learning_rate": 3.272542485937369e-06, |
|
"loss": 0.0131, |
|
"num_input_tokens_seen": 4660672, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.31858407079646, |
|
"grad_norm": 2.1874148845672607, |
|
"learning_rate": 3.229736980502584e-06, |
|
"loss": 0.0124, |
|
"num_input_tokens_seen": 4667888, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 4.325020112630732, |
|
"grad_norm": 1.61604642868042, |
|
"learning_rate": 3.186696307005976e-06, |
|
"loss": 0.0042, |
|
"num_input_tokens_seen": 4675072, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 4.331456154465004, |
|
"grad_norm": 0.40999871492385864, |
|
"learning_rate": 3.1434343359132565e-06, |
|
"loss": 0.0011, |
|
"num_input_tokens_seen": 4682016, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 4.337892196299276, |
|
"grad_norm": 0.1305094212293625, |
|
"learning_rate": 3.099965009006415e-06, |
|
"loss": 0.0008, |
|
"num_input_tokens_seen": 4688912, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 4.3443282381335475, |
|
"grad_norm": 1.6623185873031616, |
|
"learning_rate": 3.056302334890786e-06, |
|
"loss": 0.0056, |
|
"num_input_tokens_seen": 4695936, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.3507642799678194, |
|
"grad_norm": 1.034837007522583, |
|
"learning_rate": 3.0124603844805767e-06, |
|
"loss": 0.0079, |
|
"num_input_tokens_seen": 4703184, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 4.357200321802091, |
|
"grad_norm": 2.2049107551574707, |
|
"learning_rate": 2.9684532864643123e-06, |
|
"loss": 0.0216, |
|
"num_input_tokens_seen": 4710064, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"grad_norm": 4.32258939743042, |
|
"learning_rate": 2.9242952227516726e-06, |
|
"loss": 0.0258, |
|
"num_input_tokens_seen": 4716336, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 4.370072405470635, |
|
"grad_norm": 1.0949031114578247, |
|
"learning_rate": 2.8800004239031687e-06, |
|
"loss": 0.0049, |
|
"num_input_tokens_seen": 4723360, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 4.376508447304907, |
|
"grad_norm": 1.563004493713379, |
|
"learning_rate": 2.835583164544139e-06, |
|
"loss": 0.0034, |
|
"num_input_tokens_seen": 4730464, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.382944489139179, |
|
"grad_norm": 2.775270938873291, |
|
"learning_rate": 2.791057758764557e-06, |
|
"loss": 0.0341, |
|
"num_input_tokens_seen": 4737056, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 4.389380530973451, |
|
"grad_norm": 3.1517560482025146, |
|
"learning_rate": 2.7464385555061092e-06, |
|
"loss": 0.0074, |
|
"num_input_tokens_seen": 4743936, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 4.395816572807723, |
|
"grad_norm": 1.2521913051605225, |
|
"learning_rate": 2.7017399339380435e-06, |
|
"loss": 0.0272, |
|
"num_input_tokens_seen": 4751024, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 4.402252614641995, |
|
"grad_norm": 3.4706435203552246, |
|
"learning_rate": 2.6569762988232838e-06, |
|
"loss": 0.0168, |
|
"num_input_tokens_seen": 4758000, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 4.408688656476267, |
|
"grad_norm": 0.8021034598350525, |
|
"learning_rate": 2.6121620758762877e-06, |
|
"loss": 0.0047, |
|
"num_input_tokens_seen": 4764816, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.415124698310539, |
|
"grad_norm": 4.709753036499023, |
|
"learning_rate": 2.5673117071141574e-06, |
|
"loss": 0.0198, |
|
"num_input_tokens_seen": 4772144, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 4.421560740144811, |
|
"grad_norm": 0.40973323583602905, |
|
"learning_rate": 2.522439646202495e-06, |
|
"loss": 0.0012, |
|
"num_input_tokens_seen": 4778960, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 4.427996781979083, |
|
"grad_norm": 3.179236888885498, |
|
"learning_rate": 2.4775603537975055e-06, |
|
"loss": 0.0256, |
|
"num_input_tokens_seen": 4785952, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 4.434432823813355, |
|
"grad_norm": 2.5204341411590576, |
|
"learning_rate": 2.4326882928858435e-06, |
|
"loss": 0.0187, |
|
"num_input_tokens_seen": 4792608, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 4.4408688656476265, |
|
"grad_norm": 3.6536998748779297, |
|
"learning_rate": 2.3878379241237136e-06, |
|
"loss": 0.0135, |
|
"num_input_tokens_seen": 4799232, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.447304907481898, |
|
"grad_norm": 1.0689839124679565, |
|
"learning_rate": 2.3430237011767166e-06, |
|
"loss": 0.0036, |
|
"num_input_tokens_seen": 4806080, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 4.45374094931617, |
|
"grad_norm": 2.071629762649536, |
|
"learning_rate": 2.2982600660619574e-06, |
|
"loss": 0.0135, |
|
"num_input_tokens_seen": 4813728, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 4.460176991150442, |
|
"grad_norm": 3.4168224334716797, |
|
"learning_rate": 2.253561444493891e-06, |
|
"loss": 0.0046, |
|
"num_input_tokens_seen": 4820608, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 4.466613032984714, |
|
"grad_norm": 0.3058677017688751, |
|
"learning_rate": 2.2089422412354434e-06, |
|
"loss": 0.0019, |
|
"num_input_tokens_seen": 4827056, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 4.473049074818986, |
|
"grad_norm": 0.4175882935523987, |
|
"learning_rate": 2.1644168354558623e-06, |
|
"loss": 0.0022, |
|
"num_input_tokens_seen": 4834080, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.479485116653258, |
|
"grad_norm": 0.7226863503456116, |
|
"learning_rate": 2.119999576096832e-06, |
|
"loss": 0.0093, |
|
"num_input_tokens_seen": 4840912, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 4.48592115848753, |
|
"grad_norm": 0.1190720871090889, |
|
"learning_rate": 2.0757047772483278e-06, |
|
"loss": 0.0012, |
|
"num_input_tokens_seen": 4848112, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 4.492357200321802, |
|
"grad_norm": 1.0061287879943848, |
|
"learning_rate": 2.031546713535688e-06, |
|
"loss": 0.0036, |
|
"num_input_tokens_seen": 4855072, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 4.498793242156074, |
|
"grad_norm": 0.9472126364707947, |
|
"learning_rate": 1.987539615519424e-06, |
|
"loss": 0.0071, |
|
"num_input_tokens_seen": 4862064, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 4.505229283990346, |
|
"grad_norm": 0.8338857889175415, |
|
"learning_rate": 1.9436976651092143e-06, |
|
"loss": 0.0055, |
|
"num_input_tokens_seen": 4869104, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.511665325824618, |
|
"grad_norm": 3.2061474323272705, |
|
"learning_rate": 1.9000349909935852e-06, |
|
"loss": 0.0291, |
|
"num_input_tokens_seen": 4876112, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 4.51810136765889, |
|
"grad_norm": 3.644125461578369, |
|
"learning_rate": 1.8565656640867448e-06, |
|
"loss": 0.0407, |
|
"num_input_tokens_seen": 4883264, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 4.524537409493162, |
|
"grad_norm": 2.2370316982269287, |
|
"learning_rate": 1.813303692994025e-06, |
|
"loss": 0.0245, |
|
"num_input_tokens_seen": 4890192, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 4.530973451327434, |
|
"grad_norm": 3.3120510578155518, |
|
"learning_rate": 1.770263019497417e-06, |
|
"loss": 0.0207, |
|
"num_input_tokens_seen": 4897200, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 4.5374094931617055, |
|
"grad_norm": 1.256335973739624, |
|
"learning_rate": 1.7274575140626318e-06, |
|
"loss": 0.0269, |
|
"num_input_tokens_seen": 4904016, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.543845534995977, |
|
"grad_norm": 0.10977872461080551, |
|
"learning_rate": 1.6849009713691456e-06, |
|
"loss": 0.001, |
|
"num_input_tokens_seen": 4910944, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 4.550281576830249, |
|
"grad_norm": 1.9825077056884766, |
|
"learning_rate": 1.6426071058646718e-06, |
|
"loss": 0.0205, |
|
"num_input_tokens_seen": 4917424, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 4.556717618664521, |
|
"grad_norm": 0.7529383897781372, |
|
"learning_rate": 1.6005895473454836e-06, |
|
"loss": 0.0148, |
|
"num_input_tokens_seen": 4924288, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 4.563153660498793, |
|
"grad_norm": 2.29215145111084, |
|
"learning_rate": 1.55886183656402e-06, |
|
"loss": 0.0239, |
|
"num_input_tokens_seen": 4931040, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 4.569589702333065, |
|
"grad_norm": 1.639636754989624, |
|
"learning_rate": 1.5174374208651913e-06, |
|
"loss": 0.0165, |
|
"num_input_tokens_seen": 4937968, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.576025744167337, |
|
"grad_norm": 1.8043317794799805, |
|
"learning_rate": 1.4763296498527744e-06, |
|
"loss": 0.0079, |
|
"num_input_tokens_seen": 4945456, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 4.582461786001609, |
|
"grad_norm": 1.8007737398147583, |
|
"learning_rate": 1.4355517710873184e-06, |
|
"loss": 0.0338, |
|
"num_input_tokens_seen": 4952080, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 4.588897827835881, |
|
"grad_norm": 0.6810876131057739, |
|
"learning_rate": 1.395116925816934e-06, |
|
"loss": 0.0136, |
|
"num_input_tokens_seen": 4958944, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 4.595333869670153, |
|
"grad_norm": 1.0080180168151855, |
|
"learning_rate": 1.3550381447423317e-06, |
|
"loss": 0.0126, |
|
"num_input_tokens_seen": 4966320, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 4.601769911504425, |
|
"grad_norm": 1.1210750341415405, |
|
"learning_rate": 1.3153283438175036e-06, |
|
"loss": 0.0174, |
|
"num_input_tokens_seen": 4973344, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 4.608205953338697, |
|
"grad_norm": 2.2793147563934326, |
|
"learning_rate": 1.27600032008737e-06, |
|
"loss": 0.0155, |
|
"num_input_tokens_seen": 4980304, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 4.614641995172969, |
|
"grad_norm": 2.0746471881866455, |
|
"learning_rate": 1.2370667475637474e-06, |
|
"loss": 0.0349, |
|
"num_input_tokens_seen": 4987616, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 4.621078037007241, |
|
"grad_norm": 1.9974377155303955, |
|
"learning_rate": 1.1985401731409793e-06, |
|
"loss": 0.0082, |
|
"num_input_tokens_seen": 4994656, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 4.627514078841513, |
|
"grad_norm": 0.9225305914878845, |
|
"learning_rate": 1.160433012552508e-06, |
|
"loss": 0.0204, |
|
"num_input_tokens_seen": 5001776, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 4.6339501206757845, |
|
"grad_norm": 0.6030845642089844, |
|
"learning_rate": 1.122757546369744e-06, |
|
"loss": 0.0074, |
|
"num_input_tokens_seen": 5008688, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.640386162510056, |
|
"grad_norm": 1.1969950199127197, |
|
"learning_rate": 1.085525916044464e-06, |
|
"loss": 0.0154, |
|
"num_input_tokens_seen": 5015680, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 4.646822204344328, |
|
"grad_norm": 1.7312675714492798, |
|
"learning_rate": 1.048750119996066e-06, |
|
"loss": 0.0101, |
|
"num_input_tokens_seen": 5022336, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 4.6532582461786, |
|
"grad_norm": 0.9403418898582458, |
|
"learning_rate": 1.0124420097449077e-06, |
|
"loss": 0.0107, |
|
"num_input_tokens_seen": 5029184, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 4.659694288012872, |
|
"grad_norm": 2.2545931339263916, |
|
"learning_rate": 9.7661328609298e-07, |
|
"loss": 0.0279, |
|
"num_input_tokens_seen": 5036000, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 4.666130329847144, |
|
"grad_norm": 0.5637010931968689, |
|
"learning_rate": 9.412754953531664e-07, |
|
"loss": 0.0044, |
|
"num_input_tokens_seen": 5042944, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.672566371681416, |
|
"grad_norm": 0.24136967957019806, |
|
"learning_rate": 9.064400256282757e-07, |
|
"loss": 0.0021, |
|
"num_input_tokens_seen": 5049840, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 4.679002413515688, |
|
"grad_norm": 1.0340116024017334, |
|
"learning_rate": 8.721181031410661e-07, |
|
"loss": 0.0086, |
|
"num_input_tokens_seen": 5057296, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 4.68543845534996, |
|
"grad_norm": 0.548861026763916, |
|
"learning_rate": 8.383207886164366e-07, |
|
"loss": 0.005, |
|
"num_input_tokens_seen": 5064560, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 4.691874497184232, |
|
"grad_norm": 1.089135766029358, |
|
"learning_rate": 8.050589737169485e-07, |
|
"loss": 0.0096, |
|
"num_input_tokens_seen": 5071472, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 4.698310539018504, |
|
"grad_norm": 0.3106631636619568, |
|
"learning_rate": 7.723433775328385e-07, |
|
"loss": 0.0029, |
|
"num_input_tokens_seen": 5078512, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.704746580852776, |
|
"grad_norm": 1.3499066829681396, |
|
"learning_rate": 7.401845431276378e-07, |
|
"loss": 0.0082, |
|
"num_input_tokens_seen": 5085248, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 4.711182622687048, |
|
"grad_norm": 0.30332618951797485, |
|
"learning_rate": 7.085928341405193e-07, |
|
"loss": 0.0033, |
|
"num_input_tokens_seen": 5092160, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 4.71761866452132, |
|
"grad_norm": 0.7549375295639038, |
|
"learning_rate": 6.775784314464717e-07, |
|
"loss": 0.0253, |
|
"num_input_tokens_seen": 5099360, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 4.7240547063555915, |
|
"grad_norm": 1.567395567893982, |
|
"learning_rate": 6.471513298753634e-07, |
|
"loss": 0.0117, |
|
"num_input_tokens_seen": 5106160, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 4.7304907481898635, |
|
"grad_norm": 1.192610502243042, |
|
"learning_rate": 6.17321334990973e-07, |
|
"loss": 0.0052, |
|
"num_input_tokens_seen": 5113264, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 4.736926790024135, |
|
"grad_norm": 3.9402077198028564, |
|
"learning_rate": 5.880980599310041e-07, |
|
"loss": 0.0305, |
|
"num_input_tokens_seen": 5120032, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 4.743362831858407, |
|
"grad_norm": 0.3623356223106384, |
|
"learning_rate": 5.59490922309118e-07, |
|
"loss": 0.0018, |
|
"num_input_tokens_seen": 5127280, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 4.749798873692679, |
|
"grad_norm": 0.815592885017395, |
|
"learning_rate": 5.3150914117997e-07, |
|
"loss": 0.0066, |
|
"num_input_tokens_seen": 5134400, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 4.756234915526951, |
|
"grad_norm": 0.4423564076423645, |
|
"learning_rate": 5.041617340682467e-07, |
|
"loss": 0.0032, |
|
"num_input_tokens_seen": 5141488, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 4.762670957361223, |
|
"grad_norm": 0.5768114924430847, |
|
"learning_rate": 4.774575140626317e-07, |
|
"loss": 0.0089, |
|
"num_input_tokens_seen": 5148432, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.769106999195495, |
|
"grad_norm": 1.2286343574523926, |
|
"learning_rate": 4.514050869756703e-07, |
|
"loss": 0.0124, |
|
"num_input_tokens_seen": 5155328, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 4.775543041029767, |
|
"grad_norm": 0.552872359752655, |
|
"learning_rate": 4.2601284857042263e-07, |
|
"loss": 0.0022, |
|
"num_input_tokens_seen": 5163008, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 4.781979082864039, |
|
"grad_norm": 0.6165493726730347, |
|
"learning_rate": 4.012889818548069e-07, |
|
"loss": 0.0063, |
|
"num_input_tokens_seen": 5170096, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 4.788415124698311, |
|
"grad_norm": 1.1403653621673584, |
|
"learning_rate": 3.772414544445163e-07, |
|
"loss": 0.0149, |
|
"num_input_tokens_seen": 5177536, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 4.794851166532583, |
|
"grad_norm": 0.1795167326927185, |
|
"learning_rate": 3.538780159953348e-07, |
|
"loss": 0.0012, |
|
"num_input_tokens_seen": 5184608, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 4.801287208366855, |
|
"grad_norm": 0.9326004981994629, |
|
"learning_rate": 3.312061957057061e-07, |
|
"loss": 0.0127, |
|
"num_input_tokens_seen": 5191344, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 4.807723250201127, |
|
"grad_norm": 0.41363996267318726, |
|
"learning_rate": 3.092332998903416e-07, |
|
"loss": 0.0018, |
|
"num_input_tokens_seen": 5198416, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 4.814159292035399, |
|
"grad_norm": 0.538027286529541, |
|
"learning_rate": 2.8796640962565374e-07, |
|
"loss": 0.0034, |
|
"num_input_tokens_seen": 5205392, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 4.8205953338696705, |
|
"grad_norm": 1.531555414199829, |
|
"learning_rate": 2.674123784677868e-07, |
|
"loss": 0.0137, |
|
"num_input_tokens_seen": 5213216, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 4.8270313757039425, |
|
"grad_norm": 1.671035647392273, |
|
"learning_rate": 2.4757783024395244e-07, |
|
"loss": 0.0219, |
|
"num_input_tokens_seen": 5220032, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.833467417538214, |
|
"grad_norm": 0.30722492933273315, |
|
"learning_rate": 2.284691569178138e-07, |
|
"loss": 0.0014, |
|
"num_input_tokens_seen": 5226816, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 4.839903459372486, |
|
"grad_norm": 1.3107943534851074, |
|
"learning_rate": 2.100925165295839e-07, |
|
"loss": 0.019, |
|
"num_input_tokens_seen": 5233920, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 4.846339501206758, |
|
"grad_norm": 2.1163885593414307, |
|
"learning_rate": 1.9245383121150678e-07, |
|
"loss": 0.0075, |
|
"num_input_tokens_seen": 5241344, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 4.85277554304103, |
|
"grad_norm": 1.2636387348175049, |
|
"learning_rate": 1.7555878527937164e-07, |
|
"loss": 0.0078, |
|
"num_input_tokens_seen": 5248256, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 4.859211584875302, |
|
"grad_norm": 4.166254997253418, |
|
"learning_rate": 1.59412823400657e-07, |
|
"loss": 0.0244, |
|
"num_input_tokens_seen": 5255248, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 4.865647626709574, |
|
"grad_norm": 1.078273892402649, |
|
"learning_rate": 1.4402114883991318e-07, |
|
"loss": 0.0218, |
|
"num_input_tokens_seen": 5262048, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 4.872083668543846, |
|
"grad_norm": 2.091312885284424, |
|
"learning_rate": 1.2938872178193395e-07, |
|
"loss": 0.0044, |
|
"num_input_tokens_seen": 5268848, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 4.878519710378118, |
|
"grad_norm": 1.7236751317977905, |
|
"learning_rate": 1.1552025773327008e-07, |
|
"loss": 0.0122, |
|
"num_input_tokens_seen": 5275664, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 4.88495575221239, |
|
"grad_norm": 0.9874201416969299, |
|
"learning_rate": 1.0242022600258611e-07, |
|
"loss": 0.007, |
|
"num_input_tokens_seen": 5282112, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 4.891391794046662, |
|
"grad_norm": 0.6303602457046509, |
|
"learning_rate": 9.00928482603669e-08, |
|
"loss": 0.0019, |
|
"num_input_tokens_seen": 5288912, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.897827835880933, |
|
"grad_norm": 0.7971038818359375, |
|
"learning_rate": 7.854209717842231e-08, |
|
"loss": 0.0147, |
|
"num_input_tokens_seen": 5295920, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 4.904263877715205, |
|
"grad_norm": 1.0757670402526855, |
|
"learning_rate": 6.777169514963766e-08, |
|
"loss": 0.0087, |
|
"num_input_tokens_seen": 5302816, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 4.910699919549477, |
|
"grad_norm": 1.8044992685317993, |
|
"learning_rate": 5.778511308838108e-08, |
|
"loss": 0.0085, |
|
"num_input_tokens_seen": 5309680, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 4.917135961383749, |
|
"grad_norm": 0.3801545202732086, |
|
"learning_rate": 4.8585569311949966e-08, |
|
"loss": 0.0026, |
|
"num_input_tokens_seen": 5316848, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 4.923572003218021, |
|
"grad_norm": 0.20918627083301544, |
|
"learning_rate": 4.017602850342584e-08, |
|
"loss": 0.0018, |
|
"num_input_tokens_seen": 5323760, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 4.9300080450522925, |
|
"grad_norm": 2.037950277328491, |
|
"learning_rate": 3.2559200756260845e-08, |
|
"loss": 0.0072, |
|
"num_input_tokens_seen": 5330336, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 4.936444086886564, |
|
"grad_norm": 0.8903030753135681, |
|
"learning_rate": 2.5737540700912777e-08, |
|
"loss": 0.0079, |
|
"num_input_tokens_seen": 5336816, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 4.942880128720836, |
|
"grad_norm": 1.0508862733840942, |
|
"learning_rate": 1.9713246713805588e-08, |
|
"loss": 0.0275, |
|
"num_input_tokens_seen": 5344064, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 4.949316170555108, |
|
"grad_norm": 1.0068142414093018, |
|
"learning_rate": 1.4488260208871397e-08, |
|
"loss": 0.0036, |
|
"num_input_tokens_seen": 5351328, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 4.95575221238938, |
|
"grad_norm": 1.5033273696899414, |
|
"learning_rate": 1.006426501190233e-08, |
|
"loss": 0.0501, |
|
"num_input_tokens_seen": 5358672, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 4.962188254223652, |
|
"grad_norm": 0.667352557182312, |
|
"learning_rate": 6.442686817914878e-09, |
|
"loss": 0.0082, |
|
"num_input_tokens_seen": 5365648, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 4.968624296057924, |
|
"grad_norm": 0.9037322998046875, |
|
"learning_rate": 3.6246927316976875e-09, |
|
"loss": 0.0032, |
|
"num_input_tokens_seen": 5372432, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 4.975060337892196, |
|
"grad_norm": 0.3071233630180359, |
|
"learning_rate": 1.6111908916965902e-09, |
|
"loss": 0.0017, |
|
"num_input_tokens_seen": 5379648, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 4.981496379726468, |
|
"grad_norm": 0.7171315550804138, |
|
"learning_rate": 4.0283017735454066e-10, |
|
"loss": 0.0042, |
|
"num_input_tokens_seen": 5386864, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 4.98793242156074, |
|
"grad_norm": 2.855295181274414, |
|
"learning_rate": 0.0, |
|
"loss": 0.0176, |
|
"num_input_tokens_seen": 5393616, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 4.98793242156074, |
|
"num_input_tokens_seen": 5393616, |
|
"step": 775, |
|
"total_flos": 2.1382484588285133e+17, |
|
"train_loss": 0.5414434323177463, |
|
"train_runtime": 8640.8816, |
|
"train_samples_per_second": 11.503, |
|
"train_steps_per_second": 0.09 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 775, |
|
"num_input_tokens_seen": 5393616, |
|
"num_train_epochs": 5, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.1382484588285133e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|