|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0564542753383954, |
|
"eval_steps": 200, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002641135688345989, |
|
"grad_norm": 38990.80078125, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 39.1249, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002641135688345989, |
|
"eval_loss": 10.096823692321777, |
|
"eval_runtime": 2.1873, |
|
"eval_samples_per_second": 226.308, |
|
"eval_steps_per_second": 28.346, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005282271376691978, |
|
"grad_norm": 22047.08203125, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 36.867, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007923407065037967, |
|
"grad_norm": 81682.6796875, |
|
"learning_rate": 3e-06, |
|
"loss": 39.9853, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010564542753383956, |
|
"grad_norm": 26123.87109375, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 38.4879, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013205678441729944, |
|
"grad_norm": 29102.25390625, |
|
"learning_rate": 5e-06, |
|
"loss": 37.9034, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015846814130075933, |
|
"grad_norm": 24646.03125, |
|
"learning_rate": 6e-06, |
|
"loss": 37.7639, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01848794981842192, |
|
"grad_norm": 56427.59375, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 37.4074, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02112908550676791, |
|
"grad_norm": 28639.47265625, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 37.1164, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0237702211951139, |
|
"grad_norm": 18426.78515625, |
|
"learning_rate": 9e-06, |
|
"loss": 37.2912, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.02641135688345989, |
|
"grad_norm": 29681.62109375, |
|
"learning_rate": 1e-05, |
|
"loss": 37.1851, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.029052492571805876, |
|
"grad_norm": 61535.3671875, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 38.4262, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03169362826015187, |
|
"grad_norm": 28930.455078125, |
|
"learning_rate": 1.2e-05, |
|
"loss": 37.3699, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.034334763948497854, |
|
"grad_norm": 31548.685546875, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 36.4952, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03697589963684384, |
|
"grad_norm": 18435.33203125, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 36.2176, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03961703532518983, |
|
"grad_norm": 28708.28515625, |
|
"learning_rate": 1.5e-05, |
|
"loss": 36.6194, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04225817101353582, |
|
"grad_norm": 20564.423828125, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 36.3482, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04489930670188181, |
|
"grad_norm": 17695.943359375, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 35.715, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0475404423902278, |
|
"grad_norm": 15335.0712890625, |
|
"learning_rate": 1.8e-05, |
|
"loss": 34.6268, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.050181578078573784, |
|
"grad_norm": 13583.33203125, |
|
"learning_rate": 1.9e-05, |
|
"loss": 35.5606, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05282271376691978, |
|
"grad_norm": 26019.890625, |
|
"learning_rate": 2e-05, |
|
"loss": 35.5296, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.055463849455265765, |
|
"grad_norm": 14379.9052734375, |
|
"learning_rate": 2.1e-05, |
|
"loss": 34.4806, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.05810498514361175, |
|
"grad_norm": 27736.314453125, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 36.2501, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06074612083195774, |
|
"grad_norm": 260855.0, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 150.3672, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06338725652030373, |
|
"grad_norm": 422515.90625, |
|
"learning_rate": 2.4e-05, |
|
"loss": 292.9531, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06602839220864971, |
|
"grad_norm": 1018015.1875, |
|
"learning_rate": 2.5e-05, |
|
"loss": 355.2773, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06866952789699571, |
|
"grad_norm": 817252.75, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 268.3635, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0713106635853417, |
|
"grad_norm": 650477.3125, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 304.9252, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07395179927368768, |
|
"grad_norm": 405537.28125, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 298.1852, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07659293496203368, |
|
"grad_norm": 1309428.0, |
|
"learning_rate": 2.9e-05, |
|
"loss": 405.8535, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07923407065037966, |
|
"grad_norm": 574544.0625, |
|
"learning_rate": 3e-05, |
|
"loss": 341.7129, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08187520633872565, |
|
"grad_norm": 425538.625, |
|
"learning_rate": 3.1e-05, |
|
"loss": 260.0619, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08451634202707164, |
|
"grad_norm": 566855.6875, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 272.2906, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08715747771541763, |
|
"grad_norm": 300651.28125, |
|
"learning_rate": 3.3e-05, |
|
"loss": 79.3429, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08979861340376362, |
|
"grad_norm": 14083.6494140625, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 35.4547, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09243974909210961, |
|
"grad_norm": 22081.201171875, |
|
"learning_rate": 3.5e-05, |
|
"loss": 34.5699, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0950808847804556, |
|
"grad_norm": 11022.6884765625, |
|
"learning_rate": 3.6e-05, |
|
"loss": 34.3495, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09772202046880159, |
|
"grad_norm": 11900.990234375, |
|
"learning_rate": 3.7e-05, |
|
"loss": 35.6864, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10036315615714757, |
|
"grad_norm": 13156.771484375, |
|
"learning_rate": 3.8e-05, |
|
"loss": 34.4444, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10300429184549356, |
|
"grad_norm": 11813.6083984375, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 34.9737, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10564542753383956, |
|
"grad_norm": 15030.2021484375, |
|
"learning_rate": 4e-05, |
|
"loss": 34.0348, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10828656322218554, |
|
"grad_norm": 11196.2529296875, |
|
"learning_rate": 4.1e-05, |
|
"loss": 34.3456, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11092769891053153, |
|
"grad_norm": 11016.130859375, |
|
"learning_rate": 4.2e-05, |
|
"loss": 34.3275, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11356883459887751, |
|
"grad_norm": 14342.5283203125, |
|
"learning_rate": 4.3e-05, |
|
"loss": 33.6461, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.1162099702872235, |
|
"grad_norm": 13592.828125, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 35.0608, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1188511059755695, |
|
"grad_norm": 14278.0205078125, |
|
"learning_rate": 4.5e-05, |
|
"loss": 33.6034, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.12149224166391548, |
|
"grad_norm": 15676.8076171875, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 34.7689, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12413337735226147, |
|
"grad_norm": 16533.037109375, |
|
"learning_rate": 4.7e-05, |
|
"loss": 34.1397, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12677451304060747, |
|
"grad_norm": 17516.21484375, |
|
"learning_rate": 4.8e-05, |
|
"loss": 35.4853, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12941564872895345, |
|
"grad_norm": 22093.806640625, |
|
"learning_rate": 4.9e-05, |
|
"loss": 36.8646, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13205678441729943, |
|
"grad_norm": 34389.921875, |
|
"learning_rate": 5e-05, |
|
"loss": 39.5266, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13469792010564544, |
|
"grad_norm": 7149.9775390625, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 35.8088, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13733905579399142, |
|
"grad_norm": 6511.89306640625, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 35.0745, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1399801914823374, |
|
"grad_norm": 11293.515625, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 33.7064, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.1426213271706834, |
|
"grad_norm": 7394.4853515625, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 34.612, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14526246285902938, |
|
"grad_norm": 7513.56982421875, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 34.2795, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14790359854737536, |
|
"grad_norm": 12561.0849609375, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 34.1079, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15054473423572137, |
|
"grad_norm": 7255.42724609375, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 33.7773, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.15318586992406735, |
|
"grad_norm": 8305.197265625, |
|
"learning_rate": 5.8e-05, |
|
"loss": 33.5425, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.15582700561241333, |
|
"grad_norm": 7724.32666015625, |
|
"learning_rate": 5.9e-05, |
|
"loss": 34.3069, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.1584681413007593, |
|
"grad_norm": 6973.86669921875, |
|
"learning_rate": 6e-05, |
|
"loss": 31.8323, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16110927698910532, |
|
"grad_norm": 8178.408203125, |
|
"learning_rate": 6.1e-05, |
|
"loss": 33.4728, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1637504126774513, |
|
"grad_norm": 7446.3310546875, |
|
"learning_rate": 6.2e-05, |
|
"loss": 32.2049, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.16639154836579728, |
|
"grad_norm": 7538.81494140625, |
|
"learning_rate": 6.3e-05, |
|
"loss": 31.9451, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1690326840541433, |
|
"grad_norm": 7067.33154296875, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 32.0696, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17167381974248927, |
|
"grad_norm": 7199.02294921875, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 31.7234, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.17431495543083525, |
|
"grad_norm": 6351.2900390625, |
|
"learning_rate": 6.6e-05, |
|
"loss": 31.4103, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.17695609111918126, |
|
"grad_norm": 9954.1572265625, |
|
"learning_rate": 6.7e-05, |
|
"loss": 31.2581, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17959722680752724, |
|
"grad_norm": 6812.11083984375, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 31.0586, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18223836249587322, |
|
"grad_norm": 6788.81787109375, |
|
"learning_rate": 6.9e-05, |
|
"loss": 31.2012, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.18487949818421923, |
|
"grad_norm": 6330.77880859375, |
|
"learning_rate": 7e-05, |
|
"loss": 31.758, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1875206338725652, |
|
"grad_norm": 6925.2958984375, |
|
"learning_rate": 7.1e-05, |
|
"loss": 31.7811, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.1901617695609112, |
|
"grad_norm": 15530.548828125, |
|
"learning_rate": 7.2e-05, |
|
"loss": 38.1656, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.19280290524925717, |
|
"grad_norm": 248175.0, |
|
"learning_rate": 7.3e-05, |
|
"loss": 314.6807, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.19544404093760318, |
|
"grad_norm": 348192.3125, |
|
"learning_rate": 7.4e-05, |
|
"loss": 319.9785, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.19808517662594916, |
|
"grad_norm": 399153.90625, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 245.481, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.20072631231429514, |
|
"grad_norm": 293389.5, |
|
"learning_rate": 7.6e-05, |
|
"loss": 269.3301, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.20336744800264114, |
|
"grad_norm": 536027.375, |
|
"learning_rate": 7.7e-05, |
|
"loss": 240.3848, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.20600858369098712, |
|
"grad_norm": 340636.96875, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 257.0401, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2086497193793331, |
|
"grad_norm": 343861.0625, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 246.5806, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2112908550676791, |
|
"grad_norm": 377362.75, |
|
"learning_rate": 8e-05, |
|
"loss": 204.2622, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2139319907560251, |
|
"grad_norm": 495172.15625, |
|
"learning_rate": 8.1e-05, |
|
"loss": 152.9565, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.21657312644437107, |
|
"grad_norm": 410514.21875, |
|
"learning_rate": 8.2e-05, |
|
"loss": 120.1336, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.21921426213271708, |
|
"grad_norm": 37318.89453125, |
|
"learning_rate": 8.3e-05, |
|
"loss": 43.507, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.22185539782106306, |
|
"grad_norm": 46563.8515625, |
|
"learning_rate": 8.4e-05, |
|
"loss": 42.6661, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22449653350940904, |
|
"grad_norm": 25882.45703125, |
|
"learning_rate": 8.5e-05, |
|
"loss": 41.1904, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.22713766919775502, |
|
"grad_norm": 21462.017578125, |
|
"learning_rate": 8.6e-05, |
|
"loss": 35.6957, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.22977880488610103, |
|
"grad_norm": 11826.3798828125, |
|
"learning_rate": 8.7e-05, |
|
"loss": 33.1654, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.232419940574447, |
|
"grad_norm": 10408.4365234375, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 32.1405, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.235061076262793, |
|
"grad_norm": 9028.2587890625, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 30.806, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.237702211951139, |
|
"grad_norm": 14064.7021484375, |
|
"learning_rate": 9e-05, |
|
"loss": 32.0673, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.24034334763948498, |
|
"grad_norm": 10274.6611328125, |
|
"learning_rate": 9.1e-05, |
|
"loss": 30.8923, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.24298448332783096, |
|
"grad_norm": 13376.0947265625, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 32.0376, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.24562561901617697, |
|
"grad_norm": 13412.4970703125, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 32.5937, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.24826675470452295, |
|
"grad_norm": 17289.099609375, |
|
"learning_rate": 9.4e-05, |
|
"loss": 32.0219, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2509078903928689, |
|
"grad_norm": 10165.4990234375, |
|
"learning_rate": 9.5e-05, |
|
"loss": 32.7753, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.25354902608121493, |
|
"grad_norm": 16371.439453125, |
|
"learning_rate": 9.6e-05, |
|
"loss": 31.7399, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2561901617695609, |
|
"grad_norm": 28360.642578125, |
|
"learning_rate": 9.7e-05, |
|
"loss": 32.4525, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.2588312974579069, |
|
"grad_norm": 19952.9296875, |
|
"learning_rate": 9.8e-05, |
|
"loss": 33.4285, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2614724331462529, |
|
"grad_norm": 20724.11328125, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 34.1331, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.26411356883459886, |
|
"grad_norm": 38431.6328125, |
|
"learning_rate": 0.0001, |
|
"loss": 40.0183, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26675470452294486, |
|
"grad_norm": 9295.7626953125, |
|
"learning_rate": 9.99999993018897e-05, |
|
"loss": 32.908, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.26939584021129087, |
|
"grad_norm": 9068.3134765625, |
|
"learning_rate": 9.999999720755877e-05, |
|
"loss": 32.7796, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2720369758996368, |
|
"grad_norm": 9507.033203125, |
|
"learning_rate": 9.99999937170073e-05, |
|
"loss": 33.0735, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.27467811158798283, |
|
"grad_norm": 9898.73046875, |
|
"learning_rate": 9.999998883023537e-05, |
|
"loss": 34.4524, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.27731924727632884, |
|
"grad_norm": 8197.7294921875, |
|
"learning_rate": 9.999998254724313e-05, |
|
"loss": 34.3429, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.2799603829646748, |
|
"grad_norm": 7723.392578125, |
|
"learning_rate": 9.999997486803075e-05, |
|
"loss": 33.3132, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2826015186530208, |
|
"grad_norm": 9174.4091796875, |
|
"learning_rate": 9.999996579259843e-05, |
|
"loss": 32.9465, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2852426543413668, |
|
"grad_norm": 10098.0283203125, |
|
"learning_rate": 9.999995532094644e-05, |
|
"loss": 34.1124, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.28788379002971276, |
|
"grad_norm": 7904.126953125, |
|
"learning_rate": 9.999994345307508e-05, |
|
"loss": 32.68, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.29052492571805877, |
|
"grad_norm": 7395.32177734375, |
|
"learning_rate": 9.999993018898466e-05, |
|
"loss": 32.1147, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2931660614064048, |
|
"grad_norm": 15490.7314453125, |
|
"learning_rate": 9.999991552867558e-05, |
|
"loss": 32.7157, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.29580719709475073, |
|
"grad_norm": 6962.9326171875, |
|
"learning_rate": 9.99998994721482e-05, |
|
"loss": 32.5103, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.29844833278309674, |
|
"grad_norm": 6746.60546875, |
|
"learning_rate": 9.999988201940302e-05, |
|
"loss": 31.9245, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.30108946847144274, |
|
"grad_norm": 6943.94140625, |
|
"learning_rate": 9.999986317044051e-05, |
|
"loss": 30.399, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3037306041597887, |
|
"grad_norm": 6095.4384765625, |
|
"learning_rate": 9.999984292526118e-05, |
|
"loss": 29.941, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3063717398481347, |
|
"grad_norm": 6518.970703125, |
|
"learning_rate": 9.999982128386562e-05, |
|
"loss": 30.5093, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3090128755364807, |
|
"grad_norm": 5806.0927734375, |
|
"learning_rate": 9.99997982462544e-05, |
|
"loss": 29.6937, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.31165401122482667, |
|
"grad_norm": 6216.46435546875, |
|
"learning_rate": 9.999977381242821e-05, |
|
"loss": 29.7115, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3142951469131727, |
|
"grad_norm": 5445.48828125, |
|
"learning_rate": 9.999974798238769e-05, |
|
"loss": 28.9644, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3169362826015186, |
|
"grad_norm": 4930.64453125, |
|
"learning_rate": 9.99997207561336e-05, |
|
"loss": 29.803, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.31957741828986463, |
|
"grad_norm": 5866.5478515625, |
|
"learning_rate": 9.999969213366667e-05, |
|
"loss": 29.2732, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.32221855397821064, |
|
"grad_norm": 13160.4111328125, |
|
"learning_rate": 9.99996621149877e-05, |
|
"loss": 31.3179, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3248596896665566, |
|
"grad_norm": 577362.4375, |
|
"learning_rate": 9.999963070009755e-05, |
|
"loss": 192.1116, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3275008253549026, |
|
"grad_norm": 447577.625, |
|
"learning_rate": 9.999959788899706e-05, |
|
"loss": 353.353, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3301419610432486, |
|
"grad_norm": 422884.03125, |
|
"learning_rate": 9.999956368168719e-05, |
|
"loss": 328.2871, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.33278309673159456, |
|
"grad_norm": 217372.875, |
|
"learning_rate": 9.999952807816888e-05, |
|
"loss": 311.041, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.33542423241994057, |
|
"grad_norm": 197269.15625, |
|
"learning_rate": 9.99994910784431e-05, |
|
"loss": 331.5454, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3380653681082866, |
|
"grad_norm": 447190.15625, |
|
"learning_rate": 9.999945268251092e-05, |
|
"loss": 292.5098, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.34070650379663253, |
|
"grad_norm": 156708.53125, |
|
"learning_rate": 9.999941289037338e-05, |
|
"loss": 329.5899, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.34334763948497854, |
|
"grad_norm": 214527.265625, |
|
"learning_rate": 9.999937170203162e-05, |
|
"loss": 295.6437, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.34598877517332455, |
|
"grad_norm": 144792.09375, |
|
"learning_rate": 9.999932911748678e-05, |
|
"loss": 321.7724, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3486299108616705, |
|
"grad_norm": 183092.328125, |
|
"learning_rate": 9.999928513674004e-05, |
|
"loss": 138.0811, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3512710465500165, |
|
"grad_norm": 6041.10107421875, |
|
"learning_rate": 9.999923975979262e-05, |
|
"loss": 30.1601, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3539121822383625, |
|
"grad_norm": 5054.18798828125, |
|
"learning_rate": 9.999919298664582e-05, |
|
"loss": 29.4563, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.35655331792670847, |
|
"grad_norm": 9742.12890625, |
|
"learning_rate": 9.999914481730092e-05, |
|
"loss": 29.5483, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3591944536150545, |
|
"grad_norm": 13321.4970703125, |
|
"learning_rate": 9.999909525175927e-05, |
|
"loss": 29.7589, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3618355893034005, |
|
"grad_norm": 9211.091796875, |
|
"learning_rate": 9.999904429002225e-05, |
|
"loss": 30.0795, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.36447672499174644, |
|
"grad_norm": 10673.2529296875, |
|
"learning_rate": 9.99989919320913e-05, |
|
"loss": 29.8073, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.36711786068009244, |
|
"grad_norm": 9673.37109375, |
|
"learning_rate": 9.999893817796786e-05, |
|
"loss": 30.8933, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.36975899636843845, |
|
"grad_norm": 10085.38671875, |
|
"learning_rate": 9.999888302765345e-05, |
|
"loss": 29.8822, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3724001320567844, |
|
"grad_norm": 10791.521484375, |
|
"learning_rate": 9.99988264811496e-05, |
|
"loss": 30.1218, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.3750412677451304, |
|
"grad_norm": 11358.93359375, |
|
"learning_rate": 9.99987685384579e-05, |
|
"loss": 30.735, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3776824034334764, |
|
"grad_norm": 7013.380859375, |
|
"learning_rate": 9.999870919957996e-05, |
|
"loss": 29.9077, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3803235391218224, |
|
"grad_norm": 7458.63525390625, |
|
"learning_rate": 9.999864846451744e-05, |
|
"loss": 30.7425, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3829646748101684, |
|
"grad_norm": 8038.50732421875, |
|
"learning_rate": 9.999858633327201e-05, |
|
"loss": 31.053, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.38560581049851433, |
|
"grad_norm": 7841.15283203125, |
|
"learning_rate": 9.999852280584544e-05, |
|
"loss": 30.7345, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.38824694618686034, |
|
"grad_norm": 7719.5048828125, |
|
"learning_rate": 9.999845788223949e-05, |
|
"loss": 30.9241, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.39088808187520635, |
|
"grad_norm": 13179.359375, |
|
"learning_rate": 9.999839156245598e-05, |
|
"loss": 31.945, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3935292175635523, |
|
"grad_norm": 11153.3046875, |
|
"learning_rate": 9.999832384649674e-05, |
|
"loss": 34.644, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3961703532518983, |
|
"grad_norm": 47252.56640625, |
|
"learning_rate": 9.999825473436369e-05, |
|
"loss": 39.1459, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3988114889402443, |
|
"grad_norm": 5307.408203125, |
|
"learning_rate": 9.999818422605875e-05, |
|
"loss": 32.3124, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.40145262462859027, |
|
"grad_norm": 8414.1484375, |
|
"learning_rate": 9.999811232158389e-05, |
|
"loss": 31.5456, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.4040937603169363, |
|
"grad_norm": 5779.16943359375, |
|
"learning_rate": 9.999803902094109e-05, |
|
"loss": 32.0291, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.4067348960052823, |
|
"grad_norm": 6989.2958984375, |
|
"learning_rate": 9.999796432413244e-05, |
|
"loss": 32.4468, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.40937603169362824, |
|
"grad_norm": 10169.005859375, |
|
"learning_rate": 9.999788823116001e-05, |
|
"loss": 33.1476, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.41201716738197425, |
|
"grad_norm": 6967.77197265625, |
|
"learning_rate": 9.999781074202592e-05, |
|
"loss": 32.4884, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.41465830307032026, |
|
"grad_norm": 8052.6611328125, |
|
"learning_rate": 9.999773185673232e-05, |
|
"loss": 33.8162, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.4172994387586662, |
|
"grad_norm": 13675.26953125, |
|
"learning_rate": 9.999765157528145e-05, |
|
"loss": 33.4981, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4199405744470122, |
|
"grad_norm": 23900.8515625, |
|
"learning_rate": 9.99975698976755e-05, |
|
"loss": 33.6758, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.4225817101353582, |
|
"grad_norm": 8697.0146484375, |
|
"learning_rate": 9.99974868239168e-05, |
|
"loss": 33.4007, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4252228458237042, |
|
"grad_norm": 7423.0234375, |
|
"learning_rate": 9.999740235400765e-05, |
|
"loss": 32.8796, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.4278639815120502, |
|
"grad_norm": 8968.0107421875, |
|
"learning_rate": 9.999731648795041e-05, |
|
"loss": 35.1091, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4305051172003962, |
|
"grad_norm": 8960.2470703125, |
|
"learning_rate": 9.999722922574749e-05, |
|
"loss": 34.6028, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.43314625288874214, |
|
"grad_norm": 9324.4716796875, |
|
"learning_rate": 9.999714056740129e-05, |
|
"loss": 35.0468, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.43578738857708815, |
|
"grad_norm": 15031.443359375, |
|
"learning_rate": 9.999705051291432e-05, |
|
"loss": 33.8078, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.43842852426543416, |
|
"grad_norm": 10380.2470703125, |
|
"learning_rate": 9.999695906228908e-05, |
|
"loss": 34.8672, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.4410696599537801, |
|
"grad_norm": 18920.16796875, |
|
"learning_rate": 9.999686621552813e-05, |
|
"loss": 34.697, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.4437107956421261, |
|
"grad_norm": 17273.609375, |
|
"learning_rate": 9.999677197263406e-05, |
|
"loss": 35.5471, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.44635193133047213, |
|
"grad_norm": 10327.810546875, |
|
"learning_rate": 9.999667633360952e-05, |
|
"loss": 33.3773, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4489930670188181, |
|
"grad_norm": 15529.2939453125, |
|
"learning_rate": 9.999657929845714e-05, |
|
"loss": 35.3255, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4516342027071641, |
|
"grad_norm": 15885.65625, |
|
"learning_rate": 9.999648086717966e-05, |
|
"loss": 35.0333, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.45427533839551004, |
|
"grad_norm": 16440.353515625, |
|
"learning_rate": 9.999638103977982e-05, |
|
"loss": 36.1782, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.45691647408385605, |
|
"grad_norm": 623966.3125, |
|
"learning_rate": 9.999627981626041e-05, |
|
"loss": 117.4766, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.45955760977220206, |
|
"grad_norm": 443642.1875, |
|
"learning_rate": 9.999617719662426e-05, |
|
"loss": 256.9298, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.462198745460548, |
|
"grad_norm": 426303.78125, |
|
"learning_rate": 9.999607318087423e-05, |
|
"loss": 213.1021, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.464839881148894, |
|
"grad_norm": 753837.9375, |
|
"learning_rate": 9.999596776901322e-05, |
|
"loss": 234.3458, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.46748101683724, |
|
"grad_norm": 500841.875, |
|
"learning_rate": 9.999586096104419e-05, |
|
"loss": 242.1502, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.470122152525586, |
|
"grad_norm": 488348.28125, |
|
"learning_rate": 9.99957527569701e-05, |
|
"loss": 259.3533, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.472763288213932, |
|
"grad_norm": 599034.6875, |
|
"learning_rate": 9.999564315679398e-05, |
|
"loss": 254.9457, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.475404423902278, |
|
"grad_norm": 740236.3125, |
|
"learning_rate": 9.99955321605189e-05, |
|
"loss": 200.2197, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.47804555959062395, |
|
"grad_norm": 279145.40625, |
|
"learning_rate": 9.999541976814796e-05, |
|
"loss": 211.6974, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.48068669527896996, |
|
"grad_norm": 565175.5625, |
|
"learning_rate": 9.999530597968428e-05, |
|
"loss": 152.1028, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.48332783096731596, |
|
"grad_norm": 29268.025390625, |
|
"learning_rate": 9.999519079513107e-05, |
|
"loss": 37.5746, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4859689666556619, |
|
"grad_norm": 19322.490234375, |
|
"learning_rate": 9.999507421449151e-05, |
|
"loss": 38.4138, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4886101023440079, |
|
"grad_norm": 27010.8203125, |
|
"learning_rate": 9.999495623776886e-05, |
|
"loss": 35.2608, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.49125123803235393, |
|
"grad_norm": 25924.7890625, |
|
"learning_rate": 9.999483686496645e-05, |
|
"loss": 38.389, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.4938923737206999, |
|
"grad_norm": 33607.66015625, |
|
"learning_rate": 9.999471609608757e-05, |
|
"loss": 35.7422, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.4965335094090459, |
|
"grad_norm": 19824.349609375, |
|
"learning_rate": 9.999459393113561e-05, |
|
"loss": 37.8325, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4991746450973919, |
|
"grad_norm": 16384.638671875, |
|
"learning_rate": 9.9994470370114e-05, |
|
"loss": 37.3911, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5018157807857379, |
|
"grad_norm": 15732.8330078125, |
|
"learning_rate": 9.999434541302616e-05, |
|
"loss": 35.9949, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5044569164740839, |
|
"grad_norm": 23623.61328125, |
|
"learning_rate": 9.99942190598756e-05, |
|
"loss": 36.3237, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5070980521624299, |
|
"grad_norm": 32387.189453125, |
|
"learning_rate": 9.999409131066583e-05, |
|
"loss": 36.7266, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5097391878507759, |
|
"grad_norm": 19656.185546875, |
|
"learning_rate": 9.999396216540044e-05, |
|
"loss": 36.9575, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5123803235391218, |
|
"grad_norm": 20705.455078125, |
|
"learning_rate": 9.999383162408304e-05, |
|
"loss": 37.2048, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5150214592274678, |
|
"grad_norm": 21470.52734375, |
|
"learning_rate": 9.999369968671723e-05, |
|
"loss": 36.1668, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5176625949158138, |
|
"grad_norm": 19358.25, |
|
"learning_rate": 9.999356635330674e-05, |
|
"loss": 33.7397, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5203037306041598, |
|
"grad_norm": 19253.916015625, |
|
"learning_rate": 9.999343162385529e-05, |
|
"loss": 36.7927, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.5229448662925058, |
|
"grad_norm": 41119.46875, |
|
"learning_rate": 9.99932954983666e-05, |
|
"loss": 36.7557, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5255860019808518, |
|
"grad_norm": 23741.87109375, |
|
"learning_rate": 9.999315797684451e-05, |
|
"loss": 38.4819, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5282271376691977, |
|
"grad_norm": 33874.09765625, |
|
"learning_rate": 9.999301905929286e-05, |
|
"loss": 42.3858, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5282271376691977, |
|
"eval_loss": 7.334134578704834, |
|
"eval_runtime": 2.2174, |
|
"eval_samples_per_second": 223.237, |
|
"eval_steps_per_second": 27.961, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5308682733575437, |
|
"grad_norm": 15801.6083984375, |
|
"learning_rate": 9.999287874571552e-05, |
|
"loss": 38.8128, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5335094090458897, |
|
"grad_norm": 12974.27734375, |
|
"learning_rate": 9.99927370361164e-05, |
|
"loss": 38.6081, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5361505447342357, |
|
"grad_norm": 12007.9013671875, |
|
"learning_rate": 9.999259393049947e-05, |
|
"loss": 37.1496, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5387916804225817, |
|
"grad_norm": 13070.220703125, |
|
"learning_rate": 9.999244942886871e-05, |
|
"loss": 38.7187, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5414328161109278, |
|
"grad_norm": 16807.220703125, |
|
"learning_rate": 9.999230353122819e-05, |
|
"loss": 41.07, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5440739517992736, |
|
"grad_norm": 14268.9052734375, |
|
"learning_rate": 9.999215623758194e-05, |
|
"loss": 40.1817, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5467150874876197, |
|
"grad_norm": 13336.4287109375, |
|
"learning_rate": 9.99920075479341e-05, |
|
"loss": 37.3859, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5493562231759657, |
|
"grad_norm": 15000.0390625, |
|
"learning_rate": 9.999185746228882e-05, |
|
"loss": 37.9181, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5519973588643117, |
|
"grad_norm": 11059.775390625, |
|
"learning_rate": 9.999170598065028e-05, |
|
"loss": 37.7867, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5546384945526577, |
|
"grad_norm": 12954.494140625, |
|
"learning_rate": 9.999155310302273e-05, |
|
"loss": 38.3371, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5572796302410036, |
|
"grad_norm": 10920.3037109375, |
|
"learning_rate": 9.999139882941043e-05, |
|
"loss": 35.1785, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5599207659293496, |
|
"grad_norm": 15022.30078125, |
|
"learning_rate": 9.999124315981766e-05, |
|
"loss": 35.528, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5625619016176956, |
|
"grad_norm": 10339.8525390625, |
|
"learning_rate": 9.999108609424881e-05, |
|
"loss": 34.5773, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5652030373060416, |
|
"grad_norm": 9615.1484375, |
|
"learning_rate": 9.999092763270823e-05, |
|
"loss": 34.6027, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5678441729943876, |
|
"grad_norm": 13707.630859375, |
|
"learning_rate": 9.999076777520037e-05, |
|
"loss": 34.8469, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5704853086827336, |
|
"grad_norm": 13718.404296875, |
|
"learning_rate": 9.99906065217297e-05, |
|
"loss": 34.0409, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5731264443710795, |
|
"grad_norm": 12160.12109375, |
|
"learning_rate": 9.99904438723007e-05, |
|
"loss": 32.9267, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5757675800594255, |
|
"grad_norm": 9693.056640625, |
|
"learning_rate": 9.999027982691793e-05, |
|
"loss": 33.0474, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5784087157477715, |
|
"grad_norm": 14817.9755859375, |
|
"learning_rate": 9.999011438558595e-05, |
|
"loss": 33.6275, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5810498514361175, |
|
"grad_norm": 12656.400390625, |
|
"learning_rate": 9.99899475483094e-05, |
|
"loss": 33.9675, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5836909871244635, |
|
"grad_norm": 17197.283203125, |
|
"learning_rate": 9.998977931509291e-05, |
|
"loss": 35.6857, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5863321228128096, |
|
"grad_norm": 215147.109375, |
|
"learning_rate": 9.998960968594121e-05, |
|
"loss": 88.1464, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5889732585011554, |
|
"grad_norm": 625456.3125, |
|
"learning_rate": 9.998943866085903e-05, |
|
"loss": 186.8345, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5916143941895015, |
|
"grad_norm": 491068.96875, |
|
"learning_rate": 9.998926623985114e-05, |
|
"loss": 158.0338, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5942555298778475, |
|
"grad_norm": 626101.125, |
|
"learning_rate": 9.998909242292235e-05, |
|
"loss": 218.7658, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5968966655661935, |
|
"grad_norm": 303837.34375, |
|
"learning_rate": 9.998891721007752e-05, |
|
"loss": 186.0703, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5995378012545395, |
|
"grad_norm": 354231.84375, |
|
"learning_rate": 9.998874060132155e-05, |
|
"loss": 162.2602, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.6021789369428855, |
|
"grad_norm": 570096.0625, |
|
"learning_rate": 9.998856259665936e-05, |
|
"loss": 165.2661, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.6048200726312314, |
|
"grad_norm": 405688.65625, |
|
"learning_rate": 9.998838319609591e-05, |
|
"loss": 159.5345, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.6074612083195774, |
|
"grad_norm": 592211.125, |
|
"learning_rate": 9.998820239963624e-05, |
|
"loss": 141.6046, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6101023440079234, |
|
"grad_norm": 678225.0625, |
|
"learning_rate": 9.998802020728537e-05, |
|
"loss": 84.9725, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.6127434796962694, |
|
"grad_norm": 22088.375, |
|
"learning_rate": 9.998783661904843e-05, |
|
"loss": 38.1227, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 19927.962890625, |
|
"learning_rate": 9.99876516349305e-05, |
|
"loss": 37.8816, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6180257510729614, |
|
"grad_norm": 33203.27734375, |
|
"learning_rate": 9.998746525493674e-05, |
|
"loss": 34.0087, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6206668867613073, |
|
"grad_norm": 10135.03515625, |
|
"learning_rate": 9.99872774790724e-05, |
|
"loss": 34.0175, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6233080224496533, |
|
"grad_norm": 11513.166015625, |
|
"learning_rate": 9.99870883073427e-05, |
|
"loss": 32.6651, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.6259491581379993, |
|
"grad_norm": 7397.00732421875, |
|
"learning_rate": 9.998689773975291e-05, |
|
"loss": 32.2064, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6285902938263453, |
|
"grad_norm": 10573.4638671875, |
|
"learning_rate": 9.998670577630838e-05, |
|
"loss": 32.1057, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6312314295146914, |
|
"grad_norm": 10578.8310546875, |
|
"learning_rate": 9.998651241701445e-05, |
|
"loss": 32.1381, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6338725652030373, |
|
"grad_norm": 9302.189453125, |
|
"learning_rate": 9.998631766187651e-05, |
|
"loss": 32.8179, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6365137008913833, |
|
"grad_norm": 8694.892578125, |
|
"learning_rate": 9.998612151090003e-05, |
|
"loss": 32.7711, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6391548365797293, |
|
"grad_norm": 10467.7099609375, |
|
"learning_rate": 9.998592396409047e-05, |
|
"loss": 33.1121, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6417959722680753, |
|
"grad_norm": 11832.251953125, |
|
"learning_rate": 9.998572502145334e-05, |
|
"loss": 32.8568, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6444371079564213, |
|
"grad_norm": 14376.9228515625, |
|
"learning_rate": 9.998552468299421e-05, |
|
"loss": 32.5907, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6470782436447673, |
|
"grad_norm": 13190.787109375, |
|
"learning_rate": 9.998532294871866e-05, |
|
"loss": 32.6583, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6497193793331132, |
|
"grad_norm": 10301.1328125, |
|
"learning_rate": 9.998511981863232e-05, |
|
"loss": 31.7794, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6523605150214592, |
|
"grad_norm": 18970.587890625, |
|
"learning_rate": 9.998491529274089e-05, |
|
"loss": 32.5321, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6550016507098052, |
|
"grad_norm": 10323.8408203125, |
|
"learning_rate": 9.998470937105006e-05, |
|
"loss": 32.6962, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6576427863981512, |
|
"grad_norm": 13553.1123046875, |
|
"learning_rate": 9.998450205356557e-05, |
|
"loss": 34.1782, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6602839220864972, |
|
"grad_norm": 34080.28125, |
|
"learning_rate": 9.998429334029323e-05, |
|
"loss": 37.3095, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6629250577748432, |
|
"grad_norm": 12205.15234375, |
|
"learning_rate": 9.998408323123887e-05, |
|
"loss": 33.7182, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6655661934631891, |
|
"grad_norm": 11019.15234375, |
|
"learning_rate": 9.998387172640834e-05, |
|
"loss": 34.2941, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6682073291515351, |
|
"grad_norm": 10185.3310546875, |
|
"learning_rate": 9.998365882580756e-05, |
|
"loss": 34.5573, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6708484648398811, |
|
"grad_norm": 8710.2685546875, |
|
"learning_rate": 9.998344452944247e-05, |
|
"loss": 33.6592, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6734896005282272, |
|
"grad_norm": 8050.28759765625, |
|
"learning_rate": 9.998322883731903e-05, |
|
"loss": 33.1733, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6761307362165732, |
|
"grad_norm": 6891.90673828125, |
|
"learning_rate": 9.998301174944332e-05, |
|
"loss": 32.2699, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6787718719049192, |
|
"grad_norm": 6904.37060546875, |
|
"learning_rate": 9.998279326582134e-05, |
|
"loss": 33.2969, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6814130075932651, |
|
"grad_norm": 6681.41162109375, |
|
"learning_rate": 9.998257338645924e-05, |
|
"loss": 32.5617, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6840541432816111, |
|
"grad_norm": 7499.51025390625, |
|
"learning_rate": 9.998235211136312e-05, |
|
"loss": 31.2502, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6866952789699571, |
|
"grad_norm": 5850.79931640625, |
|
"learning_rate": 9.99821294405392e-05, |
|
"loss": 31.384, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6893364146583031, |
|
"grad_norm": 5846.03271484375, |
|
"learning_rate": 9.998190537399366e-05, |
|
"loss": 31.2545, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6919775503466491, |
|
"grad_norm": 7224.54833984375, |
|
"learning_rate": 9.998167991173277e-05, |
|
"loss": 31.2568, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6946186860349951, |
|
"grad_norm": 6079.56982421875, |
|
"learning_rate": 9.998145305376286e-05, |
|
"loss": 31.7204, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.697259821723341, |
|
"grad_norm": 7802.859375, |
|
"learning_rate": 9.99812248000902e-05, |
|
"loss": 30.3375, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.699900957411687, |
|
"grad_norm": 7014.5146484375, |
|
"learning_rate": 9.998099515072122e-05, |
|
"loss": 30.6416, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.702542093100033, |
|
"grad_norm": 6766.64208984375, |
|
"learning_rate": 9.998076410566229e-05, |
|
"loss": 30.4145, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.705183228788379, |
|
"grad_norm": 6723.0986328125, |
|
"learning_rate": 9.99805316649199e-05, |
|
"loss": 29.3229, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.707824364476725, |
|
"grad_norm": 8847.9677734375, |
|
"learning_rate": 9.998029782850051e-05, |
|
"loss": 29.2886, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7104655001650709, |
|
"grad_norm": 5896.45458984375, |
|
"learning_rate": 9.998006259641068e-05, |
|
"loss": 29.5852, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.7131066358534169, |
|
"grad_norm": 7112.9150390625, |
|
"learning_rate": 9.997982596865695e-05, |
|
"loss": 29.5084, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.715747771541763, |
|
"grad_norm": 8039.98876953125, |
|
"learning_rate": 9.997958794524594e-05, |
|
"loss": 31.9893, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.718388907230109, |
|
"grad_norm": 179267.265625, |
|
"learning_rate": 9.99793485261843e-05, |
|
"loss": 140.9562, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.721030042918455, |
|
"grad_norm": 578681.125, |
|
"learning_rate": 9.997910771147872e-05, |
|
"loss": 262.198, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.723671178606801, |
|
"grad_norm": 322541.34375, |
|
"learning_rate": 9.99788655011359e-05, |
|
"loss": 237.3132, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7263123142951469, |
|
"grad_norm": 235946.640625, |
|
"learning_rate": 9.997862189516263e-05, |
|
"loss": 300.6354, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7289534499834929, |
|
"grad_norm": 262057.515625, |
|
"learning_rate": 9.99783768935657e-05, |
|
"loss": 209.6862, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.7315945856718389, |
|
"grad_norm": 221274.765625, |
|
"learning_rate": 9.997813049635195e-05, |
|
"loss": 208.7495, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.7342357213601849, |
|
"grad_norm": 363778.46875, |
|
"learning_rate": 9.997788270352827e-05, |
|
"loss": 234.0036, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7368768570485309, |
|
"grad_norm": 198016.546875, |
|
"learning_rate": 9.997763351510157e-05, |
|
"loss": 221.2396, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7395179927368769, |
|
"grad_norm": 383717.4375, |
|
"learning_rate": 9.997738293107881e-05, |
|
"loss": 166.7505, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7421591284252228, |
|
"grad_norm": 471310.09375, |
|
"learning_rate": 9.9977130951467e-05, |
|
"loss": 155.5116, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.7448002641135688, |
|
"grad_norm": 135402.15625, |
|
"learning_rate": 9.997687757627316e-05, |
|
"loss": 71.9904, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7474413998019148, |
|
"grad_norm": 6735.1005859375, |
|
"learning_rate": 9.997662280550437e-05, |
|
"loss": 30.8698, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.7500825354902608, |
|
"grad_norm": 11189.4736328125, |
|
"learning_rate": 9.997636663916776e-05, |
|
"loss": 30.6788, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7527236711786068, |
|
"grad_norm": 9472.00390625, |
|
"learning_rate": 9.997610907727046e-05, |
|
"loss": 32.5548, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7553648068669528, |
|
"grad_norm": 10074.7333984375, |
|
"learning_rate": 9.997585011981966e-05, |
|
"loss": 30.9945, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7580059425552987, |
|
"grad_norm": 11928.4619140625, |
|
"learning_rate": 9.997558976682262e-05, |
|
"loss": 30.6684, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7606470782436447, |
|
"grad_norm": 13231.986328125, |
|
"learning_rate": 9.997532801828658e-05, |
|
"loss": 30.9457, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7632882139319908, |
|
"grad_norm": 8904.8466796875, |
|
"learning_rate": 9.997506487421888e-05, |
|
"loss": 31.3361, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7659293496203368, |
|
"grad_norm": 9125.240234375, |
|
"learning_rate": 9.997480033462683e-05, |
|
"loss": 30.7196, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7685704853086828, |
|
"grad_norm": 9812.6181640625, |
|
"learning_rate": 9.997453439951784e-05, |
|
"loss": 30.7277, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7712116209970287, |
|
"grad_norm": 7082.22607421875, |
|
"learning_rate": 9.997426706889935e-05, |
|
"loss": 31.2053, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7738527566853747, |
|
"grad_norm": 9316.9384765625, |
|
"learning_rate": 9.997399834277878e-05, |
|
"loss": 31.5169, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7764938923737207, |
|
"grad_norm": 19302.771484375, |
|
"learning_rate": 9.997372822116368e-05, |
|
"loss": 31.651, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7791350280620667, |
|
"grad_norm": 10954.8271484375, |
|
"learning_rate": 9.99734567040616e-05, |
|
"loss": 30.4, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7817761637504127, |
|
"grad_norm": 9081.9521484375, |
|
"learning_rate": 9.997318379148007e-05, |
|
"loss": 30.8718, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7844172994387587, |
|
"grad_norm": 6827.958984375, |
|
"learning_rate": 9.997290948342673e-05, |
|
"loss": 31.0843, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7870584351271046, |
|
"grad_norm": 10805.7939453125, |
|
"learning_rate": 9.997263377990926e-05, |
|
"loss": 31.6845, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7896995708154506, |
|
"grad_norm": 11347.0078125, |
|
"learning_rate": 9.997235668093535e-05, |
|
"loss": 33.4166, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7923407065037966, |
|
"grad_norm": 16983.841796875, |
|
"learning_rate": 9.997207818651274e-05, |
|
"loss": 35.7603, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7949818421921426, |
|
"grad_norm": 3815.614990234375, |
|
"learning_rate": 9.997179829664918e-05, |
|
"loss": 33.1237, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7976229778804886, |
|
"grad_norm": 4439.759765625, |
|
"learning_rate": 9.997151701135253e-05, |
|
"loss": 32.6201, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.8002641135688346, |
|
"grad_norm": 6584.0, |
|
"learning_rate": 9.997123433063062e-05, |
|
"loss": 31.9738, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.8029052492571805, |
|
"grad_norm": 8394.333984375, |
|
"learning_rate": 9.997095025449134e-05, |
|
"loss": 34.1952, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.8055463849455266, |
|
"grad_norm": 8264.888671875, |
|
"learning_rate": 9.997066478294262e-05, |
|
"loss": 34.1646, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8081875206338726, |
|
"grad_norm": 6815.27587890625, |
|
"learning_rate": 9.997037791599245e-05, |
|
"loss": 32.8399, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8108286563222186, |
|
"grad_norm": 6638.54296875, |
|
"learning_rate": 9.997008965364884e-05, |
|
"loss": 32.737, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.8134697920105646, |
|
"grad_norm": 6356.19287109375, |
|
"learning_rate": 9.996979999591983e-05, |
|
"loss": 33.2864, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8161109276989106, |
|
"grad_norm": 10876.560546875, |
|
"learning_rate": 9.996950894281349e-05, |
|
"loss": 32.8353, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8187520633872565, |
|
"grad_norm": 18334.380859375, |
|
"learning_rate": 9.996921649433796e-05, |
|
"loss": 33.1125, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8213931990756025, |
|
"grad_norm": 5925.57080078125, |
|
"learning_rate": 9.996892265050144e-05, |
|
"loss": 33.4775, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.8240343347639485, |
|
"grad_norm": 5512.29541015625, |
|
"learning_rate": 9.99686274113121e-05, |
|
"loss": 32.4073, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8266754704522945, |
|
"grad_norm": 6770.63232421875, |
|
"learning_rate": 9.996833077677819e-05, |
|
"loss": 33.0255, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.8293166061406405, |
|
"grad_norm": 9025.830078125, |
|
"learning_rate": 9.9968032746908e-05, |
|
"loss": 31.7732, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.8319577418289865, |
|
"grad_norm": 5815.4296875, |
|
"learning_rate": 9.996773332170983e-05, |
|
"loss": 31.5946, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8345988775173324, |
|
"grad_norm": 7221.68603515625, |
|
"learning_rate": 9.996743250119209e-05, |
|
"loss": 31.5973, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8372400132056784, |
|
"grad_norm": 7172.86962890625, |
|
"learning_rate": 9.996713028536313e-05, |
|
"loss": 31.4948, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.8398811488940244, |
|
"grad_norm": 11000.0458984375, |
|
"learning_rate": 9.99668266742314e-05, |
|
"loss": 31.3127, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8425222845823704, |
|
"grad_norm": 8431.4716796875, |
|
"learning_rate": 9.99665216678054e-05, |
|
"loss": 30.8608, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8451634202707164, |
|
"grad_norm": 7308.78466796875, |
|
"learning_rate": 9.996621526609364e-05, |
|
"loss": 30.8716, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8478045559590623, |
|
"grad_norm": 8358.787109375, |
|
"learning_rate": 9.996590746910467e-05, |
|
"loss": 31.0737, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8504456916474084, |
|
"grad_norm": 29319.46484375, |
|
"learning_rate": 9.996559827684709e-05, |
|
"loss": 46.2402, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8530868273357544, |
|
"grad_norm": 903961.25, |
|
"learning_rate": 9.996528768932951e-05, |
|
"loss": 161.367, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.8557279630241004, |
|
"grad_norm": 436229.9375, |
|
"learning_rate": 9.996497570656062e-05, |
|
"loss": 215.2534, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8583690987124464, |
|
"grad_norm": 310716.5, |
|
"learning_rate": 9.996466232854915e-05, |
|
"loss": 218.9532, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8610102344007924, |
|
"grad_norm": 935038.75, |
|
"learning_rate": 9.996434755530384e-05, |
|
"loss": 204.1668, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8636513700891383, |
|
"grad_norm": 577125.0625, |
|
"learning_rate": 9.996403138683347e-05, |
|
"loss": 225.0228, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8662925057774843, |
|
"grad_norm": 429562.09375, |
|
"learning_rate": 9.996371382314686e-05, |
|
"loss": 221.4529, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8689336414658303, |
|
"grad_norm": 469087.0625, |
|
"learning_rate": 9.996339486425291e-05, |
|
"loss": 161.132, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8715747771541763, |
|
"grad_norm": 946113.1875, |
|
"learning_rate": 9.99630745101605e-05, |
|
"loss": 169.4336, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8742159128425223, |
|
"grad_norm": 537740.1875, |
|
"learning_rate": 9.996275276087859e-05, |
|
"loss": 166.9042, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8768570485308683, |
|
"grad_norm": 377986.5, |
|
"learning_rate": 9.996242961641615e-05, |
|
"loss": 139.4483, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8794981842192142, |
|
"grad_norm": 6949.21044921875, |
|
"learning_rate": 9.996210507678223e-05, |
|
"loss": 32.8323, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8821393199075602, |
|
"grad_norm": 6551.869140625, |
|
"learning_rate": 9.996177914198586e-05, |
|
"loss": 31.1956, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8847804555959062, |
|
"grad_norm": 8210.8974609375, |
|
"learning_rate": 9.996145181203615e-05, |
|
"loss": 30.2494, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8874215912842522, |
|
"grad_norm": 12632.7666015625, |
|
"learning_rate": 9.996112308694225e-05, |
|
"loss": 30.7789, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8900627269725983, |
|
"grad_norm": 11905.80078125, |
|
"learning_rate": 9.996079296671334e-05, |
|
"loss": 30.9992, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8927038626609443, |
|
"grad_norm": 11776.396484375, |
|
"learning_rate": 9.996046145135865e-05, |
|
"loss": 30.6118, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8953449983492902, |
|
"grad_norm": 10494.625, |
|
"learning_rate": 9.99601285408874e-05, |
|
"loss": 30.6983, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8979861340376362, |
|
"grad_norm": 8309.9296875, |
|
"learning_rate": 9.995979423530892e-05, |
|
"loss": 30.6617, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9006272697259822, |
|
"grad_norm": 11482.9853515625, |
|
"learning_rate": 9.995945853463253e-05, |
|
"loss": 30.5696, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.9032684054143282, |
|
"grad_norm": 8950.994140625, |
|
"learning_rate": 9.995912143886763e-05, |
|
"loss": 29.6077, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.9059095411026742, |
|
"grad_norm": 8950.931640625, |
|
"learning_rate": 9.995878294802357e-05, |
|
"loss": 30.4176, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.9085506767910201, |
|
"grad_norm": 6688.57470703125, |
|
"learning_rate": 9.995844306210988e-05, |
|
"loss": 29.8723, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.9111918124793661, |
|
"grad_norm": 7882.67431640625, |
|
"learning_rate": 9.995810178113599e-05, |
|
"loss": 30.049, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9138329481677121, |
|
"grad_norm": 9309.5625, |
|
"learning_rate": 9.995775910511147e-05, |
|
"loss": 30.2998, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.9164740838560581, |
|
"grad_norm": 9403.8974609375, |
|
"learning_rate": 9.995741503404587e-05, |
|
"loss": 30.4171, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.9191152195444041, |
|
"grad_norm": 10254.1376953125, |
|
"learning_rate": 9.995706956794879e-05, |
|
"loss": 32.398, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.9217563552327501, |
|
"grad_norm": 11519.509765625, |
|
"learning_rate": 9.99567227068299e-05, |
|
"loss": 33.4377, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.924397490921096, |
|
"grad_norm": 17227.236328125, |
|
"learning_rate": 9.995637445069887e-05, |
|
"loss": 36.9788, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.927038626609442, |
|
"grad_norm": 8033.53369140625, |
|
"learning_rate": 9.995602479956545e-05, |
|
"loss": 32.1, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.929679762297788, |
|
"grad_norm": 10333.927734375, |
|
"learning_rate": 9.995567375343937e-05, |
|
"loss": 32.4024, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.932320897986134, |
|
"grad_norm": 5577.73486328125, |
|
"learning_rate": 9.995532131233044e-05, |
|
"loss": 33.2651, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.93496203367448, |
|
"grad_norm": 5001.80615234375, |
|
"learning_rate": 9.99549674762485e-05, |
|
"loss": 33.2199, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.9376031693628261, |
|
"grad_norm": 6995.62255859375, |
|
"learning_rate": 9.995461224520345e-05, |
|
"loss": 33.0332, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.940244305051172, |
|
"grad_norm": 5345.10888671875, |
|
"learning_rate": 9.995425561920519e-05, |
|
"loss": 32.4465, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.942885440739518, |
|
"grad_norm": 5311.36376953125, |
|
"learning_rate": 9.99538975982637e-05, |
|
"loss": 33.3183, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.945526576427864, |
|
"grad_norm": 4239.72021484375, |
|
"learning_rate": 9.995353818238895e-05, |
|
"loss": 30.5123, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.94816771211621, |
|
"grad_norm": 6135.8544921875, |
|
"learning_rate": 9.9953177371591e-05, |
|
"loss": 30.1126, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.950808847804556, |
|
"grad_norm": 3885.701904296875, |
|
"learning_rate": 9.995281516587991e-05, |
|
"loss": 30.1448, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.953449983492902, |
|
"grad_norm": 17259.177734375, |
|
"learning_rate": 9.99524515652658e-05, |
|
"loss": 30.9694, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9560911191812479, |
|
"grad_norm": 5949.1728515625, |
|
"learning_rate": 9.995208656975884e-05, |
|
"loss": 30.8493, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9587322548695939, |
|
"grad_norm": 231986.453125, |
|
"learning_rate": 9.995172017936919e-05, |
|
"loss": 141.9035, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9613733905579399, |
|
"grad_norm": 103330.5546875, |
|
"learning_rate": 9.99513523941071e-05, |
|
"loss": 188.4911, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9640145262462859, |
|
"grad_norm": 307991.03125, |
|
"learning_rate": 9.995098321398284e-05, |
|
"loss": 160.2285, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9666556619346319, |
|
"grad_norm": 190517.765625, |
|
"learning_rate": 9.995061263900671e-05, |
|
"loss": 152.4148, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9692967976229779, |
|
"grad_norm": 134986.078125, |
|
"learning_rate": 9.995024066918908e-05, |
|
"loss": 119.8174, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9719379333113238, |
|
"grad_norm": 215117.609375, |
|
"learning_rate": 9.994986730454031e-05, |
|
"loss": 125.8479, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9745790689996698, |
|
"grad_norm": 22283.35546875, |
|
"learning_rate": 9.994949254507084e-05, |
|
"loss": 34.5446, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9772202046880158, |
|
"grad_norm": 12405.2236328125, |
|
"learning_rate": 9.994911639079112e-05, |
|
"loss": 35.1761, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9798613403763619, |
|
"grad_norm": 13683.02734375, |
|
"learning_rate": 9.994873884171167e-05, |
|
"loss": 32.3272, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9825024760647079, |
|
"grad_norm": 8963.904296875, |
|
"learning_rate": 9.994835989784305e-05, |
|
"loss": 31.0019, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9851436117530538, |
|
"grad_norm": 19926.2734375, |
|
"learning_rate": 9.994797955919581e-05, |
|
"loss": 30.4514, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9877847474413998, |
|
"grad_norm": 37221.25, |
|
"learning_rate": 9.994759782578058e-05, |
|
"loss": 32.0492, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9904258831297458, |
|
"grad_norm": 10019.828125, |
|
"learning_rate": 9.994721469760801e-05, |
|
"loss": 31.7783, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9930670188180918, |
|
"grad_norm": 8898.4228515625, |
|
"learning_rate": 9.994683017468883e-05, |
|
"loss": 30.9381, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9957081545064378, |
|
"grad_norm": 13350.8203125, |
|
"learning_rate": 9.994644425703374e-05, |
|
"loss": 32.4939, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9983492901947838, |
|
"grad_norm": 29945.037109375, |
|
"learning_rate": 9.994605694465355e-05, |
|
"loss": 34.0366, |
|
"step": 378 |
|
}, |
|
    {
      "epoch": 1.0009904258831297,
      "grad_norm": 19480.009765625,
      "learning_rate": 9.994566823755907e-05,
      "loss": 37.1069,
      "step": 379
    },
    {
      "epoch": 1.0036315615714757,
      "grad_norm": 4824.83544921875,
      "learning_rate": 9.99452781357611e-05,
      "loss": 35.9486,
      "step": 380
    },
    {
      "epoch": 1.0062726972598217,
      "grad_norm": 4898.34423828125,
      "learning_rate": 9.994488663927062e-05,
      "loss": 34.3521,
      "step": 381
    },
    {
      "epoch": 1.0089138329481677,
      "grad_norm": 7551.79736328125,
      "learning_rate": 9.994449374809851e-05,
      "loss": 36.7028,
      "step": 382
    },
    {
      "epoch": 1.0115549686365137,
      "grad_norm": 8357.705078125,
      "learning_rate": 9.994409946225574e-05,
      "loss": 36.5134,
      "step": 383
    },
    {
      "epoch": 1.0141961043248597,
      "grad_norm": 5780.6787109375,
      "learning_rate": 9.994370378175332e-05,
      "loss": 37.3621,
      "step": 384
    },
    {
      "epoch": 1.0168372400132057,
      "grad_norm": 5624.93896484375,
      "learning_rate": 9.994330670660235e-05,
      "loss": 37.6676,
      "step": 385
    },
    {
      "epoch": 1.0194783757015518,
      "grad_norm": 6545.541015625,
      "learning_rate": 9.994290823681385e-05,
      "loss": 37.2885,
      "step": 386
    },
    {
      "epoch": 1.0221195113898978,
      "grad_norm": 9896.431640625,
      "learning_rate": 9.994250837239897e-05,
      "loss": 37.8031,
      "step": 387
    },
    {
      "epoch": 1.0247606470782435,
      "grad_norm": 6628.89453125,
      "learning_rate": 9.994210711336891e-05,
      "loss": 39.4998,
      "step": 388
    },
    {
      "epoch": 1.0274017827665896,
      "grad_norm": 7230.349609375,
      "learning_rate": 9.994170445973483e-05,
      "loss": 37.6952,
      "step": 389
    },
    {
      "epoch": 1.0300429184549356,
      "grad_norm": 5001.923828125,
      "learning_rate": 9.994130041150798e-05,
      "loss": 37.2387,
      "step": 390
    },
    {
      "epoch": 1.0326840541432816,
      "grad_norm": 8473.236328125,
      "learning_rate": 9.994089496869968e-05,
      "loss": 37.7243,
      "step": 391
    },
    {
      "epoch": 1.0353251898316276,
      "grad_norm": 12679.2109375,
      "learning_rate": 9.994048813132119e-05,
      "loss": 35.9025,
      "step": 392
    },
    {
      "epoch": 1.0379663255199736,
      "grad_norm": 7488.9248046875,
      "learning_rate": 9.994007989938392e-05,
      "loss": 36.2572,
      "step": 393
    },
    {
      "epoch": 1.0406074612083196,
      "grad_norm": 8192.458984375,
      "learning_rate": 9.993967027289927e-05,
      "loss": 38.7854,
      "step": 394
    },
    {
      "epoch": 1.0432485968966656,
      "grad_norm": 6160.6787109375,
      "learning_rate": 9.993925925187865e-05,
      "loss": 35.9352,
      "step": 395
    },
    {
      "epoch": 1.0458897325850116,
      "grad_norm": 6419.31103515625,
      "learning_rate": 9.993884683633354e-05,
      "loss": 37.7825,
      "step": 396
    },
    {
      "epoch": 1.0485308682733576,
      "grad_norm": 8226.6005859375,
      "learning_rate": 9.993843302627549e-05,
      "loss": 35.5052,
      "step": 397
    },
    {
      "epoch": 1.0511720039617036,
      "grad_norm": 8380.81640625,
      "learning_rate": 9.993801782171603e-05,
      "loss": 36.5649,
      "step": 398
    },
    {
      "epoch": 1.0538131396500496,
      "grad_norm": 10895.78515625,
      "learning_rate": 9.993760122266676e-05,
      "loss": 37.1919,
      "step": 399
    },
    {
      "epoch": 1.0564542753383954,
      "grad_norm": 14454.5390625,
      "learning_rate": 9.99371832291393e-05,
      "loss": 38.3564,
      "step": 400
    },
    {
      "epoch": 1.0564542753383954,
      "eval_loss": 8.69857406616211,
      "eval_runtime": 2.1301,
      "eval_samples_per_second": 232.388,
      "eval_steps_per_second": 29.107,
      "step": 400
    }
  ],
  "logging_steps": 1,
  "max_steps": 18900,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1043682507620352.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}