|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.3952569169960474, |
|
"eval_steps": 38, |
|
"global_step": 150, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002635046113306983, |
|
"grad_norm": NaN, |
|
"learning_rate": 4e-05, |
|
"loss": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002635046113306983, |
|
"eval_loss": NaN, |
|
"eval_runtime": 15.6332, |
|
"eval_samples_per_second": 10.235, |
|
"eval_steps_per_second": 5.117, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005270092226613966, |
|
"grad_norm": NaN, |
|
"learning_rate": 8e-05, |
|
"loss": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007905138339920948, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012, |
|
"loss": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010540184453227932, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016, |
|
"loss": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013175230566534914, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015810276679841896, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00024, |
|
"loss": 6.5685, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01844532279314888, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00028, |
|
"loss": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.021080368906455864, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00032, |
|
"loss": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.023715415019762844, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00036, |
|
"loss": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.026350461133069828, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.028985507246376812, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003999496469885013, |
|
"loss": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03162055335968379, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039979861330826294, |
|
"loss": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.034255599472990776, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003995469750092912, |
|
"loss": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03689064558629776, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039919485879904784, |
|
"loss": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.039525691699604744, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039874244197864856, |
|
"loss": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04216073781291173, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039818995235358696, |
|
"loss": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04479578392621871, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039753766811902755, |
|
"loss": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04743083003952569, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003967859177197259, |
|
"loss": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05006587615283267, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039593507968464716, |
|
"loss": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.052700922266139656, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003949855824363647, |
|
"loss": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05533596837944664, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003939379040753374, |
|
"loss": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.057971014492753624, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039279257213917066, |
|
"loss": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06060606060606061, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003915501633369861, |
|
"loss": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06324110671936758, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039021130325903074, |
|
"loss": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06587615283267458, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00038877666606167355, |
|
"loss": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06851119894598155, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00038724697412794747, |
|
"loss": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07114624505928854, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003856229977038078, |
|
"loss": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07378129117259552, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003839055545102902, |
|
"loss": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0764163372859025, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00038209550933176323, |
|
"loss": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07905138339920949, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003801937735804838, |
|
"loss": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08168642951251646, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003782013048376736, |
|
"loss": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08432147562582346, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003761191063713476, |
|
"loss": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08695652173913043, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003739482266311391, |
|
"loss": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08959156785243742, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00037168975872037323, |
|
"loss": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0922266139657444, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00036934483984565685, |
|
"loss": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09486166007905138, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00036691465074426054, |
|
"loss": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09749670619235837, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00036440041508958203, |
|
"loss": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10013175230566534, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003618033988749895, |
|
"loss": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10013175230566534, |
|
"eval_loss": NaN, |
|
"eval_runtime": 14.097, |
|
"eval_samples_per_second": 11.35, |
|
"eval_steps_per_second": 5.675, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10276679841897234, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00035912490977635625, |
|
"loss": 2.2343, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10540184453227931, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.000356366296493606, |
|
"loss": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1080368906455863, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003535289480716022, |
|
"loss": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11067193675889328, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00035061429320072223, |
|
"loss": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11330698287220026, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00034762379949746815, |
|
"loss": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11594202898550725, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003445589727654783, |
|
"loss": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11857707509881422, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003414213562373095, |
|
"loss": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.12121212121212122, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00033821252979737297, |
|
"loss": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12384716732542819, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003349341091864149, |
|
"loss": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12648221343873517, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00033158774518794254, |
|
"loss": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12911725955204217, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003281751227970048, |
|
"loss": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13175230566534915, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00032469796037174674, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13438735177865613, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.000321158008768164, |
|
"loss": 0.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.1370223978919631, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00031755705045849464, |
|
"loss": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13965744400527008, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003138968986336904, |
|
"loss": 0.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.1422924901185771, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003101793962904205, |
|
"loss": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14492753623188406, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00030640641530306733, |
|
"loss": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14756258234519104, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00030257985548118126, |
|
"loss": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15019762845849802, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002987016436128694, |
|
"loss": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.152832674571805, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002947737324945997, |
|
"loss": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.155467720685112, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00029079809994790937, |
|
"loss": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15810276679841898, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00028677674782351165, |
|
"loss": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16073781291172595, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00028271170099330415, |
|
"loss": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.16337285902503293, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00027860500633078477, |
|
"loss": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.16600790513833993, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00027445873168038907, |
|
"loss": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1686429512516469, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002702749648162686, |
|
"loss": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1712779973649539, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00026605581239103347, |
|
"loss": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00026180339887498953, |
|
"loss": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.17654808959156784, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00025751986548640346, |
|
"loss": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17918313570487485, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00025320736911333503, |
|
"loss": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002488680812275788, |
|
"loss": 0.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.1844532279314888, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002445041867912629, |
|
"loss": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.18708827404479578, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00024011788315665458, |
|
"loss": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.18972332015810275, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00023571137895972733, |
|
"loss": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.19235836627140976, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002312868930080462, |
|
"loss": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.19499341238471674, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002268466531635311, |
|
"loss": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1976284584980237, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00022239289522066157, |
|
"loss": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2002635046113307, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00021792786178068672, |
|
"loss": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.2002635046113307, |
|
"eval_loss": NaN, |
|
"eval_runtime": 14.106, |
|
"eval_samples_per_second": 11.343, |
|
"eval_steps_per_second": 5.671, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.2028985507246377, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00021345380112240797, |
|
"loss": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.20553359683794467, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00020897296607010301, |
|
"loss": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.20816864295125165, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00020448761285916104, |
|
"loss": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.21080368906455862, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2134387351778656, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019551238714083903, |
|
"loss": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2160737812911726, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019102703392989709, |
|
"loss": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.21870882740447958, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018654619887759207, |
|
"loss": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.22134387351778656, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018207213821931333, |
|
"loss": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22397891963109354, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017760710477933845, |
|
"loss": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.22661396574440051, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017315334683646897, |
|
"loss": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.22924901185770752, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016871310699195379, |
|
"loss": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2318840579710145, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016428862104027268, |
|
"loss": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.23451910408432147, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015988211684334546, |
|
"loss": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.23715415019762845, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015549581320873715, |
|
"loss": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23978919631093545, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015113191877242117, |
|
"loss": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.24242424242424243, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014679263088666499, |
|
"loss": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2450592885375494, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014248013451359656, |
|
"loss": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.24769433465085638, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013819660112501054, |
|
"loss": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2503293807641634, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013394418760896666, |
|
"loss": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.25296442687747034, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012972503518373144, |
|
"loss": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.25559947299077734, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012554126831961098, |
|
"loss": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.25823451910408435, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001213949936692153, |
|
"loss": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2608695652173913, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011728829900669591, |
|
"loss": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2635046113306983, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011322325217648839, |
|
"loss": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26613965744400525, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010920190005209065, |
|
"loss": 0.0, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.26877470355731226, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010522626750540028, |
|
"loss": 0.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.27140974967061926, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010129835638713063, |
|
"loss": 0.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.2740447957839262, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.74201445188188e-05, |
|
"loss": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2766798418972332, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.359358469693271e-05, |
|
"loss": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.27931488801054016, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.982060370957952e-05, |
|
"loss": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.28194993412384717, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.610310136630962e-05, |
|
"loss": 0.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2845849802371542, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.24429495415054e-05, |
|
"loss": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2872200263504611, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.884199123183605e-05, |
|
"loss": 0.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2898550724637681, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.530203962825331e-05, |
|
"loss": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2924901185770751, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.182487720299517e-05, |
|
"loss": 0.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2951251646903821, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.841225481205749e-05, |
|
"loss": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2977602108036891, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.506589081358514e-05, |
|
"loss": 0.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.30039525691699603, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.178747020262707e-05, |
|
"loss": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.30039525691699603, |
|
"eval_loss": NaN, |
|
"eval_runtime": 14.1098, |
|
"eval_samples_per_second": 11.34, |
|
"eval_steps_per_second": 5.67, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.857864376269051e-05, |
|
"loss": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.30566534914361, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.544102723452171e-05, |
|
"loss": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.308300395256917, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.237620050253189e-05, |
|
"loss": 0.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.310935441370224, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.938570679927783e-05, |
|
"loss": 0.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.31357048748353095, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.647105192839778e-05, |
|
"loss": 0.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.31620553359683795, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.363370350639404e-05, |
|
"loss": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3188405797101449, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.087509022364382e-05, |
|
"loss": 0.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.3214756258234519, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.819660112501053e-05, |
|
"loss": 0.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3241106719367589, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5599584910418035e-05, |
|
"loss": 0.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.32674571805006586, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.3085349255739474e-05, |
|
"loss": 0.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.32938076416337286, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0655160154343174e-05, |
|
"loss": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.33201581027667987, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.831024127962678e-05, |
|
"loss": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3346508563899868, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6051773368860934e-05, |
|
"loss": 0.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3372859025032938, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.38808936286524e-05, |
|
"loss": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.33992094861660077, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1798695162326442e-05, |
|
"loss": 0.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3425559947299078, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9806226419516192e-05, |
|
"loss": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3451910408432148, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.790449066823683e-05, |
|
"loss": 0.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6094445489709885e-05, |
|
"loss": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.35046113306982873, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4377002296192233e-05, |
|
"loss": 0.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3530961791831357, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.275302587205256e-05, |
|
"loss": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3557312252964427, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1223333938326485e-05, |
|
"loss": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3583662714097497, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.788696740969295e-06, |
|
"loss": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.36100131752305664, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.44983666301391e-06, |
|
"loss": 0.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.2074278608293525e-06, |
|
"loss": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3662714097496706, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.062095924662625e-06, |
|
"loss": 0.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3689064558629776, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.0144175636352765e-06, |
|
"loss": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3715415019762846, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.064920315352904e-06, |
|
"loss": 0.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.37417654808959155, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.2140822802740668e-06, |
|
"loss": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.37681159420289856, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.462331880972468e-06, |
|
"loss": 0.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3794466403162055, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.81004764641306e-06, |
|
"loss": 0.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3820816864295125, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2575580213514792e-06, |
|
"loss": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.3847167325428195, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.051412009521864e-07, |
|
"loss": 0.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.38735177865612647, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.530249907087836e-07, |
|
"loss": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.38998682476943347, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0138669173708213e-07, |
|
"loss": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3926218708827404, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.035301149869387e-08, |
|
"loss": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3952569169960474, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 150 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 150, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 38, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8626338709635072.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|