{
  "best_metric": 3.130511999130249,
  "best_model_checkpoint": "miner_id_24/checkpoint-500",
  "epoch": 0.10231749117511639,
  "eval_steps": 100,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00020463498235023277,
      "grad_norm": 17.940895080566406,
      "learning_rate": 5e-06,
      "loss": 14.0433,
      "step": 1
    },
    {
      "epoch": 0.00020463498235023277,
      "eval_loss": 3.9562315940856934,
      "eval_runtime": 47.1989,
      "eval_samples_per_second": 174.368,
      "eval_steps_per_second": 43.603,
      "step": 1
    },
    {
      "epoch": 0.00040926996470046554,
      "grad_norm": 22.070707321166992,
      "learning_rate": 1e-05,
      "loss": 14.7688,
      "step": 2
    },
    {
      "epoch": 0.0006139049470506984,
      "grad_norm": 20.095142364501953,
      "learning_rate": 1.5e-05,
      "loss": 14.6774,
      "step": 3
    },
    {
      "epoch": 0.0008185399294009311,
      "grad_norm": 17.93838119506836,
      "learning_rate": 2e-05,
      "loss": 14.672,
      "step": 4
    },
    {
      "epoch": 0.001023174911751164,
      "grad_norm": 13.304869651794434,
      "learning_rate": 2.5e-05,
      "loss": 14.7464,
      "step": 5
    },
    {
      "epoch": 0.0012278098941013967,
      "grad_norm": 10.018509864807129,
      "learning_rate": 3e-05,
      "loss": 14.7743,
      "step": 6
    },
    {
      "epoch": 0.0014324448764516293,
      "grad_norm": 8.531241416931152,
      "learning_rate": 3.5e-05,
      "loss": 14.7664,
      "step": 7
    },
    {
      "epoch": 0.0016370798588018621,
      "grad_norm": 8.279265403747559,
      "learning_rate": 4e-05,
      "loss": 14.8116,
      "step": 8
    },
    {
      "epoch": 0.001841714841152095,
      "grad_norm": 7.955838680267334,
      "learning_rate": 4.5e-05,
      "loss": 14.0356,
      "step": 9
    },
    {
      "epoch": 0.002046349823502328,
      "grad_norm": 8.396589279174805,
      "learning_rate": 5e-05,
      "loss": 14.4371,
      "step": 10
    },
    {
      "epoch": 0.0022509848058525606,
      "grad_norm": 7.477896213531494,
      "learning_rate": 5.500000000000001e-05,
      "loss": 14.3411,
      "step": 11
    },
    {
      "epoch": 0.0024556197882027934,
      "grad_norm": 8.143397331237793,
      "learning_rate": 6e-05,
      "loss": 14.5638,
      "step": 12
    },
    {
      "epoch": 0.0026602547705530263,
      "grad_norm": 7.183845043182373,
      "learning_rate": 6.500000000000001e-05,
      "loss": 14.4296,
      "step": 13
    },
    {
      "epoch": 0.0028648897529032586,
      "grad_norm": 7.372915267944336,
      "learning_rate": 7e-05,
      "loss": 14.7023,
      "step": 14
    },
    {
      "epoch": 0.0030695247352534915,
      "grad_norm": 8.364891052246094,
      "learning_rate": 7.500000000000001e-05,
      "loss": 14.6036,
      "step": 15
    },
    {
      "epoch": 0.0032741597176037243,
      "grad_norm": 8.31488037109375,
      "learning_rate": 8e-05,
      "loss": 14.7258,
      "step": 16
    },
    {
      "epoch": 0.003478794699953957,
      "grad_norm": 7.918307304382324,
      "learning_rate": 8.5e-05,
      "loss": 14.2108,
      "step": 17
    },
    {
      "epoch": 0.00368342968230419,
      "grad_norm": 7.043214797973633,
      "learning_rate": 9e-05,
      "loss": 14.4846,
      "step": 18
    },
    {
      "epoch": 0.0038880646646544227,
      "grad_norm": 7.622882843017578,
      "learning_rate": 9.5e-05,
      "loss": 14.7337,
      "step": 19
    },
    {
      "epoch": 0.004092699647004656,
      "grad_norm": 7.523791790008545,
      "learning_rate": 0.0001,
      "loss": 13.6743,
      "step": 20
    },
    {
      "epoch": 0.004297334629354888,
      "grad_norm": 7.598966598510742,
      "learning_rate": 9.999892908320647e-05,
      "loss": 13.6939,
      "step": 21
    },
    {
      "epoch": 0.004501969611705121,
      "grad_norm": 7.914957046508789,
      "learning_rate": 9.999571637870036e-05,
      "loss": 14.0858,
      "step": 22
    },
    {
      "epoch": 0.004706604594055354,
      "grad_norm": 8.37746810913086,
      "learning_rate": 9.999036202410325e-05,
      "loss": 14.1071,
      "step": 23
    },
    {
      "epoch": 0.004911239576405587,
      "grad_norm": 8.19976806640625,
      "learning_rate": 9.998286624877786e-05,
      "loss": 14.4376,
      "step": 24
    },
    {
      "epoch": 0.00511587455875582,
      "grad_norm": 8.429183959960938,
      "learning_rate": 9.997322937381829e-05,
      "loss": 13.8618,
      "step": 25
    },
    {
      "epoch": 0.0053205095411060525,
      "grad_norm": 8.871038436889648,
      "learning_rate": 9.996145181203615e-05,
      "loss": 14.0325,
      "step": 26
    },
    {
      "epoch": 0.0055251445234562845,
      "grad_norm": 9.305150985717773,
      "learning_rate": 9.994753406794301e-05,
      "loss": 14.309,
      "step": 27
    },
    {
      "epoch": 0.005729779505806517,
      "grad_norm": 8.649129867553711,
      "learning_rate": 9.99314767377287e-05,
      "loss": 14.174,
      "step": 28
    },
    {
      "epoch": 0.00593441448815675,
      "grad_norm": 9.096344947814941,
      "learning_rate": 9.991328050923581e-05,
      "loss": 14.1257,
      "step": 29
    },
    {
      "epoch": 0.006139049470506983,
      "grad_norm": 9.188639640808105,
      "learning_rate": 9.989294616193017e-05,
      "loss": 13.7195,
      "step": 30
    },
    {
      "epoch": 0.006343684452857216,
      "grad_norm": 9.031623840332031,
      "learning_rate": 9.98704745668676e-05,
      "loss": 13.2224,
      "step": 31
    },
    {
      "epoch": 0.0065483194352074486,
      "grad_norm": 10.63833999633789,
      "learning_rate": 9.98458666866564e-05,
      "loss": 14.6318,
      "step": 32
    },
    {
      "epoch": 0.006752954417557681,
      "grad_norm": 9.114713668823242,
      "learning_rate": 9.981912357541627e-05,
      "loss": 12.9636,
      "step": 33
    },
    {
      "epoch": 0.006957589399907914,
      "grad_norm": 10.119890213012695,
      "learning_rate": 9.97902463787331e-05,
      "loss": 14.3062,
      "step": 34
    },
    {
      "epoch": 0.007162224382258147,
      "grad_norm": 9.861045837402344,
      "learning_rate": 9.975923633360985e-05,
      "loss": 14.4585,
      "step": 35
    },
    {
      "epoch": 0.00736685936460838,
      "grad_norm": 9.876028060913086,
      "learning_rate": 9.972609476841367e-05,
      "loss": 13.4113,
      "step": 36
    },
    {
      "epoch": 0.007571494346958613,
      "grad_norm": 9.652708053588867,
      "learning_rate": 9.969082310281891e-05,
      "loss": 13.4341,
      "step": 37
    },
    {
      "epoch": 0.0077761293293088455,
      "grad_norm": 10.080284118652344,
      "learning_rate": 9.965342284774632e-05,
      "loss": 13.4885,
      "step": 38
    },
    {
      "epoch": 0.007980764311659078,
      "grad_norm": 10.708706855773926,
      "learning_rate": 9.961389560529836e-05,
      "loss": 13.8326,
      "step": 39
    },
    {
      "epoch": 0.008185399294009311,
      "grad_norm": 10.825308799743652,
      "learning_rate": 9.957224306869053e-05,
      "loss": 14.1761,
      "step": 40
    },
    {
      "epoch": 0.008390034276359544,
      "grad_norm": 11.960084915161133,
      "learning_rate": 9.952846702217886e-05,
      "loss": 13.7566,
      "step": 41
    },
    {
      "epoch": 0.008594669258709777,
      "grad_norm": 13.216733932495117,
      "learning_rate": 9.948256934098352e-05,
      "loss": 13.3996,
      "step": 42
    },
    {
      "epoch": 0.00879930424106001,
      "grad_norm": 11.897379875183105,
      "learning_rate": 9.943455199120837e-05,
      "loss": 13.2895,
      "step": 43
    },
    {
      "epoch": 0.009003939223410242,
      "grad_norm": 13.036720275878906,
      "learning_rate": 9.938441702975689e-05,
      "loss": 13.2497,
      "step": 44
    },
    {
      "epoch": 0.009208574205760475,
      "grad_norm": 12.781022071838379,
      "learning_rate": 9.933216660424395e-05,
      "loss": 13.1251,
      "step": 45
    },
    {
      "epoch": 0.009413209188110708,
      "grad_norm": 13.262389183044434,
      "learning_rate": 9.927780295290389e-05,
      "loss": 13.5698,
      "step": 46
    },
    {
      "epoch": 0.009617844170460941,
      "grad_norm": 15.804819107055664,
      "learning_rate": 9.922132840449459e-05,
      "loss": 14.0475,
      "step": 47
    },
    {
      "epoch": 0.009822479152811174,
      "grad_norm": 17.065067291259766,
      "learning_rate": 9.916274537819775e-05,
      "loss": 13.7782,
      "step": 48
    },
    {
      "epoch": 0.010027114135161407,
      "grad_norm": 19.974628448486328,
      "learning_rate": 9.91020563835152e-05,
      "loss": 13.1872,
      "step": 49
    },
    {
      "epoch": 0.01023174911751164,
      "grad_norm": 33.5842399597168,
      "learning_rate": 9.903926402016153e-05,
      "loss": 14.529,
      "step": 50
    },
    {
      "epoch": 0.010436384099861872,
      "grad_norm": 27.80715560913086,
      "learning_rate": 9.897437097795257e-05,
      "loss": 15.1481,
      "step": 51
    },
    {
      "epoch": 0.010641019082212105,
      "grad_norm": 25.08864974975586,
      "learning_rate": 9.890738003669029e-05,
      "loss": 15.3311,
      "step": 52
    },
    {
      "epoch": 0.010845654064562336,
      "grad_norm": 20.487937927246094,
      "learning_rate": 9.883829406604363e-05,
      "loss": 14.983,
      "step": 53
    },
    {
      "epoch": 0.011050289046912569,
      "grad_norm": 18.901628494262695,
      "learning_rate": 9.876711602542563e-05,
      "loss": 15.1327,
      "step": 54
    },
    {
      "epoch": 0.011254924029262802,
      "grad_norm": 14.395322799682617,
      "learning_rate": 9.869384896386668e-05,
      "loss": 14.1538,
      "step": 55
    },
    {
      "epoch": 0.011459559011613035,
      "grad_norm": 10.616168022155762,
      "learning_rate": 9.861849601988383e-05,
      "loss": 14.1126,
      "step": 56
    },
    {
      "epoch": 0.011664193993963267,
      "grad_norm": 8.293721199035645,
      "learning_rate": 9.854106042134641e-05,
      "loss": 13.6647,
      "step": 57
    },
    {
      "epoch": 0.0118688289763135,
      "grad_norm": 7.054158687591553,
      "learning_rate": 9.846154548533773e-05,
      "loss": 13.3569,
      "step": 58
    },
    {
      "epoch": 0.012073463958663733,
      "grad_norm": 7.207637310028076,
      "learning_rate": 9.837995461801299e-05,
      "loss": 13.5518,
      "step": 59
    },
    {
      "epoch": 0.012278098941013966,
      "grad_norm": 7.297451496124268,
      "learning_rate": 9.829629131445342e-05,
      "loss": 13.9634,
      "step": 60
    },
    {
      "epoch": 0.012482733923364199,
      "grad_norm": 7.142579555511475,
      "learning_rate": 9.821055915851647e-05,
      "loss": 13.9129,
      "step": 61
    },
    {
      "epoch": 0.012687368905714431,
      "grad_norm": 6.920661926269531,
      "learning_rate": 9.812276182268236e-05,
      "loss": 13.7893,
      "step": 62
    },
    {
      "epoch": 0.012892003888064664,
      "grad_norm": 7.224207401275635,
      "learning_rate": 9.803290306789676e-05,
      "loss": 13.4208,
      "step": 63
    },
    {
      "epoch": 0.013096638870414897,
      "grad_norm": 6.482351779937744,
      "learning_rate": 9.794098674340965e-05,
      "loss": 14.011,
      "step": 64
    },
    {
      "epoch": 0.01330127385276513,
      "grad_norm": 5.605070114135742,
      "learning_rate": 9.784701678661045e-05,
      "loss": 13.362,
      "step": 65
    },
    {
      "epoch": 0.013505908835115363,
      "grad_norm": 6.630495548248291,
      "learning_rate": 9.775099722285935e-05,
      "loss": 13.4117,
      "step": 66
    },
    {
      "epoch": 0.013710543817465596,
      "grad_norm": 6.491345405578613,
      "learning_rate": 9.765293216531486e-05,
      "loss": 13.6704,
      "step": 67
    },
    {
      "epoch": 0.013915178799815828,
      "grad_norm": 6.191098690032959,
      "learning_rate": 9.755282581475769e-05,
      "loss": 13.405,
      "step": 68
    },
    {
      "epoch": 0.014119813782166061,
      "grad_norm": 5.732306480407715,
      "learning_rate": 9.74506824594107e-05,
      "loss": 13.2353,
      "step": 69
    },
    {
      "epoch": 0.014324448764516294,
      "grad_norm": 5.888903617858887,
      "learning_rate": 9.73465064747553e-05,
      "loss": 13.3435,
      "step": 70
    },
    {
      "epoch": 0.014529083746866527,
      "grad_norm": 5.951428413391113,
      "learning_rate": 9.724030232334391e-05,
      "loss": 13.6627,
      "step": 71
    },
    {
      "epoch": 0.01473371872921676,
      "grad_norm": 6.11599063873291,
      "learning_rate": 9.713207455460894e-05,
      "loss": 13.5377,
      "step": 72
    },
    {
      "epoch": 0.014938353711566993,
      "grad_norm": 6.475672245025635,
      "learning_rate": 9.702182780466775e-05,
      "loss": 13.5349,
      "step": 73
    },
    {
      "epoch": 0.015142988693917225,
      "grad_norm": 6.255554676055908,
      "learning_rate": 9.690956679612421e-05,
      "loss": 13.427,
      "step": 74
    },
    {
      "epoch": 0.015347623676267458,
      "grad_norm": 6.450949668884277,
      "learning_rate": 9.67952963378663e-05,
      "loss": 13.1908,
      "step": 75
    },
    {
      "epoch": 0.015552258658617691,
      "grad_norm": 6.359993934631348,
      "learning_rate": 9.667902132486009e-05,
      "loss": 13.6013,
      "step": 76
    },
    {
      "epoch": 0.015756893640967922,
      "grad_norm": 6.952643394470215,
      "learning_rate": 9.656074673794018e-05,
      "loss": 12.9901,
      "step": 77
    },
    {
      "epoch": 0.015961528623318157,
      "grad_norm": 6.661192417144775,
      "learning_rate": 9.644047764359622e-05,
      "loss": 13.4725,
      "step": 78
    },
    {
      "epoch": 0.016166163605668388,
      "grad_norm": 6.839961051940918,
      "learning_rate": 9.631821919375591e-05,
      "loss": 13.0454,
      "step": 79
    },
    {
      "epoch": 0.016370798588018622,
      "grad_norm": 7.225835800170898,
      "learning_rate": 9.619397662556435e-05,
      "loss": 13.5887,
      "step": 80
    },
    {
      "epoch": 0.016575433570368853,
      "grad_norm": 7.208718776702881,
      "learning_rate": 9.606775526115963e-05,
      "loss": 12.9486,
      "step": 81
    },
    {
      "epoch": 0.016780068552719088,
      "grad_norm": 7.323213577270508,
      "learning_rate": 9.593956050744492e-05,
      "loss": 13.15,
      "step": 82
    },
    {
      "epoch": 0.01698470353506932,
      "grad_norm": 7.82949161529541,
      "learning_rate": 9.580939785585681e-05,
      "loss": 13.2066,
      "step": 83
    },
    {
      "epoch": 0.017189338517419554,
      "grad_norm": 7.900018692016602,
      "learning_rate": 9.567727288213005e-05,
      "loss": 13.556,
      "step": 84
    },
    {
      "epoch": 0.017393973499769785,
      "grad_norm": 8.467442512512207,
      "learning_rate": 9.554319124605879e-05,
      "loss": 13.7811,
      "step": 85
    },
    {
      "epoch": 0.01759860848212002,
      "grad_norm": 8.75790786743164,
      "learning_rate": 9.540715869125407e-05,
      "loss": 13.5011,
      "step": 86
    },
    {
      "epoch": 0.01780324346447025,
      "grad_norm": 8.657136917114258,
      "learning_rate": 9.526918104489777e-05,
      "loss": 12.9077,
      "step": 87
    },
    {
      "epoch": 0.018007878446820485,
      "grad_norm": 8.441506385803223,
      "learning_rate": 9.512926421749304e-05,
      "loss": 13.4052,
      "step": 88
    },
    {
      "epoch": 0.018212513429170716,
      "grad_norm": 8.36744499206543,
      "learning_rate": 9.498741420261108e-05,
      "loss": 12.8423,
      "step": 89
    },
    {
      "epoch": 0.01841714841152095,
      "grad_norm": 9.058586120605469,
      "learning_rate": 9.484363707663442e-05,
      "loss": 13.437,
      "step": 90
    },
    {
      "epoch": 0.01862178339387118,
      "grad_norm": 9.995050430297852,
      "learning_rate": 9.469793899849661e-05,
      "loss": 13.485,
      "step": 91
    },
    {
      "epoch": 0.018826418376221416,
      "grad_norm": 9.687874794006348,
      "learning_rate": 9.45503262094184e-05,
      "loss": 12.3617,
      "step": 92
    },
    {
      "epoch": 0.019031053358571647,
      "grad_norm": 9.374463081359863,
      "learning_rate": 9.440080503264037e-05,
      "loss": 12.6718,
      "step": 93
    },
    {
      "epoch": 0.019235688340921882,
      "grad_norm": 10.396638870239258,
      "learning_rate": 9.42493818731521e-05,
      "loss": 12.5755,
      "step": 94
    },
    {
      "epoch": 0.019440323323272113,
      "grad_norm": 11.383647918701172,
      "learning_rate": 9.409606321741775e-05,
      "loss": 12.4185,
      "step": 95
    },
    {
      "epoch": 0.019644958305622347,
      "grad_norm": 10.607769012451172,
      "learning_rate": 9.394085563309827e-05,
      "loss": 12.5911,
      "step": 96
    },
    {
      "epoch": 0.01984959328797258,
      "grad_norm": 13.467524528503418,
      "learning_rate": 9.378376576876999e-05,
      "loss": 14.1048,
      "step": 97
    },
    {
      "epoch": 0.020054228270322813,
      "grad_norm": 14.150450706481934,
      "learning_rate": 9.362480035363986e-05,
      "loss": 12.6339,
      "step": 98
    },
    {
      "epoch": 0.020258863252673044,
      "grad_norm": 15.242779731750488,
      "learning_rate": 9.34639661972572e-05,
      "loss": 13.5117,
      "step": 99
    },
    {
      "epoch": 0.02046349823502328,
      "grad_norm": 21.984615325927734,
      "learning_rate": 9.330127018922194e-05,
      "loss": 13.9761,
      "step": 100
    },
    {
      "epoch": 0.02046349823502328,
      "eval_loss": 3.370993137359619,
      "eval_runtime": 47.2126,
      "eval_samples_per_second": 174.318,
      "eval_steps_per_second": 43.59,
      "step": 100
    },
    {
      "epoch": 0.02066813321737351,
      "grad_norm": 11.214513778686523,
      "learning_rate": 9.31367192988896e-05,
      "loss": 14.0912,
      "step": 101
    },
    {
      "epoch": 0.020872768199723744,
      "grad_norm": 10.00228214263916,
      "learning_rate": 9.297032057507264e-05,
      "loss": 13.9889,
      "step": 102
    },
    {
      "epoch": 0.021077403182073975,
      "grad_norm": 10.31716251373291,
      "learning_rate": 9.280208114573859e-05,
      "loss": 14.1193,
      "step": 103
    },
    {
      "epoch": 0.02128203816442421,
      "grad_norm": 9.314844131469727,
      "learning_rate": 9.263200821770461e-05,
      "loss": 13.7396,
      "step": 104
    },
    {
      "epoch": 0.02148667314677444,
      "grad_norm": 7.567698001861572,
      "learning_rate": 9.246010907632895e-05,
      "loss": 13.7879,
      "step": 105
    },
    {
      "epoch": 0.021691308129124672,
      "grad_norm": 6.531108856201172,
      "learning_rate": 9.228639108519868e-05,
      "loss": 13.5185,
      "step": 106
    },
    {
      "epoch": 0.021895943111474907,
      "grad_norm": 5.202017307281494,
      "learning_rate": 9.211086168581433e-05,
      "loss": 13.3491,
      "step": 107
    },
    {
      "epoch": 0.022100578093825138,
      "grad_norm": 4.530038356781006,
      "learning_rate": 9.193352839727121e-05,
      "loss": 13.2004,
      "step": 108
    },
    {
      "epoch": 0.022305213076175372,
      "grad_norm": 4.831387996673584,
      "learning_rate": 9.175439881593716e-05,
      "loss": 13.4205,
      "step": 109
    },
    {
      "epoch": 0.022509848058525603,
      "grad_norm": 4.692884922027588,
      "learning_rate": 9.157348061512727e-05,
      "loss": 13.4912,
      "step": 110
    },
    {
      "epoch": 0.022714483040875838,
      "grad_norm": 5.204988479614258,
      "learning_rate": 9.139078154477512e-05,
      "loss": 13.1214,
      "step": 111
    },
    {
      "epoch": 0.02291911802322607,
      "grad_norm": 4.781569004058838,
      "learning_rate": 9.120630943110077e-05,
      "loss": 12.6118,
      "step": 112
    },
    {
      "epoch": 0.023123753005576304,
      "grad_norm": 4.754026412963867,
      "learning_rate": 9.102007217627568e-05,
      "loss": 13.186,
      "step": 113
    },
    {
      "epoch": 0.023328387987926535,
      "grad_norm": 5.035665035247803,
      "learning_rate": 9.083207775808396e-05,
      "loss": 12.7322,
      "step": 114
    },
    {
      "epoch": 0.02353302297027677,
      "grad_norm": 5.12575626373291,
      "learning_rate": 9.064233422958077e-05,
      "loss": 13.0182,
      "step": 115
    },
    {
      "epoch": 0.023737657952627,
      "grad_norm": 5.39860200881958,
      "learning_rate": 9.045084971874738e-05,
      "loss": 13.4676,
      "step": 116
    },
    {
      "epoch": 0.023942292934977235,
      "grad_norm": 5.005839824676514,
      "learning_rate": 9.025763242814291e-05,
      "loss": 13.0532,
      "step": 117
    },
    {
      "epoch": 0.024146927917327466,
      "grad_norm": 5.046457290649414,
      "learning_rate": 9.006269063455304e-05,
      "loss": 13.336,
      "step": 118
    },
    {
      "epoch": 0.0243515628996777,
      "grad_norm": 4.951815128326416,
      "learning_rate": 8.986603268863536e-05,
      "loss": 13.1308,
      "step": 119
    },
    {
      "epoch": 0.02455619788202793,
      "grad_norm": 5.16800594329834,
      "learning_rate": 8.966766701456177e-05,
      "loss": 12.7553,
      "step": 120
    },
    {
      "epoch": 0.024760832864378166,
      "grad_norm": 5.190509796142578,
      "learning_rate": 8.94676021096575e-05,
      "loss": 13.2239,
      "step": 121
    },
    {
      "epoch": 0.024965467846728397,
      "grad_norm": 5.662418365478516,
      "learning_rate": 8.926584654403724e-05,
      "loss": 13.2593,
      "step": 122
    },
    {
      "epoch": 0.025170102829078632,
      "grad_norm": 5.604646682739258,
      "learning_rate": 8.906240896023794e-05,
      "loss": 13.2693,
      "step": 123
    },
    {
      "epoch": 0.025374737811428863,
      "grad_norm": 5.807793140411377,
      "learning_rate": 8.885729807284856e-05,
      "loss": 13.465,
      "step": 124
    },
    {
      "epoch": 0.025579372793779098,
      "grad_norm": 6.032169818878174,
      "learning_rate": 8.865052266813685e-05,
      "loss": 13.0197,
      "step": 125
    },
    {
      "epoch": 0.02578400777612933,
      "grad_norm": 5.969254970550537,
      "learning_rate": 8.844209160367299e-05,
      "loss": 12.91,
      "step": 126
    },
    {
      "epoch": 0.025988642758479563,
      "grad_norm": 5.627323627471924,
      "learning_rate": 8.823201380795001e-05,
      "loss": 12.9693,
      "step": 127
    },
    {
      "epoch": 0.026193277740829794,
      "grad_norm": 5.775904655456543,
      "learning_rate": 8.802029828000156e-05,
      "loss": 13.3716,
      "step": 128
    },
    {
      "epoch": 0.02639791272318003,
      "grad_norm": 6.050631999969482,
      "learning_rate": 8.780695408901613e-05,
      "loss": 12.9946,
      "step": 129
    },
    {
      "epoch": 0.02660254770553026,
      "grad_norm": 6.608086109161377,
      "learning_rate": 8.759199037394887e-05,
      "loss": 12.7268,
      "step": 130
    },
    {
      "epoch": 0.026807182687880494,
      "grad_norm": 6.4099202156066895,
      "learning_rate": 8.737541634312985e-05,
      "loss": 13.3797,
      "step": 131
    },
    {
      "epoch": 0.027011817670230726,
      "grad_norm": 6.958422660827637,
      "learning_rate": 8.715724127386972e-05,
      "loss": 13.2627,
      "step": 132
    },
    {
      "epoch": 0.02721645265258096,
      "grad_norm": 6.657001495361328,
      "learning_rate": 8.693747451206232e-05,
      "loss": 13.1662,
      "step": 133
    },
    {
      "epoch": 0.02742108763493119,
      "grad_norm": 6.775047302246094,
      "learning_rate": 8.671612547178428e-05,
      "loss": 12.8757,
      "step": 134
    },
    {
      "epoch": 0.027625722617281426,
      "grad_norm": 6.7623419761657715,
      "learning_rate": 8.649320363489179e-05,
      "loss": 12.5799,
      "step": 135
    },
    {
      "epoch": 0.027830357599631657,
      "grad_norm": 7.408362865447998,
      "learning_rate": 8.626871855061438e-05,
      "loss": 13.8727,
      "step": 136
    },
    {
      "epoch": 0.02803499258198189,
      "grad_norm": 6.984137535095215,
      "learning_rate": 8.604267983514594e-05,
      "loss": 12.6957,
      "step": 137
    },
    {
      "epoch": 0.028239627564332122,
      "grad_norm": 7.494143486022949,
      "learning_rate": 8.581509717123273e-05,
      "loss": 13.5292,
      "step": 138
    },
    {
      "epoch": 0.028444262546682357,
      "grad_norm": 7.043254375457764,
      "learning_rate": 8.558598030775857e-05,
      "loss": 12.5103,
      "step": 139
    },
    {
      "epoch": 0.028648897529032588,
      "grad_norm": 7.2675957679748535,
      "learning_rate": 8.535533905932738e-05,
      "loss": 12.8951,
      "step": 140
    },
    {
      "epoch": 0.02885353251138282,
      "grad_norm": 7.874957084655762,
      "learning_rate": 8.51231833058426e-05,
      "loss": 12.9737,
      "step": 141
    },
    {
      "epoch": 0.029058167493733054,
      "grad_norm": 8.002019882202148,
      "learning_rate": 8.488952299208401e-05,
      "loss": 12.8148,
      "step": 142
    },
    {
      "epoch": 0.029262802476083285,
      "grad_norm": 8.36933422088623,
      "learning_rate": 8.46543681272818e-05,
      "loss": 12.4946,
      "step": 143
    },
    {
      "epoch": 0.02946743745843352,
      "grad_norm": 9.498835563659668,
      "learning_rate": 8.44177287846877e-05,
      "loss": 13.271,
      "step": 144
    },
    {
      "epoch": 0.02967207244078375,
      "grad_norm": 8.976995468139648,
      "learning_rate": 8.417961510114356e-05,
      "loss": 12.5241,
      "step": 145
    },
    {
      "epoch": 0.029876707423133985,
      "grad_norm": 9.178775787353516,
      "learning_rate": 8.39400372766471e-05,
      "loss": 12.4166,
      "step": 146
    },
    {
      "epoch": 0.030081342405484216,
      "grad_norm": 10.875651359558105,
      "learning_rate": 8.36990055739149e-05,
      "loss": 12.7323,
      "step": 147
    },
    {
      "epoch": 0.03028597738783445,
      "grad_norm": 11.843050003051758,
      "learning_rate": 8.345653031794292e-05,
      "loss": 12.6294,
      "step": 148
    },
    {
      "epoch": 0.030490612370184682,
      "grad_norm": 12.797874450683594,
      "learning_rate": 8.321262189556409e-05,
      "loss": 11.9468,
      "step": 149
    },
    {
      "epoch": 0.030695247352534916,
      "grad_norm": 21.556180953979492,
      "learning_rate": 8.296729075500344e-05,
      "loss": 14.375,
      "step": 150
    },
    {
      "epoch": 0.030899882334885147,
      "grad_norm": 5.878223419189453,
      "learning_rate": 8.272054740543052e-05,
      "loss": 13.2625,
      "step": 151
    },
    {
      "epoch": 0.031104517317235382,
      "grad_norm": 6.683862209320068,
      "learning_rate": 8.247240241650918e-05,
      "loss": 13.4469,
      "step": 152
    },
    {
      "epoch": 0.03130915229958561,
      "grad_norm": 6.695138931274414,
      "learning_rate": 8.222286641794488e-05,
      "loss": 13.8935,
      "step": 153
    },
    {
      "epoch": 0.031513787281935844,
      "grad_norm": 6.529450416564941,
      "learning_rate": 8.197195009902924e-05,
      "loss": 13.2827,
      "step": 154
    },
    {
      "epoch": 0.03171842226428608,
      "grad_norm": 5.889492034912109,
      "learning_rate": 8.171966420818228e-05,
      "loss": 13.2815,
      "step": 155
    },
    {
      "epoch": 0.03192305724663631,
      "grad_norm": 5.005529403686523,
      "learning_rate": 8.146601955249188e-05,
      "loss": 13.1348,
      "step": 156
    },
    {
      "epoch": 0.032127692228986544,
      "grad_norm": 4.527781009674072,
      "learning_rate": 8.121102699725089e-05,
      "loss": 12.9616,
      "step": 157
    },
    {
      "epoch": 0.032332327211336775,
      "grad_norm": 3.992450714111328,
      "learning_rate": 8.095469746549172e-05,
      "loss": 13.2171,
      "step": 158
    },
    {
      "epoch": 0.032536962193687013,
      "grad_norm": 3.9536304473876953,
      "learning_rate": 8.069704193751832e-05,
      "loss": 13.5083,
      "step": 159
    },
    {
      "epoch": 0.032741597176037245,
      "grad_norm": 4.0044264793396,
      "learning_rate": 8.043807145043604e-05,
      "loss": 13.5044,
      "step": 160
    },
    {
      "epoch": 0.032946232158387476,
      "grad_norm": 4.166686058044434,
      "learning_rate": 8.017779709767858e-05,
      "loss": 12.9416,
      "step": 161
    },
    {
      "epoch": 0.03315086714073771,
      "grad_norm": 4.292598724365234,
      "learning_rate": 7.991623002853296e-05,
      "loss": 12.932,
      "step": 162
    },
    {
      "epoch": 0.033355502123087945,
      "grad_norm": 4.622048377990723,
      "learning_rate": 7.965338144766186e-05,
      "loss": 13.1667,
      "step": 163
    },
    {
      "epoch": 0.033560137105438176,
      "grad_norm": 4.218106746673584,
      "learning_rate": 7.938926261462366e-05,
      "loss": 12.9429,
      "step": 164
    },
    {
      "epoch": 0.03376477208778841,
      "grad_norm": 4.615002155303955,
      "learning_rate": 7.912388484339012e-05,
      "loss": 13.4383,
      "step": 165
    },
    {
      "epoch": 0.03396940707013864,
      "grad_norm": 4.371853828430176,
      "learning_rate": 7.88572595018617e-05,
      "loss": 12.6596,
      "step": 166
    },
    {
      "epoch": 0.034174042052488876,
      "grad_norm": 4.507296562194824,
      "learning_rate": 7.858939801138061e-05,
      "loss": 13.0297,
      "step": 167
    },
    {
      "epoch": 0.03437867703483911,
      "grad_norm": 4.6610941886901855,
      "learning_rate": 7.832031184624164e-05,
      "loss": 12.6801,
      "step": 168
    },
    {
      "epoch": 0.03458331201718934,
      "grad_norm": 4.3974714279174805,
      "learning_rate": 7.80500125332005e-05,
      "loss": 12.6394,
      "step": 169
    },
    {
      "epoch": 0.03478794699953957,
      "grad_norm": 4.65360689163208,
      "learning_rate": 7.777851165098012e-05,
      "loss": 13.2642,
      "step": 170
    },
    {
      "epoch": 0.03499258198188981,
      "grad_norm": 4.651695251464844,
      "learning_rate": 7.750582082977467e-05,
      "loss": 13.3055,
      "step": 171
    },
    {
      "epoch": 0.03519721696424004,
      "grad_norm": 5.114010810852051,
      "learning_rate": 7.723195175075136e-05,
      "loss": 13.0045,
      "step": 172
    },
    {
      "epoch": 0.03540185194659027,
      "grad_norm": 5.113755702972412,
      "learning_rate": 7.695691614555003e-05,
      "loss": 12.9366,
      "step": 173
    },
    {
      "epoch": 0.0356064869289405,
      "grad_norm": 5.089533805847168,
      "learning_rate": 7.668072579578058e-05,
      "loss": 12.959,
      "step": 174
    },
    {
      "epoch": 0.03581112191129073,
      "grad_norm": 5.559483051300049,
      "learning_rate": 7.64033925325184e-05,
      "loss": 13.2842,
      "step": 175
    },
    {
      "epoch": 0.03601575689364097,
      "grad_norm": 5.3359761238098145,
      "learning_rate": 7.612492823579745e-05,
      "loss": 13.0262,
      "step": 176
    },
    {
      "epoch": 0.0362203918759912,
      "grad_norm": 5.409842014312744,
      "learning_rate": 7.584534483410137e-05,
      "loss": 13.0076,
      "step": 177
    },
    {
      "epoch": 0.03642502685834143,
      "grad_norm": 5.253081321716309,
      "learning_rate": 7.55646543038526e-05,
      "loss": 11.9703,
      "step": 178
    },
    {
      "epoch": 0.03662966184069166,
      "grad_norm": 5.482647895812988,
      "learning_rate": 7.528286866889924e-05,
      "loss": 12.6692,
      "step": 179
    },
    {
      "epoch": 0.0368342968230419,
      "grad_norm": 5.659306049346924,
      "learning_rate": 7.500000000000001e-05,
      "loss": 13.0874,
      "step": 180
    },
    {
      "epoch": 0.03703893180539213,
      "grad_norm": 5.71022891998291,
      "learning_rate": 7.471606041430723e-05,
      "loss": 12.9602,
      "step": 181
    },
    {
      "epoch": 0.03724356678774236,
      "grad_norm": 6.031240940093994,
      "learning_rate": 7.443106207484776e-05,
      "loss": 12.8276,
      "step": 182
    },
    {
      "epoch": 0.037448201770092594,
      "grad_norm": 5.916280746459961,
      "learning_rate": 7.414501719000187e-05,
      "loss": 12.7036,
      "step": 183
    },
    {
      "epoch": 0.03765283675244283,
      "grad_norm": 6.090421676635742,
      "learning_rate": 7.385793801298042e-05,
      "loss": 12.5362,
      "step": 184
    },
    {
      "epoch": 0.03785747173479306,
      "grad_norm": 6.97968053817749,
      "learning_rate": 7.35698368412999e-05,
      "loss": 13.1994,
      "step": 185
    },
    {
      "epoch": 0.038062106717143294,
      "grad_norm": 6.6946587562561035,
      "learning_rate": 7.328072601625557e-05,
      "loss": 12.9428,
      "step": 186
    },
    {
      "epoch": 0.038266741699493526,
      "grad_norm": 6.86458158493042,
      "learning_rate": 7.2990617922393e-05,
      "loss": 13.5336,
      "step": 187
    },
    {
      "epoch": 0.038471376681843764,
      "grad_norm": 7.41053581237793,
      "learning_rate": 7.269952498697734e-05,
      "loss": 13.1752,
      "step": 188
    },
    {
      "epoch": 0.038676011664193995,
      "grad_norm": 6.769413948059082,
      "learning_rate": 7.240745967946113e-05,
      "loss": 12.3618,
      "step": 189
    },
    {
      "epoch": 0.038880646646544226,
      "grad_norm": 8.171807289123535,
      "learning_rate": 7.211443451095007e-05,
      "loss": 13.4612,
      "step": 190
    },
    {
      "epoch": 0.03908528162889446,
      "grad_norm": 7.6870598793029785,
      "learning_rate": 7.18204620336671e-05,
      "loss": 12.8721,
      "step": 191
    },
    {
      "epoch": 0.039289916611244695,
      "grad_norm": 7.984126567840576,
      "learning_rate": 7.152555484041476e-05,
      "loss": 12.5025,
      "step": 192
    },
    {
      "epoch": 0.039494551593594926,
      "grad_norm": 8.748424530029297,
      "learning_rate": 7.122972556403567e-05,
      "loss": 12.5803,
      "step": 193
    },
    {
      "epoch": 0.03969918657594516,
      "grad_norm": 8.19789981842041,
      "learning_rate": 7.09329868768714e-05,
      "loss": 13.0793,
      "step": 194
    },
    {
      "epoch": 0.03990382155829539,
      "grad_norm": 8.25755786895752,
      "learning_rate": 7.063535149021973e-05,
      "loss": 13.2436,
      "step": 195
    },
    {
      "epoch": 0.040108456540645626,
      "grad_norm": 10.084080696105957,
      "learning_rate": 7.033683215379002e-05,
      "loss": 12.4769,
      "step": 196
    },
    {
      "epoch": 0.04031309152299586,
      "grad_norm": 11.04244327545166,
      "learning_rate": 7.003744165515705e-05,
      "loss": 13.3229,
      "step": 197
    },
    {
      "epoch": 0.04051772650534609,
      "grad_norm": 10.718149185180664,
      "learning_rate": 6.973719281921335e-05,
      "loss": 13.0458,
      "step": 198
    },
    {
      "epoch": 0.04072236148769632,
      "grad_norm": 12.596996307373047,
      "learning_rate": 6.943609850761979e-05,
      "loss": 13.2156,
      "step": 199
    },
    {
      "epoch": 0.04092699647004656,
      "grad_norm": 16.626497268676758,
      "learning_rate": 6.91341716182545e-05,
      "loss": 12.1366,
      "step": 200
    },
    {
      "epoch": 0.04092699647004656,
      "eval_loss": 3.208616256713867,
      "eval_runtime": 47.3335,
      "eval_samples_per_second": 173.872,
      "eval_steps_per_second": 43.479,
      "step": 200
    },
    {
      "epoch": 0.04113163145239679,
      "grad_norm": 3.9187700748443604,
      "learning_rate": 6.883142508466054e-05,
      "loss": 13.2198,
      "step": 201
    },
    {
      "epoch": 0.04133626643474702,
      "grad_norm": 5.088418006896973,
      "learning_rate": 6.852787187549182e-05,
      "loss": 12.934,
      "step": 202
    },
    {
      "epoch": 0.04154090141709725,
      "grad_norm": 4.74566125869751,
      "learning_rate": 6.82235249939575e-05,
      "loss": 13.3393,
      "step": 203
    },
    {
      "epoch": 0.04174553639944749,
      "grad_norm": 4.898460865020752,
      "learning_rate": 6.7918397477265e-05,
      "loss": 13.5989,
      "step": 204
    },
    {
      "epoch": 0.04195017138179772,
      "grad_norm": 4.619757652282715,
      "learning_rate": 6.761250239606169e-05,
      "loss": 13.0342,
      "step": 205
    },
    {
      "epoch": 0.04215480636414795,
      "grad_norm": 4.482340335845947,
      "learning_rate": 6.730585285387465e-05,
      "loss": 13.0489,
      "step": 206
    },
    {
      "epoch": 0.04235944134649818,
      "grad_norm": 4.1753644943237305,
      "learning_rate": 6.699846198654971e-05,
      "loss": 13.2165,
      "step": 207
    },
    {
      "epoch": 0.04256407632884842,
      "grad_norm": 3.9566304683685303,
      "learning_rate": 6.669034296168855e-05,
      "loss": 13.2601,
      "step": 208
    },
    {
      "epoch": 0.04276871131119865,
      "grad_norm": 4.045615196228027,
      "learning_rate": 6.638150897808468e-05,
      "loss": 13.0854,
      "step": 209
    },
    {
      "epoch": 0.04297334629354888,
      "grad_norm": 3.9672138690948486,
      "learning_rate": 6.607197326515808e-05,
      "loss": 13.5277,
      "step": 210
    },
    {
      "epoch": 0.04317798127589911,
      "grad_norm": 3.8964602947235107,
      "learning_rate": 6.57617490823885e-05,
      "loss": 13.3445,
      "step": 211
    },
    {
      "epoch": 0.043382616258249344,
      "grad_norm": 3.9119648933410645,
      "learning_rate": 6.545084971874738e-05,
      "loss": 12.763,
      "step": 212
    },
    {
      "epoch": 0.04358725124059958,
      "grad_norm": 4.09339714050293,
      "learning_rate": 6.513928849212873e-05,
      "loss": 13.1653,
      "step": 213
    },
    {
      "epoch": 0.043791886222949813,
      "grad_norm": 4.33394193649292,
      "learning_rate": 6.482707874877854e-05,
      "loss": 13.1689,
      "step": 214
    },
    {
      "epoch": 0.043996521205300045,
      "grad_norm": 4.071203231811523,
      "learning_rate": 6.451423386272312e-05,
      "loss": 12.9157,
      "step": 215
    },
    {
      "epoch": 0.044201156187650276,
      "grad_norm": 4.155096054077148,
      "learning_rate": 6.420076723519614e-05,
      "loss": 12.9944,
      "step": 216
    },
    {
      "epoch": 0.044405791170000514,
      "grad_norm": 4.474510669708252,
      "learning_rate": 6.388669229406462e-05,
      "loss": 12.9211,
      "step": 217
    },
    {
      "epoch": 0.044610426152350745,
      "grad_norm": 4.203741550445557,
      "learning_rate": 6.357202249325371e-05,
      "loss": 12.5732,
      "step": 218
    },
    {
      "epoch": 0.044815061134700976,
      "grad_norm": 4.361083984375,
      "learning_rate": 6.32567713121704e-05,
      "loss": 13.1363,
      "step": 219
    },
    {
      "epoch": 0.04501969611705121,
      "grad_norm": 4.626219749450684,
      "learning_rate": 6.294095225512603e-05,
      "loss": 13.1044,
      "step": 220
    },
    {
      "epoch": 0.045224331099401445,
      "grad_norm": 4.69849967956543,
      "learning_rate": 6.26245788507579e-05,
      "loss": 12.7933,
      "step": 221
    },
    {
      "epoch": 0.045428966081751676,
      "grad_norm": 4.679666996002197,
      "learning_rate": 6.230766465144967e-05,
      "loss": 13.1581,
      "step": 222
    },
    {
      "epoch": 0.04563360106410191,
      "grad_norm": 4.953638553619385,
      "learning_rate": 6.199022323275083e-05,
      "loss": 13.0212,
      "step": 223
    },
    {
      "epoch": 0.04583823604645214,
      "grad_norm": 4.850236415863037,
      "learning_rate": 6.167226819279528e-05,
      "loss": 13.0019,
      "step": 224
    },
    {
      "epoch": 0.046042871028802376,
      "grad_norm": 4.989190578460693,
      "learning_rate": 6.135381315171867e-05,
      "loss": 12.2553,
      "step": 225
    },
    {
      "epoch": 0.04624750601115261,
      "grad_norm": 4.897017478942871,
      "learning_rate": 6.103487175107507e-05,
      "loss": 13.0096,
      "step": 226
    },
    {
      "epoch": 0.04645214099350284,
      "grad_norm": 4.998581886291504,
      "learning_rate": 6.071545765325254e-05,
      "loss": 12.4746,
      "step": 227
    },
    {
      "epoch": 0.04665677597585307,
      "grad_norm": 5.277119159698486,
      "learning_rate": 6.0395584540887963e-05,
      "loss": 12.6303,
      "step": 228
    },
    {
      "epoch": 0.04686141095820331,
      "grad_norm": 5.548853874206543,
      "learning_rate": 6.007526611628086e-05,
      "loss": 12.8215,
      "step": 229
    },
    {
      "epoch": 0.04706604594055354,
      "grad_norm": 5.378997325897217,
      "learning_rate": 5.9754516100806423e-05,
      "loss": 13.1972,
      "step": 230
    },
    {
      "epoch": 0.04727068092290377,
      "grad_norm": 5.815462589263916,
      "learning_rate": 5.9433348234327765e-05,
      "loss": 13.2821,
      "step": 231
    },
    {
      "epoch": 0.047475315905254,
      "grad_norm": 5.872306823730469,
      "learning_rate": 5.911177627460739e-05,
      "loss": 13.199,
      "step": 232
    },
    {
      "epoch": 0.04767995088760424,
      "grad_norm": 5.859600067138672,
      "learning_rate": 5.8789813996717736e-05,
      "loss": 12.6879,
      "step": 233
    },
    {
      "epoch": 0.04788458586995447,
      "grad_norm": 6.172786712646484,
      "learning_rate": 5.8467475192451226e-05,
      "loss": 13.2259,
      "step": 234
    },
    {
      "epoch": 0.0480892208523047,
      "grad_norm": 6.31078577041626,
      "learning_rate": 5.814477366972945e-05,
      "loss": 12.3852,
      "step": 235
    },
    {
      "epoch": 0.04829385583465493,
      "grad_norm": 6.534313201904297,
      "learning_rate": 5.782172325201155e-05,
      "loss": 13.3609,
      "step": 236
    },
    {
      "epoch": 0.04849849081700517,
      "grad_norm": 6.585941314697266,
      "learning_rate": 5.749833777770225e-05,
      "loss": 12.9351,
      "step": 237
    },
    {
      "epoch": 0.0487031257993554,
      "grad_norm": 6.7909159660339355,
      "learning_rate": 5.717463109955896e-05,
      "loss": 12.2785,
      "step": 238
    },
    {
      "epoch": 0.04890776078170563,
      "grad_norm": 7.011481761932373,
      "learning_rate": 5.685061708409841e-05,
      "loss": 13.0815,
      "step": 239
    },
    {
      "epoch": 0.04911239576405586,
      "grad_norm": 7.400633335113525,
      "learning_rate": 5.6526309611002594e-05,
      "loss": 12.5917,
      "step": 240
    },
    {
      "epoch": 0.0493170307464061,
      "grad_norm": 6.944792747497559,
      "learning_rate": 5.6201722572524275e-05,
      "loss": 12.8307,
      "step": 241
    },
    {
      "epoch": 0.04952166572875633,
      "grad_norm": 9.408550262451172,
      "learning_rate": 5.587686987289189e-05,
      "loss": 13.0507,
      "step": 242
    },
    {
      "epoch": 0.049726300711106564,
      "grad_norm": 8.005476951599121,
      "learning_rate": 5.5551765427713884e-05,
      "loss": 12.5769,
      "step": 243
    },
    {
      "epoch": 0.049930935693456795,
      "grad_norm": 8.190591812133789,
      "learning_rate": 5.522642316338268e-05,
      "loss": 12.2517,
      "step": 244
    },
    {
      "epoch": 0.050135570675807026,
      "grad_norm": 9.257965087890625,
      "learning_rate": 5.490085701647805e-05,
      "loss": 13.0658,
      "step": 245
    },
    {
      "epoch": 0.050340205658157264,
      "grad_norm": 9.173686027526855,
      "learning_rate": 5.457508093317013e-05,
      "loss": 12.6359,
      "step": 246
    },
    {
      "epoch": 0.050544840640507495,
      "grad_norm": 9.59607982635498,
      "learning_rate": 5.4249108868622086e-05,
      "loss": 13.0733,
      "step": 247
    },
    {
      "epoch": 0.050749475622857726,
      "grad_norm": 10.842556953430176,
      "learning_rate": 5.392295478639225e-05,
      "loss": 12.6057,
      "step": 248
    },
    {
      "epoch": 0.05095411060520796,
      "grad_norm": 11.467256546020508,
      "learning_rate": 5.359663265783598e-05,
      "loss": 12.7772,
      "step": 249
    },
    {
      "epoch": 0.051158745587558195,
      "grad_norm": 16.473846435546875,
      "learning_rate": 5.327015646150716e-05,
      "loss": 12.2446,
      "step": 250
    },
    {
      "epoch": 0.051363380569908426,
      "grad_norm": 2.952035903930664,
      "learning_rate": 5.294354018255945e-05,
      "loss": 13.1735,
      "step": 251
    },
    {
      "epoch": 0.05156801555225866,
      "grad_norm": 3.22007417678833,
      "learning_rate": 5.26167978121472e-05,
      "loss": 13.0792,
      "step": 252
    },
    {
      "epoch": 0.05177265053460889,
      "grad_norm": 3.549088716506958,
      "learning_rate": 5.228994334682604e-05,
      "loss": 12.7948,
      "step": 253
    },
    {
      "epoch": 0.051977285516959126,
      "grad_norm": 3.728848695755005,
      "learning_rate": 5.196299078795344e-05,
      "loss": 13.0125,
      "step": 254
    },
    {
      "epoch": 0.05218192049930936,
      "grad_norm": 3.749281167984009,
      "learning_rate": 5.1635954141088813e-05,
      "loss": 13.2622,
      "step": 255
    },
    {
      "epoch": 0.05238655548165959,
      "grad_norm": 3.824709415435791,
      "learning_rate": 5.1308847415393666e-05,
      "loss": 12.9234,
      "step": 256
    },
    {
      "epoch": 0.05259119046400982,
      "grad_norm": 3.764427661895752,
      "learning_rate": 5.0981684623031415e-05,
      "loss": 13.2274,
      "step": 257
    },
    {
      "epoch": 0.05279582544636006,
      "grad_norm": 3.7923285961151123,
      "learning_rate": 5.0654479778567223e-05,
      "loss": 13.1988,
      "step": 258
    },
    {
      "epoch": 0.05300046042871029,
      "grad_norm": 3.733365535736084,
      "learning_rate": 5.0327246898367597e-05,
      "loss": 12.5384,
      "step": 259
    },
    {
      "epoch": 0.05320509541106052,
      "grad_norm": 3.8030307292938232,
      "learning_rate": 5e-05,
      "loss": 12.991,
      "step": 260
    },
    {
      "epoch": 0.05340973039341075,
      "grad_norm": 3.7069780826568604,
      "learning_rate": 4.9672753101632415e-05,
      "loss": 12.7534,
      "step": 261
    },
    {
      "epoch": 0.05361436537576099,
      "grad_norm": 3.764336109161377,
      "learning_rate": 4.934552022143279e-05,
      "loss": 12.8254,
      "step": 262
    },
    {
      "epoch": 0.05381900035811122,
      "grad_norm": 3.753891944885254,
      "learning_rate": 4.901831537696859e-05,
      "loss": 12.6035,
      "step": 263
    },
    {
      "epoch": 0.05402363534046145,
      "grad_norm": 3.9714443683624268,
      "learning_rate": 4.869115258460635e-05,
      "loss": 12.6499,
      "step": 264
    },
    {
      "epoch": 0.05422827032281168,
      "grad_norm": 3.999743938446045,
      "learning_rate": 4.83640458589112e-05,
      "loss": 13.0991,
      "step": 265
    },
    {
      "epoch": 0.05443290530516192,
      "grad_norm": 3.9809932708740234,
      "learning_rate": 4.8037009212046586e-05,
      "loss": 12.6208,
      "step": 266
    },
    {
      "epoch": 0.05463754028751215,
      "grad_norm": 4.165307521820068,
      "learning_rate": 4.7710056653173976e-05,
      "loss": 13.0348,
      "step": 267
    },
    {
      "epoch": 0.05484217526986238,
      "grad_norm": 4.428051948547363,
      "learning_rate": 4.738320218785281e-05,
      "loss": 13.4851,
      "step": 268
    },
    {
      "epoch": 0.055046810252212613,
      "grad_norm": 4.276752948760986,
      "learning_rate": 4.7056459817440544e-05,
      "loss": 12.8883,
      "step": 269
    },
    {
      "epoch": 0.05525144523456285,
      "grad_norm": 4.741238594055176,
      "learning_rate": 4.6729843538492847e-05,
      "loss": 13.3597,
      "step": 270
    },
    {
      "epoch": 0.05545608021691308,
      "grad_norm": 4.348086833953857,
      "learning_rate": 4.640336734216403e-05,
      "loss": 12.9206,
      "step": 271
    },
    {
      "epoch": 0.055660715199263314,
      "grad_norm": 4.487641334533691,
      "learning_rate": 4.607704521360776e-05,
      "loss": 12.7532,
      "step": 272
    },
    {
      "epoch": 0.055865350181613545,
      "grad_norm": 4.975533485412598,
      "learning_rate": 4.575089113137792e-05,
      "loss": 12.5924,
      "step": 273
    },
    {
      "epoch": 0.05606998516396378,
      "grad_norm": 4.721080303192139,
      "learning_rate": 4.542491906682989e-05,
      "loss": 12.9426,
      "step": 274
    },
    {
      "epoch": 0.056274620146314014,
      "grad_norm": 4.957543849945068,
      "learning_rate": 4.509914298352197e-05,
      "loss": 12.5506,
      "step": 275
    },
    {
      "epoch": 0.056479255128664245,
      "grad_norm": 4.958243370056152,
      "learning_rate": 4.477357683661734e-05,
      "loss": 12.7699,
      "step": 276
    },
    {
      "epoch": 0.056683890111014476,
      "grad_norm": 5.35684061050415,
      "learning_rate": 4.444823457228612e-05,
      "loss": 12.8455,
      "step": 277
    },
    {
      "epoch": 0.056888525093364714,
      "grad_norm": 5.440086364746094,
      "learning_rate": 4.412313012710813e-05,
      "loss": 13.7743,
      "step": 278
    },
    {
      "epoch": 0.057093160075714945,
      "grad_norm": 5.20829439163208,
      "learning_rate": 4.379827742747575e-05,
      "loss": 13.318,
      "step": 279
    },
    {
      "epoch": 0.057297795058065176,
      "grad_norm": 5.2258405685424805,
      "learning_rate": 4.347369038899744e-05,
      "loss": 13.0874,
      "step": 280
    },
    {
      "epoch": 0.05750243004041541,
      "grad_norm": 5.654691219329834,
      "learning_rate": 4.3149382915901606e-05,
      "loss": 12.4869,
      "step": 281
    },
    {
      "epoch": 0.05770706502276564,
      "grad_norm": 5.957024097442627,
      "learning_rate": 4.282536890044104e-05,
      "loss": 12.8174,
      "step": 282
    },
    {
      "epoch": 0.057911700005115876,
      "grad_norm": 6.341736316680908,
      "learning_rate": 4.250166222229774e-05,
      "loss": 12.6841,
      "step": 283
    },
    {
      "epoch": 0.05811633498746611,
      "grad_norm": 6.56013822555542,
      "learning_rate": 4.2178276747988446e-05,
      "loss": 13.0789,
      "step": 284
    },
    {
      "epoch": 0.05832096996981634,
      "grad_norm": 6.450329780578613,
      "learning_rate": 4.185522633027057e-05,
      "loss": 12.6028,
      "step": 285
    },
    {
      "epoch": 0.05852560495216657,
      "grad_norm": 6.356710433959961,
      "learning_rate": 4.153252480754877e-05,
      "loss": 13.0871,
      "step": 286
    },
    {
      "epoch": 0.05873023993451681,
      "grad_norm": 6.647814750671387,
      "learning_rate": 4.1210186003282275e-05,
      "loss": 12.6458,
      "step": 287
    },
    {
      "epoch": 0.05893487491686704,
      "grad_norm": 6.441559314727783,
      "learning_rate": 4.088822372539263e-05,
      "loss": 12.2483,
      "step": 288
    },
    {
      "epoch": 0.05913950989921727,
      "grad_norm": 8.019023895263672,
      "learning_rate": 4.0566651765672246e-05,
      "loss": 12.7241,
      "step": 289
    },
    {
      "epoch": 0.0593441448815675,
      "grad_norm": 7.507869720458984,
      "learning_rate": 4.0245483899193595e-05,
      "loss": 13.3737,
      "step": 290
    },
    {
      "epoch": 0.05954877986391774,
      "grad_norm": 7.296957015991211,
      "learning_rate": 3.992473388371915e-05,
      "loss": 12.6952,
      "step": 291
    },
    {
      "epoch": 0.05975341484626797,
      "grad_norm": 8.110812187194824,
      "learning_rate": 3.960441545911204e-05,
      "loss": 12.2771,
      "step": 292
    },
    {
      "epoch": 0.0599580498286182,
      "grad_norm": 8.923057556152344,
      "learning_rate": 3.928454234674747e-05,
      "loss": 12.7108,
      "step": 293
    },
    {
      "epoch": 0.06016268481096843,
      "grad_norm": 10.090682983398438,
      "learning_rate": 3.896512824892495e-05,
      "loss": 13.1477,
      "step": 294
    },
    {
      "epoch": 0.06036731979331867,
      "grad_norm": 8.957847595214844,
      "learning_rate": 3.864618684828134e-05,
      "loss": 11.7159,
      "step": 295
    },
    {
      "epoch": 0.0605719547756689,
      "grad_norm": 10.131745338439941,
      "learning_rate": 3.832773180720475e-05,
      "loss": 12.191,
      "step": 296
    },
    {
      "epoch": 0.06077658975801913,
      "grad_norm": 10.587480545043945,
      "learning_rate": 3.800977676724919e-05,
      "loss": 12.6343,
      "step": 297
    },
    {
      "epoch": 0.060981224740369364,
      "grad_norm": 11.739582061767578,
      "learning_rate": 3.769233534855035e-05,
      "loss": 12.5622,
      "step": 298
    },
    {
      "epoch": 0.0611858597227196,
      "grad_norm": 14.421760559082031,
      "learning_rate": 3.73754211492421e-05,
      "loss": 12.8323,
      "step": 299
    },
    {
      "epoch": 0.06139049470506983,
      "grad_norm": 17.0710391998291,
      "learning_rate": 3.705904774487396e-05,
      "loss": 11.9398,
      "step": 300
    },
    {
      "epoch": 0.06139049470506983,
      "eval_loss": 3.1531736850738525,
      "eval_runtime": 47.3782,
      "eval_samples_per_second": 173.709,
      "eval_steps_per_second": 43.438,
      "step": 300
    },
    {
      "epoch": 0.061595129687420064,
      "grad_norm": 2.744182825088501,
      "learning_rate": 3.6743228687829595e-05,
      "loss": 12.8172,
      "step": 301
    },
    {
      "epoch": 0.061799764669770295,
      "grad_norm": 3.1303441524505615,
      "learning_rate": 3.642797750674629e-05,
      "loss": 12.6422,
      "step": 302
    },
    {
      "epoch": 0.06200439965212053,
      "grad_norm": 3.3844480514526367,
      "learning_rate": 3.6113307705935396e-05,
      "loss": 13.0422,
      "step": 303
    },
    {
      "epoch": 0.062209034634470764,
      "grad_norm": 3.383885383605957,
      "learning_rate": 3.579923276480387e-05,
      "loss": 12.6751,
      "step": 304
    },
    {
      "epoch": 0.062413669616820995,
      "grad_norm": 3.3772575855255127,
      "learning_rate": 3.5485766137276894e-05,
      "loss": 13.055,
      "step": 305
    },
    {
      "epoch": 0.06261830459917123,
      "grad_norm": 3.6094918251037598,
      "learning_rate": 3.5172921251221455e-05,
      "loss": 13.1467,
      "step": 306
    },
    {
      "epoch": 0.06282293958152146,
      "grad_norm": 3.674668073654175,
      "learning_rate": 3.486071150787128e-05,
      "loss": 12.7998,
      "step": 307
    },
    {
      "epoch": 0.06302757456387169,
      "grad_norm": 3.914242744445801,
      "learning_rate": 3.4549150281252636e-05,
      "loss": 13.6751,
      "step": 308
    },
    {
      "epoch": 0.06323220954622193,
      "grad_norm": 3.7367589473724365,
      "learning_rate": 3.423825091761153e-05,
      "loss": 13.0127,
      "step": 309
    },
    {
      "epoch": 0.06343684452857216,
      "grad_norm": 3.7376673221588135,
      "learning_rate": 3.392802673484193e-05,
      "loss": 13.3006,
      "step": 310
    },
    {
      "epoch": 0.06364147951092239,
      "grad_norm": 3.9828696250915527,
      "learning_rate": 3.361849102191533e-05,
      "loss": 13.0535,
      "step": 311
    },
    {
      "epoch": 0.06384611449327263,
      "grad_norm": 3.7258951663970947,
      "learning_rate": 3.330965703831146e-05,
      "loss": 12.8206,
      "step": 312
    },
    {
      "epoch": 0.06405074947562286,
      "grad_norm": 3.842252254486084,
      "learning_rate": 3.300153801345028e-05,
      "loss": 13.0683,
      "step": 313
    },
    {
      "epoch": 0.06425538445797309,
      "grad_norm": 3.9074199199676514,
      "learning_rate": 3.2694147146125345e-05,
      "loss": 12.9179,
      "step": 314
    },
    {
      "epoch": 0.06446001944032333,
      "grad_norm": 3.9515974521636963,
      "learning_rate": 3.2387497603938326e-05,
      "loss": 13.3211,
      "step": 315
    },
    {
      "epoch": 0.06466465442267355,
      "grad_norm": 4.135197162628174,
      "learning_rate": 3.2081602522734986e-05,
      "loss": 13.1106,
      "step": 316
    },
    {
      "epoch": 0.06486928940502379,
      "grad_norm": 4.115512371063232,
      "learning_rate": 3.177647500604252e-05,
      "loss": 12.8488,
      "step": 317
    },
    {
      "epoch": 0.06507392438737403,
      "grad_norm": 4.200262069702148,
      "learning_rate": 3.147212812450819e-05,
      "loss": 12.7581,
      "step": 318
    },
    {
      "epoch": 0.06527855936972425,
      "grad_norm": 4.211337089538574,
      "learning_rate": 3.116857491533947e-05,
      "loss": 12.8883,
      "step": 319
    },
    {
      "epoch": 0.06548319435207449,
      "grad_norm": 4.417909622192383,
      "learning_rate": 3.086582838174551e-05,
      "loss": 12.9176,
      "step": 320
    },
    {
      "epoch": 0.06568782933442471,
      "grad_norm": 4.327807903289795,
      "learning_rate": 3.056390149238022e-05,
      "loss": 12.5289,
      "step": 321
    },
    {
      "epoch": 0.06589246431677495,
      "grad_norm": 4.564601898193359,
      "learning_rate": 3.0262807180786647e-05,
      "loss": 12.8324,
      "step": 322
    },
    {
      "epoch": 0.06609709929912519,
      "grad_norm": 4.540707111358643,
      "learning_rate": 2.996255834484296e-05,
      "loss": 12.1696,
      "step": 323
    },
    {
      "epoch": 0.06630173428147541,
      "grad_norm": 4.797798156738281,
      "learning_rate": 2.9663167846209998e-05,
      "loss": 12.73,
      "step": 324
    },
    {
      "epoch": 0.06650636926382565,
      "grad_norm": 4.712722301483154,
      "learning_rate": 2.936464850978027e-05,
      "loss": 12.439,
      "step": 325
    },
    {
      "epoch": 0.06671100424617589,
      "grad_norm": 4.9245781898498535,
      "learning_rate": 2.9067013123128613e-05,
      "loss": 12.8693,
      "step": 326
    },
    {
      "epoch": 0.06691563922852611,
      "grad_norm": 5.3487467765808105,
      "learning_rate": 2.8770274435964355e-05,
      "loss": 12.7175,
      "step": 327
    },
    {
      "epoch": 0.06712027421087635,
      "grad_norm": 5.03184175491333,
      "learning_rate": 2.8474445159585235e-05,
      "loss": 12.3854,
      "step": 328
    },
    {
      "epoch": 0.06732490919322658,
      "grad_norm": 4.980981349945068,
      "learning_rate": 2.8179537966332887e-05,
      "loss": 13.0615,
      "step": 329
    },
    {
      "epoch": 0.06752954417557681,
      "grad_norm": 5.728270053863525,
      "learning_rate": 2.7885565489049946e-05,
      "loss": 13.4382,
      "step": 330
    },
    {
      "epoch": 0.06773417915792705,
      "grad_norm": 5.375972270965576,
      "learning_rate": 2.759254032053888e-05,
      "loss": 12.7093,
      "step": 331
    },
    {
      "epoch": 0.06793881414027728,
      "grad_norm": 6.161487579345703,
      "learning_rate": 2.7300475013022663e-05,
      "loss": 13.2182,
      "step": 332
    },
    {
      "epoch": 0.06814344912262751,
      "grad_norm": 5.7192254066467285,
      "learning_rate": 2.700938207760701e-05,
      "loss": 13.2617,
      "step": 333
    },
    {
      "epoch": 0.06834808410497775,
      "grad_norm": 5.916773796081543,
      "learning_rate": 2.671927398374443e-05,
      "loss": 13.1881,
      "step": 334
    },
    {
      "epoch": 0.06855271908732798,
      "grad_norm": 6.228060722351074,
      "learning_rate": 2.6430163158700115e-05,
      "loss": 12.904,
      "step": 335
    },
    {
      "epoch": 0.06875735406967821,
      "grad_norm": 7.778335094451904,
      "learning_rate": 2.6142061987019577e-05,
      "loss": 13.1079,
      "step": 336
    },
    {
      "epoch": 0.06896198905202844,
      "grad_norm": 6.622939109802246,
      "learning_rate": 2.5854982809998153e-05,
      "loss": 12.9644,
      "step": 337
    },
    {
      "epoch": 0.06916662403437868,
      "grad_norm": 6.916367053985596,
      "learning_rate": 2.556893792515227e-05,
      "loss": 13.016,
      "step": 338
    },
    {
      "epoch": 0.06937125901672891,
      "grad_norm": 6.418735980987549,
      "learning_rate": 2.5283939585692783e-05,
      "loss": 12.5322,
      "step": 339
    },
    {
      "epoch": 0.06957589399907914,
      "grad_norm": 7.215633392333984,
      "learning_rate": 2.500000000000001e-05,
      "loss": 12.5916,
      "step": 340
    },
    {
      "epoch": 0.06978052898142938,
      "grad_norm": 7.442222595214844,
      "learning_rate": 2.471713133110078e-05,
      "loss": 13.7867,
      "step": 341
    },
    {
      "epoch": 0.06998516396377961,
      "grad_norm": 8.242687225341797,
      "learning_rate": 2.4435345696147403e-05,
      "loss": 12.5457,
      "step": 342
    },
    {
      "epoch": 0.07018979894612984,
      "grad_norm": 8.024588584899902,
      "learning_rate": 2.4154655165898627e-05,
      "loss": 13.2987,
      "step": 343
    },
    {
      "epoch": 0.07039443392848008,
      "grad_norm": 8.097381591796875,
      "learning_rate": 2.3875071764202563e-05,
      "loss": 12.2318,
      "step": 344
    },
    {
      "epoch": 0.0705990689108303,
      "grad_norm": 8.730584144592285,
      "learning_rate": 2.3596607467481603e-05,
      "loss": 12.7858,
      "step": 345
    },
    {
      "epoch": 0.07080370389318054,
      "grad_norm": 10.03264045715332,
      "learning_rate": 2.3319274204219428e-05,
      "loss": 12.4164,
      "step": 346
    },
    {
      "epoch": 0.07100833887553078,
      "grad_norm": 10.419230461120605,
      "learning_rate": 2.3043083854449988e-05,
      "loss": 11.9891,
      "step": 347
    },
    {
      "epoch": 0.071212973857881,
      "grad_norm": 10.628999710083008,
      "learning_rate": 2.2768048249248648e-05,
      "loss": 13.0758,
      "step": 348
    },
    {
      "epoch": 0.07141760884023124,
      "grad_norm": 12.918512344360352,
      "learning_rate": 2.2494179170225333e-05,
      "loss": 12.2184,
      "step": 349
    },
    {
      "epoch": 0.07162224382258146,
      "grad_norm": 17.08102798461914,
      "learning_rate": 2.2221488349019903e-05,
      "loss": 13.2989,
      "step": 350
    },
    {
      "epoch": 0.0718268788049317,
      "grad_norm": 2.621025800704956,
      "learning_rate": 2.194998746679952e-05,
      "loss": 13.1545,
      "step": 351
    },
    {
      "epoch": 0.07203151378728194,
      "grad_norm": 2.733877182006836,
      "learning_rate": 2.167968815375837e-05,
      "loss": 12.7721,
      "step": 352
    },
    {
      "epoch": 0.07223614876963216,
      "grad_norm": 2.7492332458496094,
      "learning_rate": 2.1410601988619394e-05,
      "loss": 12.9304,
      "step": 353
    },
    {
      "epoch": 0.0724407837519824,
      "grad_norm": 3.0048248767852783,
      "learning_rate": 2.1142740498138324e-05,
      "loss": 12.5243,
      "step": 354
    },
    {
      "epoch": 0.07264541873433264,
      "grad_norm": 3.2829930782318115,
      "learning_rate": 2.08761151566099e-05,
      "loss": 12.8763,
      "step": 355
    },
    {
      "epoch": 0.07285005371668286,
      "grad_norm": 3.4896061420440674,
      "learning_rate": 2.061073738537635e-05,
      "loss": 12.6124,
      "step": 356
    },
    {
      "epoch": 0.0730546886990331,
      "grad_norm": 3.766759157180786,
      "learning_rate": 2.034661855233815e-05,
      "loss": 12.8122,
      "step": 357
    },
    {
      "epoch": 0.07325932368138333,
      "grad_norm": 3.564368486404419,
      "learning_rate": 2.008376997146705e-05,
      "loss": 12.5289,
      "step": 358
    },
    {
      "epoch": 0.07346395866373356,
      "grad_norm": 3.5410869121551514,
      "learning_rate": 1.982220290232143e-05,
      "loss": 12.9323,
      "step": 359
    },
    {
      "epoch": 0.0736685936460838,
      "grad_norm": 3.591470718383789,
      "learning_rate": 1.9561928549563968e-05,
      "loss": 12.6307,
      "step": 360
    },
    {
      "epoch": 0.07387322862843403,
      "grad_norm": 3.747335910797119,
      "learning_rate": 1.9302958062481673e-05,
      "loss": 12.2941,
      "step": 361
    },
    {
      "epoch": 0.07407786361078426,
      "grad_norm": 3.776078224182129,
      "learning_rate": 1.9045302534508297e-05,
      "loss": 12.6067,
      "step": 362
    },
    {
      "epoch": 0.0742824985931345,
      "grad_norm": 4.040799617767334,
      "learning_rate": 1.8788973002749112e-05,
      "loss": 12.9593,
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.07448713357548473, |
|
"grad_norm": 3.7775509357452393, |
|
"learning_rate": 1.8533980447508137e-05, |
|
"loss": 12.3113, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.07469176855783496, |
|
"grad_norm": 3.858264446258545, |
|
"learning_rate": 1.8280335791817733e-05, |
|
"loss": 12.9729, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.07489640354018519, |
|
"grad_norm": 4.055905342102051, |
|
"learning_rate": 1.8028049900970767e-05, |
|
"loss": 12.5952, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.07510103852253543, |
|
"grad_norm": 4.153656959533691, |
|
"learning_rate": 1.777713358205514e-05, |
|
"loss": 13.1257, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.07530567350488566, |
|
"grad_norm": 4.324829578399658, |
|
"learning_rate": 1.7527597583490822e-05, |
|
"loss": 12.7625, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.07551030848723589, |
|
"grad_norm": 4.624112129211426, |
|
"learning_rate": 1.7279452594569483e-05, |
|
"loss": 12.7958, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.07571494346958613, |
|
"grad_norm": 4.383573055267334, |
|
"learning_rate": 1.703270924499656e-05, |
|
"loss": 12.8363, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.07591957845193636, |
|
"grad_norm": 4.42855167388916, |
|
"learning_rate": 1.678737810443593e-05, |
|
"loss": 12.8649, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.07612421343428659, |
|
"grad_norm": 4.5845947265625, |
|
"learning_rate": 1.6543469682057106e-05, |
|
"loss": 12.901, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.07632884841663683, |
|
"grad_norm": 4.834083080291748, |
|
"learning_rate": 1.6300994426085103e-05, |
|
"loss": 13.347, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.07653348339898705, |
|
"grad_norm": 4.795494079589844, |
|
"learning_rate": 1.605996272335291e-05, |
|
"loss": 12.8918, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.07673811838133729, |
|
"grad_norm": 4.895383358001709, |
|
"learning_rate": 1.5820384898856434e-05, |
|
"loss": 13.162, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.07694275336368753, |
|
"grad_norm": 4.894996166229248, |
|
"learning_rate": 1.5582271215312294e-05, |
|
"loss": 12.701, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.07714738834603775, |
|
"grad_norm": 5.065547943115234, |
|
"learning_rate": 1.5345631872718214e-05, |
|
"loss": 12.8654, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.07735202332838799, |
|
"grad_norm": 5.112913608551025, |
|
"learning_rate": 1.5110477007916001e-05, |
|
"loss": 12.766, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.07755665831073823, |
|
"grad_norm": 5.340709209442139, |
|
"learning_rate": 1.4876816694157419e-05, |
|
"loss": 12.9361, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.07776129329308845, |
|
"grad_norm": 5.618555068969727, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 13.0778, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.07796592827543869, |
|
"grad_norm": 5.697518348693848, |
|
"learning_rate": 1.4414019692241437e-05, |
|
"loss": 12.582, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.07817056325778891, |
|
"grad_norm": 5.6424241065979, |
|
"learning_rate": 1.4184902828767287e-05, |
|
"loss": 12.7521, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.07837519824013915, |
|
"grad_norm": 6.131405830383301, |
|
"learning_rate": 1.3957320164854059e-05, |
|
"loss": 13.2849, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.07857983322248939, |
|
"grad_norm": 5.919434547424316, |
|
"learning_rate": 1.373128144938563e-05, |
|
"loss": 13.0198, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.07878446820483961, |
|
"grad_norm": 6.204239368438721, |
|
"learning_rate": 1.3506796365108232e-05, |
|
"loss": 13.2723, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.07898910318718985, |
|
"grad_norm": 6.187657833099365, |
|
"learning_rate": 1.3283874528215733e-05, |
|
"loss": 12.9363, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.07919373816954008, |
|
"grad_norm": 6.771162509918213, |
|
"learning_rate": 1.3062525487937699e-05, |
|
"loss": 12.9462, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.07939837315189031, |
|
"grad_norm": 7.12640380859375, |
|
"learning_rate": 1.2842758726130283e-05, |
|
"loss": 13.2795, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.07960300813424055, |
|
"grad_norm": 6.75380802154541, |
|
"learning_rate": 1.2624583656870154e-05, |
|
"loss": 12.9841, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.07980764311659078, |
|
"grad_norm": 7.025509357452393, |
|
"learning_rate": 1.2408009626051137e-05, |
|
"loss": 12.963, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.08001227809894101, |
|
"grad_norm": 7.841353893280029, |
|
"learning_rate": 1.2193045910983863e-05, |
|
"loss": 12.8249, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.08021691308129125, |
|
"grad_norm": 7.905152320861816, |
|
"learning_rate": 1.1979701719998453e-05, |
|
"loss": 12.7228, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.08042154806364148, |
|
"grad_norm": 7.849075794219971, |
|
"learning_rate": 1.1767986192049984e-05, |
|
"loss": 12.9185, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.08062618304599171, |
|
"grad_norm": 9.404616355895996, |
|
"learning_rate": 1.1557908396327028e-05, |
|
"loss": 12.9398, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.08083081802834194, |
|
"grad_norm": 9.282879829406738, |
|
"learning_rate": 1.134947733186315e-05, |
|
"loss": 12.8917, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.08103545301069218, |
|
"grad_norm": 10.77182388305664, |
|
"learning_rate": 1.1142701927151456e-05, |
|
"loss": 13.7875, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.08124008799304241, |
|
"grad_norm": 11.910287857055664, |
|
"learning_rate": 1.0937591039762085e-05, |
|
"loss": 13.1089, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.08144472297539264, |
|
"grad_norm": 11.270553588867188, |
|
"learning_rate": 1.0734153455962765e-05, |
|
"loss": 12.87, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.08164935795774288, |
|
"grad_norm": 12.47913646697998, |
|
"learning_rate": 1.0532397890342505e-05, |
|
"loss": 12.6082, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.08185399294009311, |
|
"grad_norm": 19.859079360961914, |
|
"learning_rate": 1.0332332985438248e-05, |
|
"loss": 12.4463, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08185399294009311, |
|
"eval_loss": 3.1334729194641113, |
|
"eval_runtime": 47.3815, |
|
"eval_samples_per_second": 173.697, |
|
"eval_steps_per_second": 43.435, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08205862792244334, |
|
"grad_norm": 2.140819549560547, |
|
"learning_rate": 1.013396731136465e-05, |
|
"loss": 12.6548, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.08226326290479358, |
|
"grad_norm": 2.7390999794006348, |
|
"learning_rate": 9.937309365446973e-06, |
|
"loss": 12.7752, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.0824678978871438, |
|
"grad_norm": 2.9735829830169678, |
|
"learning_rate": 9.742367571857091e-06, |
|
"loss": 13.2948, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.08267253286949404, |
|
"grad_norm": 2.9806323051452637, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 12.9468, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.08287716785184428, |
|
"grad_norm": 2.9686119556427, |
|
"learning_rate": 9.357665770419244e-06, |
|
"loss": 12.8879, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.0830818028341945, |
|
"grad_norm": 3.127723217010498, |
|
"learning_rate": 9.167922241916055e-06, |
|
"loss": 12.6892, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.08328643781654474, |
|
"grad_norm": 3.3917040824890137, |
|
"learning_rate": 8.97992782372432e-06, |
|
"loss": 13.2438, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.08349107279889498, |
|
"grad_norm": 3.28285551071167, |
|
"learning_rate": 8.793690568899216e-06, |
|
"loss": 12.7453, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.0836957077812452, |
|
"grad_norm": 3.364295482635498, |
|
"learning_rate": 8.609218455224893e-06, |
|
"loss": 13.0133, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.08390034276359544, |
|
"grad_norm": 3.374210834503174, |
|
"learning_rate": 8.426519384872733e-06, |
|
"loss": 12.79, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.08410497774594566, |
|
"grad_norm": 3.6572799682617188, |
|
"learning_rate": 8.245601184062852e-06, |
|
"loss": 12.6433, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.0843096127282959, |
|
"grad_norm": 3.5379762649536133, |
|
"learning_rate": 8.066471602728803e-06, |
|
"loss": 12.6151, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.08451424771064614, |
|
"grad_norm": 3.6328377723693848, |
|
"learning_rate": 7.889138314185678e-06, |
|
"loss": 13.0676, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.08471888269299636, |
|
"grad_norm": 4.000665664672852, |
|
"learning_rate": 7.71360891480134e-06, |
|
"loss": 13.2594, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.0849235176753466, |
|
"grad_norm": 4.031978130340576, |
|
"learning_rate": 7.539890923671062e-06, |
|
"loss": 12.738, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.08512815265769684, |
|
"grad_norm": 3.724813461303711, |
|
"learning_rate": 7.367991782295391e-06, |
|
"loss": 12.2267, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.08533278764004706, |
|
"grad_norm": 4.113027095794678, |
|
"learning_rate": 7.197918854261432e-06, |
|
"loss": 12.7985, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.0855374226223973, |
|
"grad_norm": 4.147072792053223, |
|
"learning_rate": 7.029679424927365e-06, |
|
"loss": 13.3496, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.08574205760474753, |
|
"grad_norm": 4.0234527587890625, |
|
"learning_rate": 6.863280701110408e-06, |
|
"loss": 13.023, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.08594669258709776, |
|
"grad_norm": 4.267332077026367, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 13.0938, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.086151327569448, |
|
"grad_norm": 4.35993766784668, |
|
"learning_rate": 6.536033802742813e-06, |
|
"loss": 12.7928, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.08635596255179823, |
|
"grad_norm": 4.47703218460083, |
|
"learning_rate": 6.375199646360142e-06, |
|
"loss": 12.9274, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.08656059753414846, |
|
"grad_norm": 4.716027736663818, |
|
"learning_rate": 6.216234231230012e-06, |
|
"loss": 13.2614, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.08676523251649869, |
|
"grad_norm": 4.96610164642334, |
|
"learning_rate": 6.059144366901736e-06, |
|
"loss": 12.2165, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.08696986749884893, |
|
"grad_norm": 4.611530780792236, |
|
"learning_rate": 5.903936782582253e-06, |
|
"loss": 12.7755, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.08717450248119916, |
|
"grad_norm": 4.9565300941467285, |
|
"learning_rate": 5.750618126847912e-06, |
|
"loss": 13.0633, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.08737913746354939, |
|
"grad_norm": 5.047351837158203, |
|
"learning_rate": 5.599194967359639e-06, |
|
"loss": 12.7034, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.08758377244589963, |
|
"grad_norm": 4.904860019683838, |
|
"learning_rate": 5.449673790581611e-06, |
|
"loss": 12.5122, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.08778840742824986, |
|
"grad_norm": 5.229506015777588, |
|
"learning_rate": 5.302061001503394e-06, |
|
"loss": 13.0031, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.08799304241060009, |
|
"grad_norm": 5.458662509918213, |
|
"learning_rate": 5.156362923365588e-06, |
|
"loss": 12.8912, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.08819767739295033, |
|
"grad_norm": 5.457494735717773, |
|
"learning_rate": 5.012585797388936e-06, |
|
"loss": 13.3934, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.08840231237530055, |
|
"grad_norm": 5.259174823760986, |
|
"learning_rate": 4.87073578250698e-06, |
|
"loss": 12.2529, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.08860694735765079, |
|
"grad_norm": 5.968120574951172, |
|
"learning_rate": 4.730818955102234e-06, |
|
"loss": 12.54, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.08881158234000103, |
|
"grad_norm": 6.257950782775879, |
|
"learning_rate": 4.592841308745932e-06, |
|
"loss": 12.6848, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.08901621732235125, |
|
"grad_norm": 6.009523868560791, |
|
"learning_rate": 4.456808753941205e-06, |
|
"loss": 13.0703, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.08922085230470149, |
|
"grad_norm": 6.272739887237549, |
|
"learning_rate": 4.322727117869951e-06, |
|
"loss": 12.3788, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.08942548728705173, |
|
"grad_norm": 6.675440788269043, |
|
"learning_rate": 4.190602144143207e-06, |
|
"loss": 13.0829, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.08963012226940195, |
|
"grad_norm": 6.91643762588501, |
|
"learning_rate": 4.06043949255509e-06, |
|
"loss": 13.3085, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.08983475725175219, |
|
"grad_norm": 7.155692100524902, |
|
"learning_rate": 3.932244738840379e-06, |
|
"loss": 12.7579, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.09003939223410241, |
|
"grad_norm": 6.8719940185546875, |
|
"learning_rate": 3.8060233744356633e-06, |
|
"loss": 12.96, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.09024402721645265, |
|
"grad_norm": 7.757195949554443, |
|
"learning_rate": 3.681780806244095e-06, |
|
"loss": 12.8733, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.09044866219880289, |
|
"grad_norm": 7.0664215087890625, |
|
"learning_rate": 3.5595223564037884e-06, |
|
"loss": 12.6288, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.09065329718115311, |
|
"grad_norm": 8.440971374511719, |
|
"learning_rate": 3.4392532620598216e-06, |
|
"loss": 13.3932, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.09085793216350335, |
|
"grad_norm": 8.004168510437012, |
|
"learning_rate": 3.3209786751399187e-06, |
|
"loss": 12.6128, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.09106256714585359, |
|
"grad_norm": 8.411291122436523, |
|
"learning_rate": 3.2047036621337236e-06, |
|
"loss": 13.4134, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.09126720212820381, |
|
"grad_norm": 9.257599830627441, |
|
"learning_rate": 3.0904332038757977e-06, |
|
"loss": 12.8954, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.09147183711055405, |
|
"grad_norm": 9.609465599060059, |
|
"learning_rate": 2.978172195332263e-06, |
|
"loss": 13.3849, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.09167647209290428, |
|
"grad_norm": 11.43130111694336, |
|
"learning_rate": 2.8679254453910785e-06, |
|
"loss": 12.0155, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.09188110707525451, |
|
"grad_norm": 12.757843971252441, |
|
"learning_rate": 2.759697676656098e-06, |
|
"loss": 12.183, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.09208574205760475, |
|
"grad_norm": 17.703031539916992, |
|
"learning_rate": 2.653493525244721e-06, |
|
"loss": 13.8012, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.09229037703995498, |
|
"grad_norm": 2.267075300216675, |
|
"learning_rate": 2.549317540589308e-06, |
|
"loss": 12.4868, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.09249501202230521, |
|
"grad_norm": 2.5427181720733643, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 12.5128, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.09269964700465545, |
|
"grad_norm": 2.599229097366333, |
|
"learning_rate": 2.3470678346851518e-06, |
|
"loss": 12.9201, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.09290428198700568, |
|
"grad_norm": 2.6054906845092773, |
|
"learning_rate": 2.2490027771406687e-06, |
|
"loss": 12.6195, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.09310891696935591, |
|
"grad_norm": 2.9035027027130127, |
|
"learning_rate": 2.152983213389559e-06, |
|
"loss": 12.7726, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.09331355195170614, |
|
"grad_norm": 3.060668468475342, |
|
"learning_rate": 2.0590132565903476e-06, |
|
"loss": 13.0542, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.09351818693405638, |
|
"grad_norm": 3.1342105865478516, |
|
"learning_rate": 1.9670969321032407e-06, |
|
"loss": 12.9032, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.09372282191640662, |
|
"grad_norm": 3.196115493774414, |
|
"learning_rate": 1.8772381773176417e-06, |
|
"loss": 13.1882, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.09392745689875684, |
|
"grad_norm": 3.232515335083008, |
|
"learning_rate": 1.7894408414835362e-06, |
|
"loss": 12.8594, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.09413209188110708, |
|
"grad_norm": 3.358297348022461, |
|
"learning_rate": 1.70370868554659e-06, |
|
"loss": 12.909, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.0943367268634573, |
|
"grad_norm": 3.49599027633667, |
|
"learning_rate": 1.620045381987012e-06, |
|
"loss": 12.7092, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.09454136184580754, |
|
"grad_norm": 3.4245402812957764, |
|
"learning_rate": 1.5384545146622852e-06, |
|
"loss": 12.656, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.09474599682815778, |
|
"grad_norm": 3.7484071254730225, |
|
"learning_rate": 1.4589395786535953e-06, |
|
"loss": 12.6428, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.094950631810508, |
|
"grad_norm": 3.6963798999786377, |
|
"learning_rate": 1.3815039801161721e-06, |
|
"loss": 12.75, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.09515526679285824, |
|
"grad_norm": 3.9733211994171143, |
|
"learning_rate": 1.3061510361333185e-06, |
|
"loss": 12.9671, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.09535990177520848, |
|
"grad_norm": 3.8950541019439697, |
|
"learning_rate": 1.232883974574367e-06, |
|
"loss": 12.7971, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.0955645367575587, |
|
"grad_norm": 4.091780185699463, |
|
"learning_rate": 1.1617059339563807e-06, |
|
"loss": 12.4321, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.09576917173990894, |
|
"grad_norm": 4.282808303833008, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 13.4677, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.09597380672225916, |
|
"grad_norm": 4.246225357055664, |
|
"learning_rate": 1.0256290220474307e-06, |
|
"loss": 12.7789, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.0961784417046094, |
|
"grad_norm": 4.31233549118042, |
|
"learning_rate": 9.607359798384785e-07, |
|
"loss": 12.7813, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.09638307668695964, |
|
"grad_norm": 4.715573787689209, |
|
"learning_rate": 8.979436164848088e-07, |
|
"loss": 13.1816, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.09658771166930986, |
|
"grad_norm": 4.476752281188965, |
|
"learning_rate": 8.372546218022747e-07, |
|
"loss": 13.1693, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.0967923466516601, |
|
"grad_norm": 4.613335132598877, |
|
"learning_rate": 7.786715955054203e-07, |
|
"loss": 12.6692, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.09699698163401034, |
|
"grad_norm": 4.564615249633789, |
|
"learning_rate": 7.221970470961125e-07, |
|
"loss": 12.6351, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.09720161661636056, |
|
"grad_norm": 4.650303840637207, |
|
"learning_rate": 6.678333957560512e-07, |
|
"loss": 13.1352, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.0974062515987108, |
|
"grad_norm": 4.742563724517822, |
|
"learning_rate": 6.15582970243117e-07, |
|
"loss": 12.8478, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.09761088658106103, |
|
"grad_norm": 5.02756929397583, |
|
"learning_rate": 5.654480087916303e-07, |
|
"loss": 12.4434, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.09781552156341126, |
|
"grad_norm": 5.11809778213501, |
|
"learning_rate": 5.174306590164879e-07, |
|
"loss": 12.5754, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.0980201565457615, |
|
"grad_norm": 5.026023864746094, |
|
"learning_rate": 4.715329778211375e-07, |
|
"loss": 12.7589, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.09822479152811173, |
|
"grad_norm": 5.174000263214111, |
|
"learning_rate": 4.277569313094809e-07, |
|
"loss": 12.4781, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.09842942651046196, |
|
"grad_norm": 5.73392391204834, |
|
"learning_rate": 3.8610439470164737e-07, |
|
"loss": 13.2662, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.0986340614928122, |
|
"grad_norm": 5.5650224685668945, |
|
"learning_rate": 3.465771522536854e-07, |
|
"loss": 12.698, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.09883869647516243, |
|
"grad_norm": 5.415269374847412, |
|
"learning_rate": 3.09176897181096e-07, |
|
"loss": 12.4908, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.09904333145751266, |
|
"grad_norm": 5.8570170402526855, |
|
"learning_rate": 2.7390523158633554e-07, |
|
"loss": 12.5447, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.09924796643986289, |
|
"grad_norm": 6.1962361335754395, |
|
"learning_rate": 2.407636663901591e-07, |
|
"loss": 13.4094, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.09945260142221313, |
|
"grad_norm": 6.143553256988525, |
|
"learning_rate": 2.0975362126691712e-07, |
|
"loss": 12.9071, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.09965723640456337, |
|
"grad_norm": 6.178109645843506, |
|
"learning_rate": 1.8087642458373134e-07, |
|
"loss": 12.7015, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.09986187138691359, |
|
"grad_norm": 6.477899551391602, |
|
"learning_rate": 1.5413331334360182e-07, |
|
"loss": 12.9638, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.10006650636926383, |
|
"grad_norm": 6.375460624694824, |
|
"learning_rate": 1.2952543313240472e-07, |
|
"loss": 12.3259, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.10027114135161405, |
|
"grad_norm": 7.033268928527832, |
|
"learning_rate": 1.0705383806982606e-07, |
|
"loss": 12.8778, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.10047577633396429, |
|
"grad_norm": 6.597715377807617, |
|
"learning_rate": 8.671949076420882e-08, |
|
"loss": 12.0536, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.10068041131631453, |
|
"grad_norm": 7.840211868286133, |
|
"learning_rate": 6.852326227130834e-08, |
|
"loss": 13.5376, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.10088504629866475, |
|
"grad_norm": 7.423345565795898, |
|
"learning_rate": 5.246593205699424e-08, |
|
"loss": 12.8476, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.10108968128101499, |
|
"grad_norm": 8.046225547790527, |
|
"learning_rate": 3.8548187963854956e-08, |
|
"loss": 12.5719, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.10129431626336523, |
|
"grad_norm": 8.249234199523926, |
|
"learning_rate": 2.6770626181715773e-08, |
|
"loss": 12.1992, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.10149895124571545, |
|
"grad_norm": 9.472099304199219, |
|
"learning_rate": 1.7133751222137007e-08, |
|
"loss": 12.8579, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.10170358622806569, |
|
"grad_norm": 9.204952239990234, |
|
"learning_rate": 9.637975896759077e-09, |
|
"loss": 12.0369, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.10190822121041591, |
|
"grad_norm": 11.747493743896484, |
|
"learning_rate": 4.2836212996499865e-09, |
|
"loss": 13.0472, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.10211285619276615, |
|
"grad_norm": 12.529196739196777, |
|
"learning_rate": 1.0709167935385455e-09, |
|
"loss": 12.7862, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.10231749117511639, |
|
"grad_norm": 19.44767951965332, |
|
"learning_rate": 0.0, |
|
"loss": 13.4285, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10231749117511639, |
|
"eval_loss": 3.130511999130249, |
|
"eval_runtime": 47.3503, |
|
"eval_samples_per_second": 173.811, |
|
"eval_steps_per_second": 43.463, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9442642165235712.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |