{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.021669254938784355,
  "eval_steps": 25,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002889233991837914,
      "grad_norm": 66.85765075683594,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 26.6527,
      "step": 1
    },
    {
      "epoch": 0.0002889233991837914,
      "eval_loss": 4.310550689697266,
      "eval_runtime": 990.545,
      "eval_samples_per_second": 2.943,
      "eval_steps_per_second": 1.472,
      "step": 1
    },
    {
      "epoch": 0.0005778467983675828,
      "grad_norm": 73.37711334228516,
      "learning_rate": 6.666666666666667e-05,
      "loss": 43.7611,
      "step": 2
    },
    {
      "epoch": 0.0008667701975513742,
      "grad_norm": 65.6069107055664,
      "learning_rate": 0.0001,
      "loss": 26.0857,
      "step": 3
    },
    {
      "epoch": 0.0011556935967351656,
      "grad_norm": 59.228843688964844,
      "learning_rate": 9.99524110790929e-05,
      "loss": 28.3396,
      "step": 4
    },
    {
      "epoch": 0.001444616995918957,
      "grad_norm": 24.407468795776367,
      "learning_rate": 9.980973490458728e-05,
      "loss": 22.8091,
      "step": 5
    },
    {
      "epoch": 0.0017335403951027485,
      "grad_norm": 19.40464973449707,
      "learning_rate": 9.957224306869053e-05,
      "loss": 21.675,
      "step": 6
    },
    {
      "epoch": 0.0020224637942865397,
      "grad_norm": 21.251638412475586,
      "learning_rate": 9.924038765061042e-05,
      "loss": 17.9579,
      "step": 7
    },
    {
      "epoch": 0.002311387193470331,
      "grad_norm": 41.2540168762207,
      "learning_rate": 9.881480035599667e-05,
      "loss": 16.5047,
      "step": 8
    },
    {
      "epoch": 0.0026003105926541226,
      "grad_norm": 22.907991409301758,
      "learning_rate": 9.829629131445342e-05,
      "loss": 15.6627,
      "step": 9
    },
    {
      "epoch": 0.002889233991837914,
      "grad_norm": 37.04414367675781,
      "learning_rate": 9.768584753741134e-05,
      "loss": 17.3563,
      "step": 10
    },
    {
      "epoch": 0.0031781573910217055,
      "grad_norm": 21.000375747680664,
      "learning_rate": 9.698463103929542e-05,
      "loss": 14.7146,
      "step": 11
    },
    {
      "epoch": 0.003467080790205497,
      "grad_norm": 18.32266616821289,
      "learning_rate": 9.619397662556435e-05,
      "loss": 12.4842,
      "step": 12
    },
    {
      "epoch": 0.003756004189389288,
      "grad_norm": 27.443967819213867,
      "learning_rate": 9.53153893518325e-05,
      "loss": 10.4003,
      "step": 13
    },
    {
      "epoch": 0.004044927588573079,
      "grad_norm": 42.203514099121094,
      "learning_rate": 9.435054165891109e-05,
      "loss": 8.0634,
      "step": 14
    },
    {
      "epoch": 0.004333850987756871,
      "grad_norm": 162.54161071777344,
      "learning_rate": 9.330127018922194e-05,
      "loss": 12.063,
      "step": 15
    },
    {
      "epoch": 0.004622774386940662,
      "grad_norm": 36.060081481933594,
      "learning_rate": 9.21695722906443e-05,
      "loss": 8.8787,
      "step": 16
    },
    {
      "epoch": 0.004911697786124454,
      "grad_norm": 52.42499923706055,
      "learning_rate": 9.09576022144496e-05,
      "loss": 8.1511,
      "step": 17
    },
    {
      "epoch": 0.005200621185308245,
      "grad_norm": 49.18842697143555,
      "learning_rate": 8.966766701456177e-05,
      "loss": 6.8164,
      "step": 18
    },
    {
      "epoch": 0.005489544584492036,
      "grad_norm": 28.918075561523438,
      "learning_rate": 8.83022221559489e-05,
      "loss": 7.1671,
      "step": 19
    },
    {
      "epoch": 0.005778467983675828,
      "grad_norm": 18.048503875732422,
      "learning_rate": 8.68638668405062e-05,
      "loss": 8.5257,
      "step": 20
    },
    {
      "epoch": 0.006067391382859619,
      "grad_norm": 18.87720489501953,
      "learning_rate": 8.535533905932738e-05,
      "loss": 5.431,
      "step": 21
    },
    {
      "epoch": 0.006356314782043411,
      "grad_norm": 19.41789436340332,
      "learning_rate": 8.377951038078302e-05,
      "loss": 7.1162,
      "step": 22
    },
    {
      "epoch": 0.006645238181227202,
      "grad_norm": 19.061424255371094,
      "learning_rate": 8.213938048432697e-05,
      "loss": 7.5901,
      "step": 23
    },
    {
      "epoch": 0.006934161580410994,
      "grad_norm": 16.62812614440918,
      "learning_rate": 8.043807145043604e-05,
      "loss": 5.7933,
      "step": 24
    },
    {
      "epoch": 0.007223084979594785,
      "grad_norm": 28.045270919799805,
      "learning_rate": 7.86788218175523e-05,
      "loss": 6.6255,
      "step": 25
    },
    {
      "epoch": 0.007223084979594785,
      "eval_loss": 1.1299744844436646,
      "eval_runtime": 994.88,
      "eval_samples_per_second": 2.93,
      "eval_steps_per_second": 1.466,
      "step": 25
    },
    {
      "epoch": 0.007512008378778576,
      "grad_norm": 18.70418357849121,
      "learning_rate": 7.68649804173412e-05,
      "loss": 8.0266,
      "step": 26
    },
    {
      "epoch": 0.007800931777962368,
      "grad_norm": 29.825096130371094,
      "learning_rate": 7.500000000000001e-05,
      "loss": 9.148,
      "step": 27
    },
    {
      "epoch": 0.008089855177146159,
      "grad_norm": 52.200557708740234,
      "learning_rate": 7.308743066175172e-05,
      "loss": 7.8062,
      "step": 28
    },
    {
      "epoch": 0.00837877857632995,
      "grad_norm": 35.42280197143555,
      "learning_rate": 7.113091308703498e-05,
      "loss": 7.3871,
      "step": 29
    },
    {
      "epoch": 0.008667701975513743,
      "grad_norm": 41.86412048339844,
      "learning_rate": 6.91341716182545e-05,
      "loss": 7.6188,
      "step": 30
    },
    {
      "epoch": 0.008956625374697533,
      "grad_norm": 27.898130416870117,
      "learning_rate": 6.710100716628344e-05,
      "loss": 8.1271,
      "step": 31
    },
    {
      "epoch": 0.009245548773881325,
      "grad_norm": 21.139310836791992,
      "learning_rate": 6.503528997521366e-05,
      "loss": 5.1898,
      "step": 32
    },
    {
      "epoch": 0.009534472173065116,
      "grad_norm": 24.908050537109375,
      "learning_rate": 6.294095225512603e-05,
      "loss": 6.0579,
      "step": 33
    },
    {
      "epoch": 0.009823395572248908,
      "grad_norm": 46.74921798706055,
      "learning_rate": 6.0821980696905146e-05,
      "loss": 7.2563,
      "step": 34
    },
    {
      "epoch": 0.010112318971432699,
      "grad_norm": 156.18777465820312,
      "learning_rate": 5.868240888334653e-05,
      "loss": 11.1945,
      "step": 35
    },
    {
      "epoch": 0.01040124237061649,
      "grad_norm": 38.862464904785156,
      "learning_rate": 5.6526309611002594e-05,
      "loss": 7.6352,
      "step": 36
    },
    {
      "epoch": 0.010690165769800282,
      "grad_norm": 47.18630599975586,
      "learning_rate": 5.435778713738292e-05,
      "loss": 6.7799,
      "step": 37
    },
    {
      "epoch": 0.010979089168984072,
      "grad_norm": 24.2867431640625,
      "learning_rate": 5.218096936826681e-05,
      "loss": 5.5195,
      "step": 38
    },
    {
      "epoch": 0.011268012568167864,
      "grad_norm": 30.565311431884766,
      "learning_rate": 5e-05,
      "loss": 6.0461,
      "step": 39
    },
    {
      "epoch": 0.011556935967351656,
      "grad_norm": 28.035436630249023,
      "learning_rate": 4.781903063173321e-05,
      "loss": 6.4751,
      "step": 40
    },
    {
      "epoch": 0.011845859366535448,
      "grad_norm": 31.523160934448242,
      "learning_rate": 4.564221286261709e-05,
      "loss": 7.083,
      "step": 41
    },
    {
      "epoch": 0.012134782765719238,
      "grad_norm": 33.626914978027344,
      "learning_rate": 4.347369038899744e-05,
      "loss": 6.0292,
      "step": 42
    },
    {
      "epoch": 0.01242370616490303,
      "grad_norm": 46.72255325317383,
      "learning_rate": 4.131759111665349e-05,
      "loss": 7.6237,
      "step": 43
    },
    {
      "epoch": 0.012712629564086822,
      "grad_norm": 33.627716064453125,
      "learning_rate": 3.917801930309486e-05,
      "loss": 7.184,
      "step": 44
    },
    {
      "epoch": 0.013001552963270612,
      "grad_norm": 27.371572494506836,
      "learning_rate": 3.705904774487396e-05,
      "loss": 6.2984,
      "step": 45
    },
    {
      "epoch": 0.013290476362454404,
      "grad_norm": 37.012481689453125,
      "learning_rate": 3.4964710024786354e-05,
      "loss": 5.5282,
      "step": 46
    },
    {
      "epoch": 0.013579399761638196,
      "grad_norm": 25.040252685546875,
      "learning_rate": 3.289899283371657e-05,
      "loss": 5.8425,
      "step": 47
    },
    {
      "epoch": 0.013868323160821988,
      "grad_norm": 53.89478302001953,
      "learning_rate": 3.086582838174551e-05,
      "loss": 9.8582,
      "step": 48
    },
    {
      "epoch": 0.014157246560005778,
      "grad_norm": 74.33063507080078,
      "learning_rate": 2.886908691296504e-05,
      "loss": 10.6698,
      "step": 49
    },
    {
      "epoch": 0.01444616995918957,
      "grad_norm": 55.97947311401367,
      "learning_rate": 2.6912569338248315e-05,
      "loss": 6.7913,
      "step": 50
    },
    {
      "epoch": 0.01444616995918957,
      "eval_loss": 0.7597658038139343,
      "eval_runtime": 996.114,
      "eval_samples_per_second": 2.926,
      "eval_steps_per_second": 1.464,
      "step": 50
    },
    {
      "epoch": 0.014735093358373362,
      "grad_norm": 26.76323699951172,
      "learning_rate": 2.500000000000001e-05,
      "loss": 5.3221,
      "step": 51
    },
    {
      "epoch": 0.015024016757557152,
      "grad_norm": 40.71314239501953,
      "learning_rate": 2.3135019582658802e-05,
      "loss": 7.0028,
      "step": 52
    },
    {
      "epoch": 0.015312940156740944,
      "grad_norm": 18.181428909301758,
      "learning_rate": 2.132117818244771e-05,
      "loss": 5.6947,
      "step": 53
    },
    {
      "epoch": 0.015601863555924736,
      "grad_norm": 23.002670288085938,
      "learning_rate": 1.9561928549563968e-05,
      "loss": 6.155,
      "step": 54
    },
    {
      "epoch": 0.015890786955108527,
      "grad_norm": 18.26033592224121,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 5.6797,
      "step": 55
    },
    {
      "epoch": 0.016179710354292318,
      "grad_norm": 14.501717567443848,
      "learning_rate": 1.622048961921699e-05,
      "loss": 4.742,
      "step": 56
    },
    {
      "epoch": 0.01646863375347611,
      "grad_norm": 17.230289459228516,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 6.2838,
      "step": 57
    },
    {
      "epoch": 0.0167575571526599,
      "grad_norm": 14.124578475952148,
      "learning_rate": 1.3136133159493802e-05,
      "loss": 5.8269,
      "step": 58
    },
    {
      "epoch": 0.01704648055184369,
      "grad_norm": 20.78701400756836,
      "learning_rate": 1.1697777844051105e-05,
      "loss": 6.3414,
      "step": 59
    },
    {
      "epoch": 0.017335403951027485,
      "grad_norm": 13.879270553588867,
      "learning_rate": 1.0332332985438248e-05,
      "loss": 4.1704,
      "step": 60
    },
    {
      "epoch": 0.017624327350211275,
      "grad_norm": 13.16641902923584,
      "learning_rate": 9.042397785550405e-06,
      "loss": 4.4702,
      "step": 61
    },
    {
      "epoch": 0.017913250749395065,
      "grad_norm": 12.741442680358887,
      "learning_rate": 7.830427709355725e-06,
      "loss": 4.0511,
      "step": 62
    },
    {
      "epoch": 0.01820217414857886,
      "grad_norm": 12.697881698608398,
      "learning_rate": 6.698729810778065e-06,
      "loss": 4.0318,
      "step": 63
    },
    {
      "epoch": 0.01849109754776265,
      "grad_norm": 19.755266189575195,
      "learning_rate": 5.649458341088915e-06,
      "loss": 3.4955,
      "step": 64
    },
    {
      "epoch": 0.01878002094694644,
      "grad_norm": 17.082738876342773,
      "learning_rate": 4.684610648167503e-06,
      "loss": 5.3582,
      "step": 65
    },
    {
      "epoch": 0.019068944346130233,
      "grad_norm": 15.043936729431152,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 4.0782,
      "step": 66
    },
    {
      "epoch": 0.019357867745314023,
      "grad_norm": 24.604942321777344,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 4.5255,
      "step": 67
    },
    {
      "epoch": 0.019646791144497817,
      "grad_norm": 13.922004699707031,
      "learning_rate": 2.314152462588659e-06,
      "loss": 4.1978,
      "step": 68
    },
    {
      "epoch": 0.019935714543681607,
      "grad_norm": 13.740044593811035,
      "learning_rate": 1.70370868554659e-06,
      "loss": 4.8924,
      "step": 69
    },
    {
      "epoch": 0.020224637942865397,
      "grad_norm": 12.760802268981934,
      "learning_rate": 1.1851996440033319e-06,
      "loss": 4.1786,
      "step": 70
    },
    {
      "epoch": 0.02051356134204919,
      "grad_norm": 13.630950927734375,
      "learning_rate": 7.596123493895991e-07,
      "loss": 4.6798,
      "step": 71
    },
    {
      "epoch": 0.02080248474123298,
      "grad_norm": 15.914883613586426,
      "learning_rate": 4.277569313094809e-07,
      "loss": 6.1266,
      "step": 72
    },
    {
      "epoch": 0.02109140814041677,
      "grad_norm": 21.27867889404297,
      "learning_rate": 1.9026509541272275e-07,
      "loss": 3.6778,
      "step": 73
    },
    {
      "epoch": 0.021380331539600565,
      "grad_norm": 16.753841400146484,
      "learning_rate": 4.7588920907110094e-08,
      "loss": 5.9576,
      "step": 74
    },
    {
      "epoch": 0.021669254938784355,
      "grad_norm": 15.862313270568848,
      "learning_rate": 0.0,
      "loss": 3.6563,
      "step": 75
    },
    {
      "epoch": 0.021669254938784355,
      "eval_loss": 0.6312903761863708,
      "eval_runtime": 995.9618,
      "eval_samples_per_second": 2.927,
      "eval_steps_per_second": 1.464,
      "step": 75
    }
  ],
  "logging_steps": 1,
  "max_steps": 75,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.86091809144832e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}