|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.0785876993166287,
  "eval_steps": 500,
  "global_step": 138,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005694760820045558,
      "grad_norm": 0.9193857312202454,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 0.9288,
      "step": 1
    },
    {
      "epoch": 0.0011389521640091116,
      "grad_norm": 0.8991456031799316,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 0.9457,
      "step": 2
    },
    {
      "epoch": 0.0017084282460136675,
      "grad_norm": 1.1229982376098633,
      "learning_rate": 8.999999999999999e-05,
      "loss": 0.9705,
      "step": 3
    },
    {
      "epoch": 0.002277904328018223,
      "grad_norm": 1.1453852653503418,
      "learning_rate": 0.00011999999999999999,
      "loss": 0.8237,
      "step": 4
    },
    {
      "epoch": 0.0028473804100227792,
      "grad_norm": 0.6836767196655273,
      "learning_rate": 0.00015,
      "loss": 0.6138,
      "step": 5
    },
    {
      "epoch": 0.003416856492027335,
      "grad_norm": 0.7178201079368591,
      "learning_rate": 0.00017999999999999998,
      "loss": 0.715,
      "step": 6
    },
    {
      "epoch": 0.003986332574031891,
      "grad_norm": 0.6292722225189209,
      "learning_rate": 0.00020999999999999998,
      "loss": 0.8324,
      "step": 7
    },
    {
      "epoch": 0.004555808656036446,
      "grad_norm": 1.0551400184631348,
      "learning_rate": 0.00023999999999999998,
      "loss": 0.7891,
      "step": 8
    },
    {
      "epoch": 0.005125284738041002,
      "grad_norm": 2.0253489017486572,
      "learning_rate": 0.00027,
      "loss": 1.188,
      "step": 9
    },
    {
      "epoch": 0.0056947608200455585,
      "grad_norm": 1.0278229713439941,
      "learning_rate": 0.0003,
      "loss": 0.7582,
      "step": 10
    },
    {
      "epoch": 0.006264236902050114,
      "grad_norm": 1.3169053792953491,
      "learning_rate": 0.0002999548228044306,
      "loss": 0.7924,
      "step": 11
    },
    {
      "epoch": 0.00683371298405467,
      "grad_norm": 0.8358129262924194,
      "learning_rate": 0.00029981931843077583,
      "loss": 0.7108,
      "step": 12
    },
    {
      "epoch": 0.007403189066059226,
      "grad_norm": 1.0881885290145874,
      "learning_rate": 0.0002995935685018035,
      "loss": 0.7324,
      "step": 13
    },
    {
      "epoch": 0.007972665148063782,
      "grad_norm": 1.1146137714385986,
      "learning_rate": 0.00029927770900082954,
      "loss": 1.2375,
      "step": 14
    },
    {
      "epoch": 0.008542141230068337,
      "grad_norm": 0.5502291917800903,
      "learning_rate": 0.0002988719301898065,
      "loss": 0.6301,
      "step": 15
    },
    {
      "epoch": 0.009111617312072893,
      "grad_norm": 0.684746503829956,
      "learning_rate": 0.00029837647649471715,
      "loss": 0.6484,
      "step": 16
    },
    {
      "epoch": 0.00968109339407745,
      "grad_norm": 0.6915183663368225,
      "learning_rate": 0.00029779164635834114,
      "loss": 1.5319,
      "step": 17
    },
    {
      "epoch": 0.010250569476082005,
      "grad_norm": 0.6584042906761169,
      "learning_rate": 0.00029711779206048454,
      "loss": 0.7015,
      "step": 18
    },
    {
      "epoch": 0.01082004555808656,
      "grad_norm": 0.6344502568244934,
      "learning_rate": 0.00029635531950577925,
      "loss": 0.6405,
      "step": 19
    },
    {
      "epoch": 0.011389521640091117,
      "grad_norm": 0.8625004291534424,
      "learning_rate": 0.0002955046879791816,
      "loss": 0.6721,
      "step": 20
    },
    {
      "epoch": 0.011958997722095672,
      "grad_norm": 0.8597177863121033,
      "learning_rate": 0.00029456640986931596,
      "loss": 0.6415,
      "step": 21
    },
    {
      "epoch": 0.012528473804100227,
      "grad_norm": 1.4716626405715942,
      "learning_rate": 0.0002935410503598313,
      "loss": 0.952,
      "step": 22
    },
    {
      "epoch": 0.013097949886104784,
      "grad_norm": 0.5828580260276794,
      "learning_rate": 0.00029242922708895547,
      "loss": 0.5977,
      "step": 23
    },
    {
      "epoch": 0.01366742596810934,
      "grad_norm": 0.6336590647697449,
      "learning_rate": 0.00029123160977745306,
      "loss": 0.8268,
      "step": 24
    },
    {
      "epoch": 0.014236902050113895,
      "grad_norm": 0.6400074362754822,
      "learning_rate": 0.0002899489198252108,
      "loss": 0.7116,
      "step": 25
    },
    {
      "epoch": 0.014806378132118452,
      "grad_norm": 0.914237380027771,
      "learning_rate": 0.000288581929876693,
      "loss": 0.6563,
      "step": 26
    },
    {
      "epoch": 0.015375854214123007,
      "grad_norm": 0.7890664339065552,
      "learning_rate": 0.0002871314633555296,
      "loss": 0.9234,
      "step": 27
    },
    {
      "epoch": 0.015945330296127564,
      "grad_norm": 0.9337290525436401,
      "learning_rate": 0.0002855983939685165,
      "loss": 0.7655,
      "step": 28
    },
    {
      "epoch": 0.01651480637813212,
      "grad_norm": 0.9062933325767517,
      "learning_rate": 0.00028398364517932725,
      "loss": 0.6479,
      "step": 29
    },
    {
      "epoch": 0.017084282460136675,
      "grad_norm": 0.5920599102973938,
      "learning_rate": 0.0002822881896522532,
      "loss": 0.6417,
      "step": 30
    },
    {
      "epoch": 0.01765375854214123,
      "grad_norm": 0.7619308829307556,
      "learning_rate": 0.0002805130486663067,
      "loss": 0.7647,
      "step": 31
    },
    {
      "epoch": 0.018223234624145785,
      "grad_norm": 0.8592938184738159,
      "learning_rate": 0.0002786592915000408,
      "loss": 1.0644,
      "step": 32
    },
    {
      "epoch": 0.01879271070615034,
      "grad_norm": 0.584583044052124,
      "learning_rate": 0.000276728034787456,
      "loss": 0.5707,
      "step": 33
    },
    {
      "epoch": 0.0193621867881549,
      "grad_norm": 0.6947116851806641,
      "learning_rate": 0.0002747204418453818,
      "loss": 0.8087,
      "step": 34
    },
    {
      "epoch": 0.019931662870159454,
      "grad_norm": 0.5154379606246948,
      "learning_rate": 0.0002726377219727375,
      "loss": 0.6937,
      "step": 35
    },
    {
      "epoch": 0.02050113895216401,
      "grad_norm": 0.6641525626182556,
      "learning_rate": 0.0002704811297220967,
      "loss": 0.7324,
      "step": 36
    },
    {
      "epoch": 0.021070615034168565,
      "grad_norm": 0.776289701461792,
      "learning_rate": 0.00026825196414399094,
      "loss": 0.6164,
      "step": 37
    },
    {
      "epoch": 0.02164009111617312,
      "grad_norm": 0.9698323607444763,
      "learning_rate": 0.0002659515680044105,
      "loss": 0.5876,
      "step": 38
    },
    {
      "epoch": 0.022209567198177675,
      "grad_norm": 0.9234256744384766,
      "learning_rate": 0.00026358132697597265,
      "loss": 1.6872,
      "step": 39
    },
    {
      "epoch": 0.022779043280182234,
      "grad_norm": 0.8341031670570374,
      "learning_rate": 0.00026114266880324387,
      "loss": 0.7148,
      "step": 40
    },
    {
      "epoch": 0.02334851936218679,
      "grad_norm": 0.7260201573371887,
      "learning_rate": 0.00025863706244272003,
      "loss": 0.6195,
      "step": 41
    },
    {
      "epoch": 0.023917995444191344,
      "grad_norm": 0.5485382080078125,
      "learning_rate": 0.00025606601717798207,
      "loss": 0.6015,
      "step": 42
    },
    {
      "epoch": 0.0244874715261959,
      "grad_norm": 0.83427494764328,
      "learning_rate": 0.00025343108171056,
      "loss": 0.7354,
      "step": 43
    },
    {
      "epoch": 0.025056947608200455,
      "grad_norm": 0.7569791674613953,
      "learning_rate": 0.00025073384322705274,
      "loss": 0.7379,
      "step": 44
    },
    {
      "epoch": 0.02562642369020501,
      "grad_norm": 0.7086009383201599,
      "learning_rate": 0.00024797592644306646,
      "loss": 0.8229,
      "step": 45
    },
    {
      "epoch": 0.02619589977220957,
      "grad_norm": 0.6626051068305969,
      "learning_rate": 0.0002451589926245468,
      "loss": 0.7937,
      "step": 46
    },
    {
      "epoch": 0.026765375854214124,
      "grad_norm": 1.0067198276519775,
      "learning_rate": 0.000242284738587094,
      "loss": 0.94,
      "step": 47
    },
    {
      "epoch": 0.02733485193621868,
      "grad_norm": 0.6433237791061401,
      "learning_rate": 0.000239354895673865,
      "loss": 0.9452,
      "step": 48
    },
    {
      "epoch": 0.027904328018223234,
      "grad_norm": 0.6352181434631348,
      "learning_rate": 0.00023637122871267679,
      "loss": 0.6538,
      "step": 49
    },
    {
      "epoch": 0.02847380410022779,
      "grad_norm": 0.7389889359474182,
      "learning_rate": 0.0002333355349529403,
      "loss": 0.989,
      "step": 50
    },
    {
      "epoch": 0.029043280182232345,
      "grad_norm": 0.7596966028213501,
      "learning_rate": 0.00023024964298306458,
      "loss": 0.6397,
      "step": 51
    },
    {
      "epoch": 0.029612756264236904,
      "grad_norm": 0.6516755223274231,
      "learning_rate": 0.00022711541162898321,
      "loss": 0.5003,
      "step": 52
    },
    {
      "epoch": 0.03018223234624146,
      "grad_norm": 0.8451756238937378,
      "learning_rate": 0.0002239347288344676,
      "loss": 0.738,
      "step": 53
    },
    {
      "epoch": 0.030751708428246014,
      "grad_norm": 0.7065162062644958,
      "learning_rate": 0.00022070951052389966,
      "loss": 0.7718,
      "step": 54
    },
    {
      "epoch": 0.03132118451025057,
      "grad_norm": 0.9125147461891174,
      "learning_rate": 0.00021744169944819098,
      "loss": 0.5715,
      "step": 55
    },
    {
      "epoch": 0.03189066059225513,
      "grad_norm": 0.5039160847663879,
      "learning_rate": 0.0002141332640145423,
      "loss": 0.5745,
      "step": 56
    },
    {
      "epoch": 0.03246013667425968,
      "grad_norm": 0.5523383617401123,
      "learning_rate": 0.00021078619710074845,
      "loss": 0.9072,
      "step": 57
    },
    {
      "epoch": 0.03302961275626424,
      "grad_norm": 0.7476831674575806,
      "learning_rate": 0.00020740251485476345,
      "loss": 0.4698,
      "step": 58
    },
    {
      "epoch": 0.033599088838268794,
      "grad_norm": 0.6698426604270935,
      "learning_rate": 0.00020398425548024822,
      "loss": 0.6769,
      "step": 59
    },
    {
      "epoch": 0.03416856492027335,
      "grad_norm": 0.7437167167663574,
      "learning_rate": 0.00020053347800883298,
      "loss": 0.6624,
      "step": 60
    },
    {
      "epoch": 0.034738041002277904,
      "grad_norm": 0.5557659268379211,
      "learning_rate": 0.00019705226105983374,
      "loss": 0.7612,
      "step": 61
    },
    {
      "epoch": 0.03530751708428246,
      "grad_norm": 0.5920267701148987,
      "learning_rate": 0.0001935427015881693,
      "loss": 0.6359,
      "step": 62
    },
    {
      "epoch": 0.035876993166287015,
      "grad_norm": 0.5547272562980652,
      "learning_rate": 0.00019000691362123473,
      "loss": 0.5502,
      "step": 63
    },
    {
      "epoch": 0.03644646924829157,
      "grad_norm": 0.6265895366668701,
      "learning_rate": 0.0001864470269854896,
      "loss": 0.8296,
      "step": 64
    },
    {
      "epoch": 0.037015945330296125,
      "grad_norm": 0.5109124779701233,
      "learning_rate": 0.00018286518602353045,
      "loss": 0.6811,
      "step": 65
    },
    {
      "epoch": 0.03758542141230068,
      "grad_norm": 0.733314573764801,
      "learning_rate": 0.00017926354830241924,
      "loss": 0.9034,
      "step": 66
    },
    {
      "epoch": 0.038154897494305236,
      "grad_norm": 0.5381625294685364,
      "learning_rate": 0.00017564428331404519,
      "loss": 0.6674,
      "step": 67
    },
    {
      "epoch": 0.0387243735763098,
      "grad_norm": 0.6308789849281311,
      "learning_rate": 0.00017200957116830423,
      "loss": 0.6398,
      "step": 68
    },
    {
      "epoch": 0.03929384965831435,
      "grad_norm": 0.6676629185676575,
      "learning_rate": 0.00016836160127988242,
      "loss": 0.57,
      "step": 69
    },
    {
      "epoch": 0.03986332574031891,
      "grad_norm": 0.72255539894104,
      "learning_rate": 0.0001647025710494341,
      "loss": 0.6134,
      "step": 70
    },
    {
      "epoch": 0.040432801822323464,
      "grad_norm": 1.10958731174469,
      "learning_rate": 0.00016103468453995012,
      "loss": 0.9271,
      "step": 71
    },
    {
      "epoch": 0.04100227790432802,
      "grad_norm": 0.5311421751976013,
      "learning_rate": 0.0001573601511491127,
      "loss": 0.6661,
      "step": 72
    },
    {
      "epoch": 0.041571753986332574,
      "grad_norm": 0.8206638097763062,
      "learning_rate": 0.00015368118427843682,
      "loss": 0.8327,
      "step": 73
    },
    {
      "epoch": 0.04214123006833713,
      "grad_norm": 0.5865733027458191,
      "learning_rate": 0.00015,
      "loss": 0.5316,
      "step": 74
    },
    {
      "epoch": 0.042710706150341685,
      "grad_norm": 0.674359917640686,
      "learning_rate": 0.00014631881572156315,
      "loss": 1.232,
      "step": 75
    },
    {
      "epoch": 0.04328018223234624,
      "grad_norm": 0.6954506039619446,
      "learning_rate": 0.0001426398488508873,
      "loss": 0.6289,
      "step": 76
    },
    {
      "epoch": 0.043849658314350795,
      "grad_norm": 0.6149243116378784,
      "learning_rate": 0.00013896531546004988,
      "loss": 0.6659,
      "step": 77
    },
    {
      "epoch": 0.04441913439635535,
      "grad_norm": 0.6586117148399353,
      "learning_rate": 0.0001352974289505659,
      "loss": 0.9493,
      "step": 78
    },
    {
      "epoch": 0.044988610478359906,
      "grad_norm": 0.6128969192504883,
      "learning_rate": 0.00013163839872011758,
      "loss": 1.0356,
      "step": 79
    },
    {
      "epoch": 0.04555808656036447,
      "grad_norm": 0.6175865530967712,
      "learning_rate": 0.00012799042883169574,
      "loss": 0.7532,
      "step": 80
    },
    {
      "epoch": 0.04612756264236902,
      "grad_norm": 0.7808921933174133,
      "learning_rate": 0.0001243557166859548,
      "loss": 1.015,
      "step": 81
    },
    {
      "epoch": 0.04669703872437358,
      "grad_norm": 1.351828932762146,
      "learning_rate": 0.00012073645169758076,
      "loss": 0.9374,
      "step": 82
    },
    {
      "epoch": 0.04726651480637813,
      "grad_norm": 0.598646879196167,
      "learning_rate": 0.00011713481397646953,
      "loss": 0.5562,
      "step": 83
    },
    {
      "epoch": 0.04783599088838269,
      "grad_norm": 0.7442788481712341,
      "learning_rate": 0.00011355297301451042,
      "loss": 0.75,
      "step": 84
    },
    {
      "epoch": 0.048405466970387244,
      "grad_norm": 0.5332076549530029,
      "learning_rate": 0.00010999308637876524,
      "loss": 0.6766,
      "step": 85
    },
    {
      "epoch": 0.0489749430523918,
      "grad_norm": 1.0476224422454834,
      "learning_rate": 0.00010645729841183066,
      "loss": 0.6271,
      "step": 86
    },
    {
      "epoch": 0.049544419134396354,
      "grad_norm": 0.8156277537345886,
      "learning_rate": 0.00010294773894016627,
      "loss": 0.8984,
      "step": 87
    },
    {
      "epoch": 0.05011389521640091,
      "grad_norm": 0.8451378345489502,
      "learning_rate": 9.946652199116699e-05,
      "loss": 1.0814,
      "step": 88
    },
    {
      "epoch": 0.050683371298405465,
      "grad_norm": 0.6506671905517578,
      "learning_rate": 9.601574451975175e-05,
      "loss": 0.5343,
      "step": 89
    },
    {
      "epoch": 0.05125284738041002,
      "grad_norm": 1.0723323822021484,
      "learning_rate": 9.259748514523653e-05,
      "loss": 1.1407,
      "step": 90
    },
    {
      "epoch": 0.051822323462414575,
      "grad_norm": 0.6675905585289001,
      "learning_rate": 8.921380289925153e-05,
      "loss": 0.8981,
      "step": 91
    },
    {
      "epoch": 0.05239179954441914,
      "grad_norm": 0.851328432559967,
      "learning_rate": 8.586673598545771e-05,
      "loss": 0.7855,
      "step": 92
    },
    {
      "epoch": 0.05296127562642369,
      "grad_norm": 0.5953764915466309,
      "learning_rate": 8.255830055180899e-05,
      "loss": 0.6019,
      "step": 93
    },
    {
      "epoch": 0.05353075170842825,
      "grad_norm": 0.6898136138916016,
      "learning_rate": 7.929048947610034e-05,
      "loss": 0.6316,
      "step": 94
    },
    {
      "epoch": 0.0541002277904328,
      "grad_norm": 0.766689658164978,
      "learning_rate": 7.606527116553241e-05,
      "loss": 0.7684,
      "step": 95
    },
    {
      "epoch": 0.05466970387243736,
      "grad_norm": 0.9130173325538635,
      "learning_rate": 7.288458837101675e-05,
      "loss": 1.0119,
      "step": 96
    },
    {
      "epoch": 0.055239179954441914,
      "grad_norm": 0.7758641242980957,
      "learning_rate": 6.975035701693544e-05,
      "loss": 0.7098,
      "step": 97
    },
    {
      "epoch": 0.05580865603644647,
      "grad_norm": 0.5639253258705139,
      "learning_rate": 6.66644650470597e-05,
      "loss": 0.5637,
      "step": 98
    },
    {
      "epoch": 0.056378132118451024,
      "grad_norm": 0.9065825939178467,
      "learning_rate": 6.362877128732319e-05,
      "loss": 1.149,
      "step": 99
    },
    {
      "epoch": 0.05694760820045558,
      "grad_norm": 0.75728839635849,
      "learning_rate": 6.064510432613499e-05,
      "loss": 0.4102,
      "step": 100
    },
    {
      "epoch": 0.057517084282460135,
      "grad_norm": 0.7174970507621765,
      "learning_rate": 5.771526141290599e-05,
      "loss": 0.7149,
      "step": 101
    },
    {
      "epoch": 0.05808656036446469,
      "grad_norm": 0.5997675657272339,
      "learning_rate": 5.4841007375453186e-05,
      "loss": 0.4369,
      "step": 102
    },
    {
      "epoch": 0.058656036446469245,
      "grad_norm": 0.6755107641220093,
      "learning_rate": 5.2024073556933516e-05,
      "loss": 1.361,
      "step": 103
    },
    {
      "epoch": 0.05922551252847381,
      "grad_norm": 0.8155584931373596,
      "learning_rate": 4.926615677294723e-05,
      "loss": 0.6092,
      "step": 104
    },
    {
      "epoch": 0.05979498861047836,
      "grad_norm": 0.7561736702919006,
      "learning_rate": 4.656891828943996e-05,
      "loss": 0.8085,
      "step": 105
    },
    {
      "epoch": 0.06036446469248292,
      "grad_norm": 0.6462244391441345,
      "learning_rate": 4.3933982822017876e-05,
      "loss": 0.661,
      "step": 106
    },
    {
      "epoch": 0.06093394077448747,
      "grad_norm": 0.8051128387451172,
      "learning_rate": 4.136293755727998e-05,
      "loss": 0.7713,
      "step": 107
    },
    {
      "epoch": 0.06150341685649203,
      "grad_norm": 1.8678494691848755,
      "learning_rate": 3.885733119675616e-05,
      "loss": 1.0606,
      "step": 108
    },
    {
      "epoch": 0.062072892938496584,
      "grad_norm": 0.5828897953033447,
      "learning_rate": 3.641867302402731e-05,
      "loss": 0.5834,
      "step": 109
    },
    {
      "epoch": 0.06264236902050115,
      "grad_norm": 0.4921259582042694,
      "learning_rate": 3.404843199558945e-05,
      "loss": 0.6211,
      "step": 110
    },
    {
      "epoch": 0.0632118451025057,
      "grad_norm": 0.7523202896118164,
      "learning_rate": 3.174803585600906e-05,
      "loss": 0.5977,
      "step": 111
    },
    {
      "epoch": 0.06378132118451026,
      "grad_norm": 0.618629515171051,
      "learning_rate": 2.9518870277903274e-05,
      "loss": 0.5802,
      "step": 112
    },
    {
      "epoch": 0.06435079726651481,
      "grad_norm": 0.633359968662262,
      "learning_rate": 2.7362278027262457e-05,
      "loss": 0.8338,
      "step": 113
    },
    {
      "epoch": 0.06492027334851937,
      "grad_norm": 0.5951647758483887,
      "learning_rate": 2.5279558154618197e-05,
      "loss": 0.5764,
      "step": 114
    },
    {
      "epoch": 0.06548974943052392,
      "grad_norm": 0.5431082248687744,
      "learning_rate": 2.3271965212543932e-05,
      "loss": 0.6116,
      "step": 115
    },
    {
      "epoch": 0.06605922551252848,
      "grad_norm": 0.6911126971244812,
      "learning_rate": 2.1340708499959197e-05,
      "loss": 0.8577,
      "step": 116
    },
    {
      "epoch": 0.06662870159453303,
      "grad_norm": 0.7030333280563354,
      "learning_rate": 1.9486951333693296e-05,
      "loss": 0.7916,
      "step": 117
    },
    {
      "epoch": 0.06719817767653759,
      "grad_norm": 0.6063715815544128,
      "learning_rate": 1.7711810347746757e-05,
      "loss": 0.6928,
      "step": 118
    },
    {
      "epoch": 0.06776765375854214,
      "grad_norm": 0.6492345333099365,
      "learning_rate": 1.6016354820672715e-05,
      "loss": 0.6717,
      "step": 119
    },
    {
      "epoch": 0.0683371298405467,
      "grad_norm": 0.658710777759552,
      "learning_rate": 1.4401606031483497e-05,
      "loss": 1.0441,
      "step": 120
    },
    {
      "epoch": 0.06890660592255125,
      "grad_norm": 0.6208887696266174,
      "learning_rate": 1.2868536644470396e-05,
      "loss": 0.793,
      "step": 121
    },
    {
      "epoch": 0.06947608200455581,
      "grad_norm": 0.520664393901825,
      "learning_rate": 1.1418070123306989e-05,
      "loss": 0.5236,
      "step": 122
    },
    {
      "epoch": 0.07004555808656036,
      "grad_norm": 0.5397936701774597,
      "learning_rate": 1.0051080174789172e-05,
      "loss": 0.6599,
      "step": 123
    },
    {
      "epoch": 0.07061503416856492,
      "grad_norm": 0.6907640695571899,
      "learning_rate": 8.768390222546895e-06,
      "loss": 0.7875,
      "step": 124
    },
    {
      "epoch": 0.07118451025056947,
      "grad_norm": 0.573017418384552,
      "learning_rate": 7.570772911044498e-06,
      "loss": 0.5655,
      "step": 125
    },
    {
      "epoch": 0.07175398633257403,
      "grad_norm": 1.2410931587219238,
      "learning_rate": 6.458949640168675e-06,
      "loss": 0.6824,
      "step": 126
    },
    {
      "epoch": 0.07232346241457858,
      "grad_norm": 0.692986786365509,
      "learning_rate": 5.4335901306840235e-06,
      "loss": 0.6636,
      "step": 127
    },
    {
      "epoch": 0.07289293849658314,
      "grad_norm": 0.5771859288215637,
      "learning_rate": 4.495312020818403e-06,
      "loss": 0.9473,
      "step": 128
    },
    {
      "epoch": 0.0734624145785877,
      "grad_norm": 0.8466888666152954,
      "learning_rate": 3.6446804942207306e-06,
      "loss": 0.7754,
      "step": 129
    },
    {
      "epoch": 0.07403189066059225,
      "grad_norm": 0.5004900097846985,
      "learning_rate": 2.882207939515435e-06,
      "loss": 0.7227,
      "step": 130
    },
    {
      "epoch": 0.0746013667425968,
      "grad_norm": 0.8404062390327454,
      "learning_rate": 2.2083536416588165e-06,
      "loss": 0.5737,
      "step": 131
    },
    {
      "epoch": 0.07517084282460136,
      "grad_norm": 0.46463948488235474,
      "learning_rate": 1.6235235052828476e-06,
      "loss": 0.6784,
      "step": 132
    },
    {
      "epoch": 0.07574031890660592,
      "grad_norm": 0.7300965785980225,
      "learning_rate": 1.128069810193505e-06,
      "loss": 0.987,
      "step": 133
    },
    {
      "epoch": 0.07630979498861047,
      "grad_norm": 0.9501856565475464,
      "learning_rate": 7.222909991704773e-07,
      "loss": 0.5392,
      "step": 134
    },
    {
      "epoch": 0.07687927107061504,
      "grad_norm": 0.6093735694885254,
      "learning_rate": 4.064314981964689e-07,
      "loss": 0.801,
      "step": 135
    },
    {
      "epoch": 0.0774487471526196,
      "grad_norm": 0.8983132839202881,
      "learning_rate": 1.8068156922413924e-07,
      "loss": 0.7463,
      "step": 136
    },
    {
      "epoch": 0.07801822323462415,
      "grad_norm": 0.7183220982551575,
      "learning_rate": 4.51771955693625e-08,
      "loss": 0.6387,
      "step": 137
    },
    {
      "epoch": 0.0785876993166287,
      "grad_norm": 0.6416277885437012,
      "learning_rate": 0.0,
      "loss": 1.1154,
      "step": 138
    }
  ],
  "logging_steps": 1,
  "max_steps": 138,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.7728037223071744e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|