{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0785876993166287, "eval_steps": 500, "global_step": 138, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005694760820045558, "grad_norm": 0.9193857312202454, "learning_rate": 2.9999999999999997e-05, "loss": 0.9288, "step": 1 }, { "epoch": 0.0011389521640091116, "grad_norm": 0.8991456031799316, "learning_rate": 5.9999999999999995e-05, "loss": 0.9457, "step": 2 }, { "epoch": 0.0017084282460136675, "grad_norm": 1.1229982376098633, "learning_rate": 8.999999999999999e-05, "loss": 0.9705, "step": 3 }, { "epoch": 0.002277904328018223, "grad_norm": 1.1453852653503418, "learning_rate": 0.00011999999999999999, "loss": 0.8237, "step": 4 }, { "epoch": 0.0028473804100227792, "grad_norm": 0.6836767196655273, "learning_rate": 0.00015, "loss": 0.6138, "step": 5 }, { "epoch": 0.003416856492027335, "grad_norm": 0.7178201079368591, "learning_rate": 0.00017999999999999998, "loss": 0.715, "step": 6 }, { "epoch": 0.003986332574031891, "grad_norm": 0.6292722225189209, "learning_rate": 0.00020999999999999998, "loss": 0.8324, "step": 7 }, { "epoch": 0.004555808656036446, "grad_norm": 1.0551400184631348, "learning_rate": 0.00023999999999999998, "loss": 0.7891, "step": 8 }, { "epoch": 0.005125284738041002, "grad_norm": 2.0253489017486572, "learning_rate": 0.00027, "loss": 1.188, "step": 9 }, { "epoch": 0.0056947608200455585, "grad_norm": 1.0278229713439941, "learning_rate": 0.0003, "loss": 0.7582, "step": 10 }, { "epoch": 0.006264236902050114, "grad_norm": 1.3169053792953491, "learning_rate": 0.0002999548228044306, "loss": 0.7924, "step": 11 }, { "epoch": 0.00683371298405467, "grad_norm": 0.8358129262924194, "learning_rate": 0.00029981931843077583, "loss": 0.7108, "step": 12 }, { "epoch": 0.007403189066059226, "grad_norm": 1.0881885290145874, "learning_rate": 0.0002995935685018035, "loss": 0.7324, "step": 13 }, { "epoch": 0.007972665148063782, "grad_norm": 1.1146137714385986, "learning_rate": 0.00029927770900082954, "loss": 1.2375, "step": 14 }, { "epoch": 0.008542141230068337, "grad_norm": 0.5502291917800903, "learning_rate": 0.0002988719301898065, "loss": 0.6301, "step": 15 }, { "epoch": 0.009111617312072893, "grad_norm": 0.684746503829956, "learning_rate": 0.00029837647649471715, "loss": 0.6484, "step": 16 }, { "epoch": 0.00968109339407745, "grad_norm": 0.6915183663368225, "learning_rate": 0.00029779164635834114, "loss": 1.5319, "step": 17 }, { "epoch": 0.010250569476082005, "grad_norm": 0.6584042906761169, "learning_rate": 0.00029711779206048454, "loss": 0.7015, "step": 18 }, { "epoch": 0.01082004555808656, "grad_norm": 0.6344502568244934, "learning_rate": 0.00029635531950577925, "loss": 0.6405, "step": 19 }, { "epoch": 0.011389521640091117, "grad_norm": 0.8625004291534424, "learning_rate": 0.0002955046879791816, "loss": 0.6721, "step": 20 }, { "epoch": 0.011958997722095672, "grad_norm": 0.8597177863121033, "learning_rate": 0.00029456640986931596, "loss": 0.6415, "step": 21 }, { "epoch": 0.012528473804100227, "grad_norm": 1.4716626405715942, "learning_rate": 0.0002935410503598313, "loss": 0.952, "step": 22 }, { "epoch": 0.013097949886104784, "grad_norm": 0.5828580260276794, "learning_rate": 0.00029242922708895547, "loss": 0.5977, "step": 23 }, { "epoch": 0.01366742596810934, "grad_norm": 0.6336590647697449, "learning_rate": 0.00029123160977745306, "loss": 0.8268, "step": 24 }, { "epoch": 0.014236902050113895, "grad_norm": 0.6400074362754822, "learning_rate": 0.0002899489198252108, "loss": 0.7116, "step": 25 }, { "epoch": 0.014806378132118452, "grad_norm": 0.914237380027771, "learning_rate": 0.000288581929876693, "loss": 0.6563, "step": 26 }, { "epoch": 0.015375854214123007, "grad_norm": 0.7890664339065552, "learning_rate": 0.0002871314633555296, "loss": 0.9234, "step": 27 }, { "epoch": 0.015945330296127564, "grad_norm": 0.9337290525436401, "learning_rate": 0.0002855983939685165, "loss": 0.7655, "step": 28 }, { "epoch": 0.01651480637813212, "grad_norm": 0.9062933325767517, "learning_rate": 0.00028398364517932725, "loss": 0.6479, "step": 29 }, { "epoch": 0.017084282460136675, "grad_norm": 0.5920599102973938, "learning_rate": 0.0002822881896522532, "loss": 0.6417, "step": 30 }, { "epoch": 0.01765375854214123, "grad_norm": 0.7619308829307556, "learning_rate": 0.0002805130486663067, "loss": 0.7647, "step": 31 }, { "epoch": 0.018223234624145785, "grad_norm": 0.8592938184738159, "learning_rate": 0.0002786592915000408, "loss": 1.0644, "step": 32 }, { "epoch": 0.01879271070615034, "grad_norm": 0.584583044052124, "learning_rate": 0.000276728034787456, "loss": 0.5707, "step": 33 }, { "epoch": 0.0193621867881549, "grad_norm": 0.6947116851806641, "learning_rate": 0.0002747204418453818, "loss": 0.8087, "step": 34 }, { "epoch": 0.019931662870159454, "grad_norm": 0.5154379606246948, "learning_rate": 0.0002726377219727375, "loss": 0.6937, "step": 35 }, { "epoch": 0.02050113895216401, "grad_norm": 0.6641525626182556, "learning_rate": 0.0002704811297220967, "loss": 0.7324, "step": 36 }, { "epoch": 0.021070615034168565, "grad_norm": 0.776289701461792, "learning_rate": 0.00026825196414399094, "loss": 0.6164, "step": 37 }, { "epoch": 0.02164009111617312, "grad_norm": 0.9698323607444763, "learning_rate": 0.0002659515680044105, "loss": 0.5876, "step": 38 }, { "epoch": 0.022209567198177675, "grad_norm": 0.9234256744384766, "learning_rate": 0.00026358132697597265, "loss": 1.6872, "step": 39 }, { "epoch": 0.022779043280182234, "grad_norm": 0.8341031670570374, "learning_rate": 0.00026114266880324387, "loss": 0.7148, "step": 40 }, { "epoch": 0.02334851936218679, "grad_norm": 0.7260201573371887, "learning_rate": 0.00025863706244272003, "loss": 0.6195, "step": 41 }, { "epoch": 0.023917995444191344, "grad_norm": 0.5485382080078125, "learning_rate": 0.00025606601717798207, "loss": 0.6015, "step": 42 }, { "epoch": 0.0244874715261959, "grad_norm": 0.83427494764328, "learning_rate": 0.00025343108171056, "loss": 0.7354, "step": 43 }, { "epoch": 0.025056947608200455, "grad_norm": 0.7569791674613953, "learning_rate": 0.00025073384322705274, "loss": 0.7379, "step": 44 }, { "epoch": 0.02562642369020501, "grad_norm": 0.7086009383201599, "learning_rate": 0.00024797592644306646, "loss": 0.8229, "step": 45 }, { "epoch": 0.02619589977220957, "grad_norm": 0.6626051068305969, "learning_rate": 0.0002451589926245468, "loss": 0.7937, "step": 46 }, { "epoch": 0.026765375854214124, "grad_norm": 1.0067198276519775, "learning_rate": 0.000242284738587094, "loss": 0.94, "step": 47 }, { "epoch": 0.02733485193621868, "grad_norm": 0.6433237791061401, "learning_rate": 0.000239354895673865, "loss": 0.9452, "step": 48 }, { "epoch": 0.027904328018223234, "grad_norm": 0.6352181434631348, "learning_rate": 0.00023637122871267679, "loss": 0.6538, "step": 49 }, { "epoch": 0.02847380410022779, "grad_norm": 0.7389889359474182, "learning_rate": 0.0002333355349529403, "loss": 0.989, "step": 50 }, { "epoch": 0.029043280182232345, "grad_norm": 0.7596966028213501, "learning_rate": 0.00023024964298306458, "loss": 0.6397, "step": 51 }, { "epoch": 0.029612756264236904, "grad_norm": 0.6516755223274231, "learning_rate": 0.00022711541162898321, "loss": 0.5003, "step": 52 }, { "epoch": 0.03018223234624146, "grad_norm": 0.8451756238937378, "learning_rate": 0.0002239347288344676, "loss": 0.738, "step": 53 }, { "epoch": 0.030751708428246014, "grad_norm": 0.7065162062644958, "learning_rate": 0.00022070951052389966, "loss": 0.7718, "step": 54 }, { "epoch": 0.03132118451025057, "grad_norm": 0.9125147461891174, "learning_rate": 0.00021744169944819098, "loss": 0.5715, "step": 55 }, { "epoch": 0.03189066059225513, "grad_norm": 0.5039160847663879, "learning_rate": 0.0002141332640145423, "loss": 0.5745, "step": 56 }, { "epoch": 0.03246013667425968, "grad_norm": 0.5523383617401123, "learning_rate": 0.00021078619710074845, "loss": 0.9072, "step": 57 }, { "epoch": 0.03302961275626424, "grad_norm": 0.7476831674575806, "learning_rate": 0.00020740251485476345, "loss": 0.4698, "step": 58 }, { "epoch": 0.033599088838268794, "grad_norm": 0.6698426604270935, "learning_rate": 0.00020398425548024822, "loss": 0.6769, "step": 59 }, { "epoch": 0.03416856492027335, "grad_norm": 0.7437167167663574, "learning_rate": 0.00020053347800883298, "loss": 0.6624, "step": 60 }, { "epoch": 0.034738041002277904, "grad_norm": 0.5557659268379211, "learning_rate": 0.00019705226105983374, "loss": 0.7612, "step": 61 }, { "epoch": 0.03530751708428246, "grad_norm": 0.5920267701148987, "learning_rate": 0.0001935427015881693, "loss": 0.6359, "step": 62 }, { "epoch": 0.035876993166287015, "grad_norm": 0.5547272562980652, "learning_rate": 0.00019000691362123473, "loss": 0.5502, "step": 63 }, { "epoch": 0.03644646924829157, "grad_norm": 0.6265895366668701, "learning_rate": 0.0001864470269854896, "loss": 0.8296, "step": 64 }, { "epoch": 0.037015945330296125, "grad_norm": 0.5109124779701233, "learning_rate": 0.00018286518602353045, "loss": 0.6811, "step": 65 }, { "epoch": 0.03758542141230068, "grad_norm": 0.733314573764801, "learning_rate": 0.00017926354830241924, "loss": 0.9034, "step": 66 }, { "epoch": 0.038154897494305236, "grad_norm": 0.5381625294685364, "learning_rate": 0.00017564428331404519, "loss": 0.6674, "step": 67 }, { "epoch": 0.0387243735763098, "grad_norm": 0.6308789849281311, "learning_rate": 0.00017200957116830423, "loss": 0.6398, "step": 68 }, { "epoch": 0.03929384965831435, "grad_norm": 0.6676629185676575, "learning_rate": 0.00016836160127988242, "loss": 0.57, "step": 69 }, { "epoch": 0.03986332574031891, "grad_norm": 0.72255539894104, "learning_rate": 0.0001647025710494341, "loss": 0.6134, "step": 70 }, { "epoch": 0.040432801822323464, "grad_norm": 1.10958731174469, "learning_rate": 0.00016103468453995012, "loss": 0.9271, "step": 71 }, { "epoch": 0.04100227790432802, "grad_norm": 0.5311421751976013, "learning_rate": 0.0001573601511491127, "loss": 0.6661, "step": 72 }, { "epoch": 0.041571753986332574, "grad_norm": 0.8206638097763062, "learning_rate": 0.00015368118427843682, "loss": 0.8327, "step": 73 }, { "epoch": 0.04214123006833713, "grad_norm": 0.5865733027458191, "learning_rate": 0.00015, "loss": 0.5316, "step": 74 }, { "epoch": 0.042710706150341685, "grad_norm": 0.674359917640686, "learning_rate": 0.00014631881572156315, "loss": 1.232, "step": 75 }, { "epoch": 0.04328018223234624, "grad_norm": 0.6954506039619446, "learning_rate": 0.0001426398488508873, "loss": 0.6289, "step": 76 }, { "epoch": 0.043849658314350795, "grad_norm": 0.6149243116378784, "learning_rate": 0.00013896531546004988, "loss": 0.6659, "step": 77 }, { "epoch": 0.04441913439635535, "grad_norm": 0.6586117148399353, "learning_rate": 0.0001352974289505659, "loss": 0.9493, "step": 78 }, { "epoch": 0.044988610478359906, "grad_norm": 0.6128969192504883, "learning_rate": 0.00013163839872011758, "loss": 1.0356, "step": 79 }, { "epoch": 0.04555808656036447, "grad_norm": 0.6175865530967712, "learning_rate": 0.00012799042883169574, "loss": 0.7532, "step": 80 }, { "epoch": 0.04612756264236902, "grad_norm": 0.7808921933174133, "learning_rate": 0.0001243557166859548, "loss": 1.015, "step": 81 }, { "epoch": 0.04669703872437358, "grad_norm": 1.351828932762146, "learning_rate": 0.00012073645169758076, "loss": 0.9374, "step": 82 }, { "epoch": 0.04726651480637813, "grad_norm": 0.598646879196167, "learning_rate": 0.00011713481397646953, "loss": 0.5562, "step": 83 }, { "epoch": 0.04783599088838269, "grad_norm": 0.7442788481712341, "learning_rate": 0.00011355297301451042, "loss": 0.75, "step": 84 }, { "epoch": 0.048405466970387244, "grad_norm": 0.5332076549530029, "learning_rate": 0.00010999308637876524, "loss": 0.6766, "step": 85 }, { "epoch": 0.0489749430523918, "grad_norm": 1.0476224422454834, "learning_rate": 0.00010645729841183066, "loss": 0.6271, "step": 86 }, { "epoch": 0.049544419134396354, "grad_norm": 0.8156277537345886, "learning_rate": 0.00010294773894016627, "loss": 0.8984, "step": 87 }, { "epoch": 0.05011389521640091, "grad_norm": 0.8451378345489502, "learning_rate": 9.946652199116699e-05, "loss": 1.0814, "step": 88 }, { "epoch": 0.050683371298405465, "grad_norm": 0.6506671905517578, "learning_rate": 9.601574451975175e-05, "loss": 0.5343, "step": 89 }, { "epoch": 0.05125284738041002, "grad_norm": 1.0723323822021484, "learning_rate": 9.259748514523653e-05, "loss": 1.1407, "step": 90 }, { "epoch": 0.051822323462414575, "grad_norm": 0.6675905585289001, "learning_rate": 8.921380289925153e-05, "loss": 0.8981, "step": 91 }, { "epoch": 0.05239179954441914, "grad_norm": 0.851328432559967, "learning_rate": 8.586673598545771e-05, "loss": 0.7855, "step": 92 }, { "epoch": 0.05296127562642369, "grad_norm": 0.5953764915466309, "learning_rate": 8.255830055180899e-05, "loss": 0.6019, "step": 93 }, { "epoch": 0.05353075170842825, "grad_norm": 0.6898136138916016, "learning_rate": 7.929048947610034e-05, "loss": 0.6316, "step": 94 }, { "epoch": 0.0541002277904328, "grad_norm": 0.766689658164978, "learning_rate": 7.606527116553241e-05, "loss": 0.7684, "step": 95 }, { "epoch": 0.05466970387243736, "grad_norm": 0.9130173325538635, "learning_rate": 7.288458837101675e-05, "loss": 1.0119, "step": 96 }, { "epoch": 0.055239179954441914, "grad_norm": 0.7758641242980957, "learning_rate": 6.975035701693544e-05, "loss": 0.7098, "step": 97 }, { "epoch": 0.05580865603644647, "grad_norm": 0.5639253258705139, "learning_rate": 6.66644650470597e-05, "loss": 0.5637, "step": 98 }, { "epoch": 0.056378132118451024, "grad_norm": 0.9065825939178467, "learning_rate": 6.362877128732319e-05, "loss": 1.149, "step": 99 }, { "epoch": 0.05694760820045558, "grad_norm": 0.75728839635849, "learning_rate": 6.064510432613499e-05, "loss": 0.4102, "step": 100 }, { "epoch": 0.057517084282460135, "grad_norm": 0.7174970507621765, "learning_rate": 5.771526141290599e-05, "loss": 0.7149, "step": 101 }, { "epoch": 0.05808656036446469, "grad_norm": 0.5997675657272339, "learning_rate": 5.4841007375453186e-05, "loss": 0.4369, "step": 102 }, { "epoch": 0.058656036446469245, "grad_norm": 0.6755107641220093, "learning_rate": 5.2024073556933516e-05, "loss": 1.361, "step": 103 }, { "epoch": 0.05922551252847381, "grad_norm": 0.8155584931373596, "learning_rate": 4.926615677294723e-05, "loss": 0.6092, "step": 104 }, { "epoch": 0.05979498861047836, "grad_norm": 0.7561736702919006, "learning_rate": 4.656891828943996e-05, "loss": 0.8085, "step": 105 }, { "epoch": 0.06036446469248292, "grad_norm": 0.6462244391441345, "learning_rate": 4.3933982822017876e-05, "loss": 0.661, "step": 106 }, { "epoch": 0.06093394077448747, "grad_norm": 0.8051128387451172, "learning_rate": 4.136293755727998e-05, "loss": 0.7713, "step": 107 }, { "epoch": 0.06150341685649203, "grad_norm": 1.8678494691848755, "learning_rate": 3.885733119675616e-05, "loss": 1.0606, "step": 108 }, { "epoch": 0.062072892938496584, "grad_norm": 0.5828897953033447, "learning_rate": 3.641867302402731e-05, "loss": 0.5834, "step": 109 }, { "epoch": 0.06264236902050115, "grad_norm": 0.4921259582042694, "learning_rate": 3.404843199558945e-05, "loss": 0.6211, "step": 110 }, { "epoch": 0.0632118451025057, "grad_norm": 0.7523202896118164, "learning_rate": 3.174803585600906e-05, "loss": 0.5977, "step": 111 }, { "epoch": 0.06378132118451026, "grad_norm": 0.618629515171051, "learning_rate": 2.9518870277903274e-05, "loss": 0.5802, "step": 112 }, { "epoch": 0.06435079726651481, "grad_norm": 0.633359968662262, "learning_rate": 2.7362278027262457e-05, "loss": 0.8338, "step": 113 }, { "epoch": 0.06492027334851937, "grad_norm": 0.5951647758483887, "learning_rate": 2.5279558154618197e-05, "loss": 0.5764, "step": 114 }, { "epoch": 0.06548974943052392, "grad_norm": 0.5431082248687744, "learning_rate": 2.3271965212543932e-05, "loss": 0.6116, "step": 115 }, { "epoch": 0.06605922551252848, "grad_norm": 0.6911126971244812, "learning_rate": 2.1340708499959197e-05, "loss": 0.8577, "step": 116 }, { "epoch": 0.06662870159453303, "grad_norm": 0.7030333280563354, "learning_rate": 1.9486951333693296e-05, "loss": 0.7916, "step": 117 }, { "epoch": 0.06719817767653759, "grad_norm": 0.6063715815544128, "learning_rate": 1.7711810347746757e-05, "loss": 0.6928, "step": 118 }, { "epoch": 0.06776765375854214, "grad_norm": 0.6492345333099365, "learning_rate": 1.6016354820672715e-05, "loss": 0.6717, "step": 119 }, { "epoch": 0.0683371298405467, "grad_norm": 0.658710777759552, "learning_rate": 1.4401606031483497e-05, "loss": 1.0441, "step": 120 }, { "epoch": 0.06890660592255125, "grad_norm": 0.6208887696266174, "learning_rate": 1.2868536644470396e-05, "loss": 0.793, "step": 121 }, { "epoch": 0.06947608200455581, "grad_norm": 0.520664393901825, "learning_rate": 1.1418070123306989e-05, "loss": 0.5236, "step": 122 }, { "epoch": 0.07004555808656036, "grad_norm": 0.5397936701774597, "learning_rate": 1.0051080174789172e-05, "loss": 0.6599, "step": 123 }, { "epoch": 0.07061503416856492, "grad_norm": 0.6907640695571899, "learning_rate": 8.768390222546895e-06, "loss": 0.7875, "step": 124 }, { "epoch": 0.07118451025056947, "grad_norm": 0.573017418384552, "learning_rate": 7.570772911044498e-06, "loss": 0.5655, "step": 125 }, { "epoch": 0.07175398633257403, "grad_norm": 1.2410931587219238, "learning_rate": 6.458949640168675e-06, "loss": 0.6824, "step": 126 }, { "epoch": 0.07232346241457858, "grad_norm": 0.692986786365509, "learning_rate": 5.4335901306840235e-06, "loss": 0.6636, "step": 127 }, { "epoch": 0.07289293849658314, "grad_norm": 0.5771859288215637, "learning_rate": 4.495312020818403e-06, "loss": 0.9473, "step": 128 }, { "epoch": 0.0734624145785877, "grad_norm": 0.8466888666152954, "learning_rate": 3.6446804942207306e-06, "loss": 0.7754, "step": 129 }, { "epoch": 0.07403189066059225, "grad_norm": 0.5004900097846985, "learning_rate": 2.882207939515435e-06, "loss": 0.7227, "step": 130 }, { "epoch": 0.0746013667425968, "grad_norm": 0.8404062390327454, "learning_rate": 2.2083536416588165e-06, "loss": 0.5737, "step": 131 }, { "epoch": 0.07517084282460136, "grad_norm": 0.46463948488235474, "learning_rate": 1.6235235052828476e-06, "loss": 0.6784, "step": 132 }, { "epoch": 0.07574031890660592, "grad_norm": 0.7300965785980225, "learning_rate": 1.128069810193505e-06, "loss": 0.987, "step": 133 }, { "epoch": 0.07630979498861047, "grad_norm": 0.9501856565475464, "learning_rate": 7.222909991704773e-07, "loss": 0.5392, "step": 134 }, { "epoch": 0.07687927107061504, "grad_norm": 0.6093735694885254, "learning_rate": 4.064314981964689e-07, "loss": 0.801, "step": 135 }, { "epoch": 0.0774487471526196, "grad_norm": 0.8983132839202881, "learning_rate": 1.8068156922413924e-07, "loss": 0.7463, "step": 136 }, { "epoch": 0.07801822323462415, "grad_norm": 0.7183220982551575, "learning_rate": 4.51771955693625e-08, "loss": 0.6387, "step": 137 }, { "epoch": 0.0785876993166287, "grad_norm": 0.6416277885437012, "learning_rate": 0.0, "loss": 1.1154, "step": 138 } ], "logging_steps": 1, "max_steps": 138, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7728037223071744e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }