{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0785876993166287,
"eval_steps": 500,
"global_step": 138,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005694760820045558,
"grad_norm": 0.9193857312202454,
"learning_rate": 2.9999999999999997e-05,
"loss": 0.9288,
"step": 1
},
{
"epoch": 0.0011389521640091116,
"grad_norm": 0.8991456031799316,
"learning_rate": 5.9999999999999995e-05,
"loss": 0.9457,
"step": 2
},
{
"epoch": 0.0017084282460136675,
"grad_norm": 1.1229982376098633,
"learning_rate": 8.999999999999999e-05,
"loss": 0.9705,
"step": 3
},
{
"epoch": 0.002277904328018223,
"grad_norm": 1.1453852653503418,
"learning_rate": 0.00011999999999999999,
"loss": 0.8237,
"step": 4
},
{
"epoch": 0.0028473804100227792,
"grad_norm": 0.6836767196655273,
"learning_rate": 0.00015,
"loss": 0.6138,
"step": 5
},
{
"epoch": 0.003416856492027335,
"grad_norm": 0.7178201079368591,
"learning_rate": 0.00017999999999999998,
"loss": 0.715,
"step": 6
},
{
"epoch": 0.003986332574031891,
"grad_norm": 0.6292722225189209,
"learning_rate": 0.00020999999999999998,
"loss": 0.8324,
"step": 7
},
{
"epoch": 0.004555808656036446,
"grad_norm": 1.0551400184631348,
"learning_rate": 0.00023999999999999998,
"loss": 0.7891,
"step": 8
},
{
"epoch": 0.005125284738041002,
"grad_norm": 2.0253489017486572,
"learning_rate": 0.00027,
"loss": 1.188,
"step": 9
},
{
"epoch": 0.0056947608200455585,
"grad_norm": 1.0278229713439941,
"learning_rate": 0.0003,
"loss": 0.7582,
"step": 10
},
{
"epoch": 0.006264236902050114,
"grad_norm": 1.3169053792953491,
"learning_rate": 0.0002999548228044306,
"loss": 0.7924,
"step": 11
},
{
"epoch": 0.00683371298405467,
"grad_norm": 0.8358129262924194,
"learning_rate": 0.00029981931843077583,
"loss": 0.7108,
"step": 12
},
{
"epoch": 0.007403189066059226,
"grad_norm": 1.0881885290145874,
"learning_rate": 0.0002995935685018035,
"loss": 0.7324,
"step": 13
},
{
"epoch": 0.007972665148063782,
"grad_norm": 1.1146137714385986,
"learning_rate": 0.00029927770900082954,
"loss": 1.2375,
"step": 14
},
{
"epoch": 0.008542141230068337,
"grad_norm": 0.5502291917800903,
"learning_rate": 0.0002988719301898065,
"loss": 0.6301,
"step": 15
},
{
"epoch": 0.009111617312072893,
"grad_norm": 0.684746503829956,
"learning_rate": 0.00029837647649471715,
"loss": 0.6484,
"step": 16
},
{
"epoch": 0.00968109339407745,
"grad_norm": 0.6915183663368225,
"learning_rate": 0.00029779164635834114,
"loss": 1.5319,
"step": 17
},
{
"epoch": 0.010250569476082005,
"grad_norm": 0.6584042906761169,
"learning_rate": 0.00029711779206048454,
"loss": 0.7015,
"step": 18
},
{
"epoch": 0.01082004555808656,
"grad_norm": 0.6344502568244934,
"learning_rate": 0.00029635531950577925,
"loss": 0.6405,
"step": 19
},
{
"epoch": 0.011389521640091117,
"grad_norm": 0.8625004291534424,
"learning_rate": 0.0002955046879791816,
"loss": 0.6721,
"step": 20
},
{
"epoch": 0.011958997722095672,
"grad_norm": 0.8597177863121033,
"learning_rate": 0.00029456640986931596,
"loss": 0.6415,
"step": 21
},
{
"epoch": 0.012528473804100227,
"grad_norm": 1.4716626405715942,
"learning_rate": 0.0002935410503598313,
"loss": 0.952,
"step": 22
},
{
"epoch": 0.013097949886104784,
"grad_norm": 0.5828580260276794,
"learning_rate": 0.00029242922708895547,
"loss": 0.5977,
"step": 23
},
{
"epoch": 0.01366742596810934,
"grad_norm": 0.6336590647697449,
"learning_rate": 0.00029123160977745306,
"loss": 0.8268,
"step": 24
},
{
"epoch": 0.014236902050113895,
"grad_norm": 0.6400074362754822,
"learning_rate": 0.0002899489198252108,
"loss": 0.7116,
"step": 25
},
{
"epoch": 0.014806378132118452,
"grad_norm": 0.914237380027771,
"learning_rate": 0.000288581929876693,
"loss": 0.6563,
"step": 26
},
{
"epoch": 0.015375854214123007,
"grad_norm": 0.7890664339065552,
"learning_rate": 0.0002871314633555296,
"loss": 0.9234,
"step": 27
},
{
"epoch": 0.015945330296127564,
"grad_norm": 0.9337290525436401,
"learning_rate": 0.0002855983939685165,
"loss": 0.7655,
"step": 28
},
{
"epoch": 0.01651480637813212,
"grad_norm": 0.9062933325767517,
"learning_rate": 0.00028398364517932725,
"loss": 0.6479,
"step": 29
},
{
"epoch": 0.017084282460136675,
"grad_norm": 0.5920599102973938,
"learning_rate": 0.0002822881896522532,
"loss": 0.6417,
"step": 30
},
{
"epoch": 0.01765375854214123,
"grad_norm": 0.7619308829307556,
"learning_rate": 0.0002805130486663067,
"loss": 0.7647,
"step": 31
},
{
"epoch": 0.018223234624145785,
"grad_norm": 0.8592938184738159,
"learning_rate": 0.0002786592915000408,
"loss": 1.0644,
"step": 32
},
{
"epoch": 0.01879271070615034,
"grad_norm": 0.584583044052124,
"learning_rate": 0.000276728034787456,
"loss": 0.5707,
"step": 33
},
{
"epoch": 0.0193621867881549,
"grad_norm": 0.6947116851806641,
"learning_rate": 0.0002747204418453818,
"loss": 0.8087,
"step": 34
},
{
"epoch": 0.019931662870159454,
"grad_norm": 0.5154379606246948,
"learning_rate": 0.0002726377219727375,
"loss": 0.6937,
"step": 35
},
{
"epoch": 0.02050113895216401,
"grad_norm": 0.6641525626182556,
"learning_rate": 0.0002704811297220967,
"loss": 0.7324,
"step": 36
},
{
"epoch": 0.021070615034168565,
"grad_norm": 0.776289701461792,
"learning_rate": 0.00026825196414399094,
"loss": 0.6164,
"step": 37
},
{
"epoch": 0.02164009111617312,
"grad_norm": 0.9698323607444763,
"learning_rate": 0.0002659515680044105,
"loss": 0.5876,
"step": 38
},
{
"epoch": 0.022209567198177675,
"grad_norm": 0.9234256744384766,
"learning_rate": 0.00026358132697597265,
"loss": 1.6872,
"step": 39
},
{
"epoch": 0.022779043280182234,
"grad_norm": 0.8341031670570374,
"learning_rate": 0.00026114266880324387,
"loss": 0.7148,
"step": 40
},
{
"epoch": 0.02334851936218679,
"grad_norm": 0.7260201573371887,
"learning_rate": 0.00025863706244272003,
"loss": 0.6195,
"step": 41
},
{
"epoch": 0.023917995444191344,
"grad_norm": 0.5485382080078125,
"learning_rate": 0.00025606601717798207,
"loss": 0.6015,
"step": 42
},
{
"epoch": 0.0244874715261959,
"grad_norm": 0.83427494764328,
"learning_rate": 0.00025343108171056,
"loss": 0.7354,
"step": 43
},
{
"epoch": 0.025056947608200455,
"grad_norm": 0.7569791674613953,
"learning_rate": 0.00025073384322705274,
"loss": 0.7379,
"step": 44
},
{
"epoch": 0.02562642369020501,
"grad_norm": 0.7086009383201599,
"learning_rate": 0.00024797592644306646,
"loss": 0.8229,
"step": 45
},
{
"epoch": 0.02619589977220957,
"grad_norm": 0.6626051068305969,
"learning_rate": 0.0002451589926245468,
"loss": 0.7937,
"step": 46
},
{
"epoch": 0.026765375854214124,
"grad_norm": 1.0067198276519775,
"learning_rate": 0.000242284738587094,
"loss": 0.94,
"step": 47
},
{
"epoch": 0.02733485193621868,
"grad_norm": 0.6433237791061401,
"learning_rate": 0.000239354895673865,
"loss": 0.9452,
"step": 48
},
{
"epoch": 0.027904328018223234,
"grad_norm": 0.6352181434631348,
"learning_rate": 0.00023637122871267679,
"loss": 0.6538,
"step": 49
},
{
"epoch": 0.02847380410022779,
"grad_norm": 0.7389889359474182,
"learning_rate": 0.0002333355349529403,
"loss": 0.989,
"step": 50
},
{
"epoch": 0.029043280182232345,
"grad_norm": 0.7596966028213501,
"learning_rate": 0.00023024964298306458,
"loss": 0.6397,
"step": 51
},
{
"epoch": 0.029612756264236904,
"grad_norm": 0.6516755223274231,
"learning_rate": 0.00022711541162898321,
"loss": 0.5003,
"step": 52
},
{
"epoch": 0.03018223234624146,
"grad_norm": 0.8451756238937378,
"learning_rate": 0.0002239347288344676,
"loss": 0.738,
"step": 53
},
{
"epoch": 0.030751708428246014,
"grad_norm": 0.7065162062644958,
"learning_rate": 0.00022070951052389966,
"loss": 0.7718,
"step": 54
},
{
"epoch": 0.03132118451025057,
"grad_norm": 0.9125147461891174,
"learning_rate": 0.00021744169944819098,
"loss": 0.5715,
"step": 55
},
{
"epoch": 0.03189066059225513,
"grad_norm": 0.5039160847663879,
"learning_rate": 0.0002141332640145423,
"loss": 0.5745,
"step": 56
},
{
"epoch": 0.03246013667425968,
"grad_norm": 0.5523383617401123,
"learning_rate": 0.00021078619710074845,
"loss": 0.9072,
"step": 57
},
{
"epoch": 0.03302961275626424,
"grad_norm": 0.7476831674575806,
"learning_rate": 0.00020740251485476345,
"loss": 0.4698,
"step": 58
},
{
"epoch": 0.033599088838268794,
"grad_norm": 0.6698426604270935,
"learning_rate": 0.00020398425548024822,
"loss": 0.6769,
"step": 59
},
{
"epoch": 0.03416856492027335,
"grad_norm": 0.7437167167663574,
"learning_rate": 0.00020053347800883298,
"loss": 0.6624,
"step": 60
},
{
"epoch": 0.034738041002277904,
"grad_norm": 0.5557659268379211,
"learning_rate": 0.00019705226105983374,
"loss": 0.7612,
"step": 61
},
{
"epoch": 0.03530751708428246,
"grad_norm": 0.5920267701148987,
"learning_rate": 0.0001935427015881693,
"loss": 0.6359,
"step": 62
},
{
"epoch": 0.035876993166287015,
"grad_norm": 0.5547272562980652,
"learning_rate": 0.00019000691362123473,
"loss": 0.5502,
"step": 63
},
{
"epoch": 0.03644646924829157,
"grad_norm": 0.6265895366668701,
"learning_rate": 0.0001864470269854896,
"loss": 0.8296,
"step": 64
},
{
"epoch": 0.037015945330296125,
"grad_norm": 0.5109124779701233,
"learning_rate": 0.00018286518602353045,
"loss": 0.6811,
"step": 65
},
{
"epoch": 0.03758542141230068,
"grad_norm": 0.733314573764801,
"learning_rate": 0.00017926354830241924,
"loss": 0.9034,
"step": 66
},
{
"epoch": 0.038154897494305236,
"grad_norm": 0.5381625294685364,
"learning_rate": 0.00017564428331404519,
"loss": 0.6674,
"step": 67
},
{
"epoch": 0.0387243735763098,
"grad_norm": 0.6308789849281311,
"learning_rate": 0.00017200957116830423,
"loss": 0.6398,
"step": 68
},
{
"epoch": 0.03929384965831435,
"grad_norm": 0.6676629185676575,
"learning_rate": 0.00016836160127988242,
"loss": 0.57,
"step": 69
},
{
"epoch": 0.03986332574031891,
"grad_norm": 0.72255539894104,
"learning_rate": 0.0001647025710494341,
"loss": 0.6134,
"step": 70
},
{
"epoch": 0.040432801822323464,
"grad_norm": 1.10958731174469,
"learning_rate": 0.00016103468453995012,
"loss": 0.9271,
"step": 71
},
{
"epoch": 0.04100227790432802,
"grad_norm": 0.5311421751976013,
"learning_rate": 0.0001573601511491127,
"loss": 0.6661,
"step": 72
},
{
"epoch": 0.041571753986332574,
"grad_norm": 0.8206638097763062,
"learning_rate": 0.00015368118427843682,
"loss": 0.8327,
"step": 73
},
{
"epoch": 0.04214123006833713,
"grad_norm": 0.5865733027458191,
"learning_rate": 0.00015,
"loss": 0.5316,
"step": 74
},
{
"epoch": 0.042710706150341685,
"grad_norm": 0.674359917640686,
"learning_rate": 0.00014631881572156315,
"loss": 1.232,
"step": 75
},
{
"epoch": 0.04328018223234624,
"grad_norm": 0.6954506039619446,
"learning_rate": 0.0001426398488508873,
"loss": 0.6289,
"step": 76
},
{
"epoch": 0.043849658314350795,
"grad_norm": 0.6149243116378784,
"learning_rate": 0.00013896531546004988,
"loss": 0.6659,
"step": 77
},
{
"epoch": 0.04441913439635535,
"grad_norm": 0.6586117148399353,
"learning_rate": 0.0001352974289505659,
"loss": 0.9493,
"step": 78
},
{
"epoch": 0.044988610478359906,
"grad_norm": 0.6128969192504883,
"learning_rate": 0.00013163839872011758,
"loss": 1.0356,
"step": 79
},
{
"epoch": 0.04555808656036447,
"grad_norm": 0.6175865530967712,
"learning_rate": 0.00012799042883169574,
"loss": 0.7532,
"step": 80
},
{
"epoch": 0.04612756264236902,
"grad_norm": 0.7808921933174133,
"learning_rate": 0.0001243557166859548,
"loss": 1.015,
"step": 81
},
{
"epoch": 0.04669703872437358,
"grad_norm": 1.351828932762146,
"learning_rate": 0.00012073645169758076,
"loss": 0.9374,
"step": 82
},
{
"epoch": 0.04726651480637813,
"grad_norm": 0.598646879196167,
"learning_rate": 0.00011713481397646953,
"loss": 0.5562,
"step": 83
},
{
"epoch": 0.04783599088838269,
"grad_norm": 0.7442788481712341,
"learning_rate": 0.00011355297301451042,
"loss": 0.75,
"step": 84
},
{
"epoch": 0.048405466970387244,
"grad_norm": 0.5332076549530029,
"learning_rate": 0.00010999308637876524,
"loss": 0.6766,
"step": 85
},
{
"epoch": 0.0489749430523918,
"grad_norm": 1.0476224422454834,
"learning_rate": 0.00010645729841183066,
"loss": 0.6271,
"step": 86
},
{
"epoch": 0.049544419134396354,
"grad_norm": 0.8156277537345886,
"learning_rate": 0.00010294773894016627,
"loss": 0.8984,
"step": 87
},
{
"epoch": 0.05011389521640091,
"grad_norm": 0.8451378345489502,
"learning_rate": 9.946652199116699e-05,
"loss": 1.0814,
"step": 88
},
{
"epoch": 0.050683371298405465,
"grad_norm": 0.6506671905517578,
"learning_rate": 9.601574451975175e-05,
"loss": 0.5343,
"step": 89
},
{
"epoch": 0.05125284738041002,
"grad_norm": 1.0723323822021484,
"learning_rate": 9.259748514523653e-05,
"loss": 1.1407,
"step": 90
},
{
"epoch": 0.051822323462414575,
"grad_norm": 0.6675905585289001,
"learning_rate": 8.921380289925153e-05,
"loss": 0.8981,
"step": 91
},
{
"epoch": 0.05239179954441914,
"grad_norm": 0.851328432559967,
"learning_rate": 8.586673598545771e-05,
"loss": 0.7855,
"step": 92
},
{
"epoch": 0.05296127562642369,
"grad_norm": 0.5953764915466309,
"learning_rate": 8.255830055180899e-05,
"loss": 0.6019,
"step": 93
},
{
"epoch": 0.05353075170842825,
"grad_norm": 0.6898136138916016,
"learning_rate": 7.929048947610034e-05,
"loss": 0.6316,
"step": 94
},
{
"epoch": 0.0541002277904328,
"grad_norm": 0.766689658164978,
"learning_rate": 7.606527116553241e-05,
"loss": 0.7684,
"step": 95
},
{
"epoch": 0.05466970387243736,
"grad_norm": 0.9130173325538635,
"learning_rate": 7.288458837101675e-05,
"loss": 1.0119,
"step": 96
},
{
"epoch": 0.055239179954441914,
"grad_norm": 0.7758641242980957,
"learning_rate": 6.975035701693544e-05,
"loss": 0.7098,
"step": 97
},
{
"epoch": 0.05580865603644647,
"grad_norm": 0.5639253258705139,
"learning_rate": 6.66644650470597e-05,
"loss": 0.5637,
"step": 98
},
{
"epoch": 0.056378132118451024,
"grad_norm": 0.9065825939178467,
"learning_rate": 6.362877128732319e-05,
"loss": 1.149,
"step": 99
},
{
"epoch": 0.05694760820045558,
"grad_norm": 0.75728839635849,
"learning_rate": 6.064510432613499e-05,
"loss": 0.4102,
"step": 100
},
{
"epoch": 0.057517084282460135,
"grad_norm": 0.7174970507621765,
"learning_rate": 5.771526141290599e-05,
"loss": 0.7149,
"step": 101
},
{
"epoch": 0.05808656036446469,
"grad_norm": 0.5997675657272339,
"learning_rate": 5.4841007375453186e-05,
"loss": 0.4369,
"step": 102
},
{
"epoch": 0.058656036446469245,
"grad_norm": 0.6755107641220093,
"learning_rate": 5.2024073556933516e-05,
"loss": 1.361,
"step": 103
},
{
"epoch": 0.05922551252847381,
"grad_norm": 0.8155584931373596,
"learning_rate": 4.926615677294723e-05,
"loss": 0.6092,
"step": 104
},
{
"epoch": 0.05979498861047836,
"grad_norm": 0.7561736702919006,
"learning_rate": 4.656891828943996e-05,
"loss": 0.8085,
"step": 105
},
{
"epoch": 0.06036446469248292,
"grad_norm": 0.6462244391441345,
"learning_rate": 4.3933982822017876e-05,
"loss": 0.661,
"step": 106
},
{
"epoch": 0.06093394077448747,
"grad_norm": 0.8051128387451172,
"learning_rate": 4.136293755727998e-05,
"loss": 0.7713,
"step": 107
},
{
"epoch": 0.06150341685649203,
"grad_norm": 1.8678494691848755,
"learning_rate": 3.885733119675616e-05,
"loss": 1.0606,
"step": 108
},
{
"epoch": 0.062072892938496584,
"grad_norm": 0.5828897953033447,
"learning_rate": 3.641867302402731e-05,
"loss": 0.5834,
"step": 109
},
{
"epoch": 0.06264236902050115,
"grad_norm": 0.4921259582042694,
"learning_rate": 3.404843199558945e-05,
"loss": 0.6211,
"step": 110
},
{
"epoch": 0.0632118451025057,
"grad_norm": 0.7523202896118164,
"learning_rate": 3.174803585600906e-05,
"loss": 0.5977,
"step": 111
},
{
"epoch": 0.06378132118451026,
"grad_norm": 0.618629515171051,
"learning_rate": 2.9518870277903274e-05,
"loss": 0.5802,
"step": 112
},
{
"epoch": 0.06435079726651481,
"grad_norm": 0.633359968662262,
"learning_rate": 2.7362278027262457e-05,
"loss": 0.8338,
"step": 113
},
{
"epoch": 0.06492027334851937,
"grad_norm": 0.5951647758483887,
"learning_rate": 2.5279558154618197e-05,
"loss": 0.5764,
"step": 114
},
{
"epoch": 0.06548974943052392,
"grad_norm": 0.5431082248687744,
"learning_rate": 2.3271965212543932e-05,
"loss": 0.6116,
"step": 115
},
{
"epoch": 0.06605922551252848,
"grad_norm": 0.6911126971244812,
"learning_rate": 2.1340708499959197e-05,
"loss": 0.8577,
"step": 116
},
{
"epoch": 0.06662870159453303,
"grad_norm": 0.7030333280563354,
"learning_rate": 1.9486951333693296e-05,
"loss": 0.7916,
"step": 117
},
{
"epoch": 0.06719817767653759,
"grad_norm": 0.6063715815544128,
"learning_rate": 1.7711810347746757e-05,
"loss": 0.6928,
"step": 118
},
{
"epoch": 0.06776765375854214,
"grad_norm": 0.6492345333099365,
"learning_rate": 1.6016354820672715e-05,
"loss": 0.6717,
"step": 119
},
{
"epoch": 0.0683371298405467,
"grad_norm": 0.658710777759552,
"learning_rate": 1.4401606031483497e-05,
"loss": 1.0441,
"step": 120
},
{
"epoch": 0.06890660592255125,
"grad_norm": 0.6208887696266174,
"learning_rate": 1.2868536644470396e-05,
"loss": 0.793,
"step": 121
},
{
"epoch": 0.06947608200455581,
"grad_norm": 0.520664393901825,
"learning_rate": 1.1418070123306989e-05,
"loss": 0.5236,
"step": 122
},
{
"epoch": 0.07004555808656036,
"grad_norm": 0.5397936701774597,
"learning_rate": 1.0051080174789172e-05,
"loss": 0.6599,
"step": 123
},
{
"epoch": 0.07061503416856492,
"grad_norm": 0.6907640695571899,
"learning_rate": 8.768390222546895e-06,
"loss": 0.7875,
"step": 124
},
{
"epoch": 0.07118451025056947,
"grad_norm": 0.573017418384552,
"learning_rate": 7.570772911044498e-06,
"loss": 0.5655,
"step": 125
},
{
"epoch": 0.07175398633257403,
"grad_norm": 1.2410931587219238,
"learning_rate": 6.458949640168675e-06,
"loss": 0.6824,
"step": 126
},
{
"epoch": 0.07232346241457858,
"grad_norm": 0.692986786365509,
"learning_rate": 5.4335901306840235e-06,
"loss": 0.6636,
"step": 127
},
{
"epoch": 0.07289293849658314,
"grad_norm": 0.5771859288215637,
"learning_rate": 4.495312020818403e-06,
"loss": 0.9473,
"step": 128
},
{
"epoch": 0.0734624145785877,
"grad_norm": 0.8466888666152954,
"learning_rate": 3.6446804942207306e-06,
"loss": 0.7754,
"step": 129
},
{
"epoch": 0.07403189066059225,
"grad_norm": 0.5004900097846985,
"learning_rate": 2.882207939515435e-06,
"loss": 0.7227,
"step": 130
},
{
"epoch": 0.0746013667425968,
"grad_norm": 0.8404062390327454,
"learning_rate": 2.2083536416588165e-06,
"loss": 0.5737,
"step": 131
},
{
"epoch": 0.07517084282460136,
"grad_norm": 0.46463948488235474,
"learning_rate": 1.6235235052828476e-06,
"loss": 0.6784,
"step": 132
},
{
"epoch": 0.07574031890660592,
"grad_norm": 0.7300965785980225,
"learning_rate": 1.128069810193505e-06,
"loss": 0.987,
"step": 133
},
{
"epoch": 0.07630979498861047,
"grad_norm": 0.9501856565475464,
"learning_rate": 7.222909991704773e-07,
"loss": 0.5392,
"step": 134
},
{
"epoch": 0.07687927107061504,
"grad_norm": 0.6093735694885254,
"learning_rate": 4.064314981964689e-07,
"loss": 0.801,
"step": 135
},
{
"epoch": 0.0774487471526196,
"grad_norm": 0.8983132839202881,
"learning_rate": 1.8068156922413924e-07,
"loss": 0.7463,
"step": 136
},
{
"epoch": 0.07801822323462415,
"grad_norm": 0.7183220982551575,
"learning_rate": 4.51771955693625e-08,
"loss": 0.6387,
"step": 137
},
{
"epoch": 0.0785876993166287,
"grad_norm": 0.6416277885437012,
"learning_rate": 0.0,
"loss": 1.1154,
"step": 138
}
],
"logging_steps": 1,
"max_steps": 138,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7728037223071744e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}