{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0013449899125757,
"eval_steps": 93,
"global_step": 372,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026899798251513113,
"grad_norm": 1.8751003742218018,
"learning_rate": 2e-05,
"loss": 2.0835,
"step": 1
},
{
"epoch": 0.005379959650302623,
"grad_norm": 2.948612928390503,
"learning_rate": 4e-05,
"loss": 2.7117,
"step": 2
},
{
"epoch": 0.008069939475453935,
"grad_norm": 2.65693998336792,
"learning_rate": 6e-05,
"loss": 2.2257,
"step": 3
},
{
"epoch": 0.010759919300605245,
"grad_norm": 3.0478532314300537,
"learning_rate": 8e-05,
"loss": 2.5187,
"step": 4
},
{
"epoch": 0.013449899125756557,
"grad_norm": 5.911397457122803,
"learning_rate": 0.0001,
"loss": 2.3052,
"step": 5
},
{
"epoch": 0.01613987895090787,
"grad_norm": 6.985674858093262,
"learning_rate": 0.00012,
"loss": 2.2497,
"step": 6
},
{
"epoch": 0.01882985877605918,
"grad_norm": 12.322802543640137,
"learning_rate": 0.00014,
"loss": 2.3409,
"step": 7
},
{
"epoch": 0.02151983860121049,
"grad_norm": 5.827511787414551,
"learning_rate": 0.00016,
"loss": 2.4239,
"step": 8
},
{
"epoch": 0.0242098184263618,
"grad_norm": 4.404603481292725,
"learning_rate": 0.00018,
"loss": 2.2837,
"step": 9
},
{
"epoch": 0.026899798251513115,
"grad_norm": 5.033199310302734,
"learning_rate": 0.0002,
"loss": 1.9387,
"step": 10
},
{
"epoch": 0.029589778076664425,
"grad_norm": 5.6907196044921875,
"learning_rate": 0.00019999623426388962,
"loss": 1.7416,
"step": 11
},
{
"epoch": 0.03227975790181574,
"grad_norm": 6.009058952331543,
"learning_rate": 0.00019998493733917384,
"loss": 1.1876,
"step": 12
},
{
"epoch": 0.03496973772696705,
"grad_norm": 7.73223352432251,
"learning_rate": 0.00019996611007667742,
"loss": 1.6472,
"step": 13
},
{
"epoch": 0.03765971755211836,
"grad_norm": 9.206978797912598,
"learning_rate": 0.00019993975389437038,
"loss": 1.3337,
"step": 14
},
{
"epoch": 0.04034969737726967,
"grad_norm": 8.522892951965332,
"learning_rate": 0.00019990587077726128,
"loss": 1.107,
"step": 15
},
{
"epoch": 0.04303967720242098,
"grad_norm": 6.441080093383789,
"learning_rate": 0.0001998644632772477,
"loss": 1.3421,
"step": 16
},
{
"epoch": 0.04572965702757229,
"grad_norm": 10.297110557556152,
"learning_rate": 0.00019981553451292396,
"loss": 1.542,
"step": 17
},
{
"epoch": 0.0484196368527236,
"grad_norm": 8.839176177978516,
"learning_rate": 0.0001997590881693464,
"loss": 1.3331,
"step": 18
},
{
"epoch": 0.05110961667787491,
"grad_norm": 7.464210510253906,
"learning_rate": 0.00019969512849775565,
"loss": 0.6544,
"step": 19
},
{
"epoch": 0.05379959650302623,
"grad_norm": 8.324406623840332,
"learning_rate": 0.00019962366031525664,
"loss": 0.6081,
"step": 20
},
{
"epoch": 0.05648957632817754,
"grad_norm": 7.356775760650635,
"learning_rate": 0.00019954468900445566,
"loss": 0.8796,
"step": 21
},
{
"epoch": 0.05917955615332885,
"grad_norm": 4.948946475982666,
"learning_rate": 0.00019945822051305507,
"loss": 0.6637,
"step": 22
},
{
"epoch": 0.06186953597848016,
"grad_norm": 4.626830577850342,
"learning_rate": 0.00019936426135340528,
"loss": 0.9787,
"step": 23
},
{
"epoch": 0.06455951580363148,
"grad_norm": 5.476314067840576,
"learning_rate": 0.0001992628186020143,
"loss": 1.1509,
"step": 24
},
{
"epoch": 0.06724949562878278,
"grad_norm": 4.901749134063721,
"learning_rate": 0.00019915389989901474,
"loss": 1.2591,
"step": 25
},
{
"epoch": 0.0699394754539341,
"grad_norm": 5.085122108459473,
"learning_rate": 0.00019903751344758848,
"loss": 0.7551,
"step": 26
},
{
"epoch": 0.0726294552790854,
"grad_norm": 7.4879231452941895,
"learning_rate": 0.0001989136680133488,
"loss": 0.8804,
"step": 27
},
{
"epoch": 0.07531943510423672,
"grad_norm": 4.756115913391113,
"learning_rate": 0.00019878237292368013,
"loss": 0.7665,
"step": 28
},
{
"epoch": 0.07800941492938802,
"grad_norm": 4.777318954467773,
"learning_rate": 0.0001986436380670357,
"loss": 1.0634,
"step": 29
},
{
"epoch": 0.08069939475453934,
"grad_norm": 4.6226325035095215,
"learning_rate": 0.00019849747389219272,
"loss": 0.5563,
"step": 30
},
{
"epoch": 0.08338937457969066,
"grad_norm": 4.621855735778809,
"learning_rate": 0.0001983438914074654,
"loss": 0.6377,
"step": 31
},
{
"epoch": 0.08607935440484196,
"grad_norm": 4.654213905334473,
"learning_rate": 0.00019818290217987587,
"loss": 0.7768,
"step": 32
},
{
"epoch": 0.08876933422999328,
"grad_norm": 4.971796989440918,
"learning_rate": 0.00019801451833428312,
"loss": 0.7793,
"step": 33
},
{
"epoch": 0.09145931405514458,
"grad_norm": 4.303219318389893,
"learning_rate": 0.00019783875255246973,
"loss": 0.7334,
"step": 34
},
{
"epoch": 0.0941492938802959,
"grad_norm": 4.168240547180176,
"learning_rate": 0.0001976556180721867,
"loss": 0.7872,
"step": 35
},
{
"epoch": 0.0968392737054472,
"grad_norm": 3.7110626697540283,
"learning_rate": 0.00019746512868615656,
"loss": 0.9156,
"step": 36
},
{
"epoch": 0.09952925353059852,
"grad_norm": 4.882535457611084,
"learning_rate": 0.00019726729874103448,
"loss": 1.1328,
"step": 37
},
{
"epoch": 0.10221923335574983,
"grad_norm": 7.1945624351501465,
"learning_rate": 0.00019706214313632784,
"loss": 1.173,
"step": 38
},
{
"epoch": 0.10490921318090114,
"grad_norm": 8.771895408630371,
"learning_rate": 0.00019684967732327396,
"loss": 0.9978,
"step": 39
},
{
"epoch": 0.10759919300605246,
"grad_norm": 5.988046169281006,
"learning_rate": 0.00019662991730367663,
"loss": 0.7832,
"step": 40
},
{
"epoch": 0.11028917283120376,
"grad_norm": 12.11080551147461,
"learning_rate": 0.00019640287962870062,
"loss": 1.593,
"step": 41
},
{
"epoch": 0.11297915265635508,
"grad_norm": 8.772387504577637,
"learning_rate": 0.00019616858139762534,
"loss": 0.7483,
"step": 42
},
{
"epoch": 0.11566913248150638,
"grad_norm": 8.320154190063477,
"learning_rate": 0.000195927040256557,
"loss": 0.7829,
"step": 43
},
{
"epoch": 0.1183591123066577,
"grad_norm": 9.61361026763916,
"learning_rate": 0.00019567827439709954,
"loss": 1.014,
"step": 44
},
{
"epoch": 0.121049092131809,
"grad_norm": 5.479886531829834,
"learning_rate": 0.00019542230255498454,
"loss": 0.5207,
"step": 45
},
{
"epoch": 0.12373907195696032,
"grad_norm": 18.779264450073242,
"learning_rate": 0.0001951591440086602,
"loss": 1.5535,
"step": 46
},
{
"epoch": 0.12642905178211164,
"grad_norm": 11.512892723083496,
"learning_rate": 0.00019488881857783935,
"loss": 1.3748,
"step": 47
},
{
"epoch": 0.12911903160726296,
"grad_norm": 16.13918685913086,
"learning_rate": 0.00019461134662200668,
"loss": 1.1999,
"step": 48
},
{
"epoch": 0.13180901143241425,
"grad_norm": 11.946172714233398,
"learning_rate": 0.00019432674903888548,
"loss": 0.7801,
"step": 49
},
{
"epoch": 0.13449899125756556,
"grad_norm": 10.913107872009277,
"learning_rate": 0.0001940350472628637,
"loss": 0.4945,
"step": 50
},
{
"epoch": 0.13718897108271688,
"grad_norm": 7.555224895477295,
"learning_rate": 0.00019373626326337946,
"loss": 1.7617,
"step": 51
},
{
"epoch": 0.1398789509078682,
"grad_norm": 8.530354499816895,
"learning_rate": 0.0001934304195432668,
"loss": 2.1801,
"step": 52
},
{
"epoch": 0.1425689307330195,
"grad_norm": 6.264369487762451,
"learning_rate": 0.0001931175391370605,
"loss": 1.514,
"step": 53
},
{
"epoch": 0.1452589105581708,
"grad_norm": 5.179037094116211,
"learning_rate": 0.00019279764560926142,
"loss": 1.746,
"step": 54
},
{
"epoch": 0.14794889038332212,
"grad_norm": 4.9991536140441895,
"learning_rate": 0.00019247076305256176,
"loss": 1.4403,
"step": 55
},
{
"epoch": 0.15063887020847344,
"grad_norm": 5.004469871520996,
"learning_rate": 0.00019213691608603047,
"loss": 2.0104,
"step": 56
},
{
"epoch": 0.15332885003362476,
"grad_norm": 5.55224609375,
"learning_rate": 0.00019179612985325908,
"loss": 1.5966,
"step": 57
},
{
"epoch": 0.15601882985877605,
"grad_norm": 4.276426315307617,
"learning_rate": 0.00019144843002046806,
"loss": 1.3126,
"step": 58
},
{
"epoch": 0.15870880968392737,
"grad_norm": 6.414350509643555,
"learning_rate": 0.0001910938427745737,
"loss": 1.4917,
"step": 59
},
{
"epoch": 0.16139878950907868,
"grad_norm": 4.202977657318115,
"learning_rate": 0.000190732394821216,
"loss": 1.0693,
"step": 60
},
{
"epoch": 0.16408876933423,
"grad_norm": 4.08539342880249,
"learning_rate": 0.00019036411338274703,
"loss": 1.0895,
"step": 61
},
{
"epoch": 0.16677874915938132,
"grad_norm": 3.5492026805877686,
"learning_rate": 0.00018998902619618116,
"loss": 0.9263,
"step": 62
},
{
"epoch": 0.1694687289845326,
"grad_norm": 3.8257992267608643,
"learning_rate": 0.00018960716151110554,
"loss": 1.3858,
"step": 63
},
{
"epoch": 0.17215870880968392,
"grad_norm": 3.3236279487609863,
"learning_rate": 0.00018921854808755294,
"loss": 1.0358,
"step": 64
},
{
"epoch": 0.17484868863483524,
"grad_norm": 3.356065511703491,
"learning_rate": 0.00018882321519383534,
"loss": 0.8704,
"step": 65
},
{
"epoch": 0.17753866845998656,
"grad_norm": 4.296020984649658,
"learning_rate": 0.00018842119260433982,
"loss": 1.0503,
"step": 66
},
{
"epoch": 0.18022864828513785,
"grad_norm": 5.693877696990967,
"learning_rate": 0.00018801251059728604,
"loss": 0.6922,
"step": 67
},
{
"epoch": 0.18291862811028917,
"grad_norm": 5.999386310577393,
"learning_rate": 0.0001875971999524458,
"loss": 0.824,
"step": 68
},
{
"epoch": 0.18560860793544048,
"grad_norm": 4.900099754333496,
"learning_rate": 0.000187175291948825,
"loss": 0.7182,
"step": 69
},
{
"epoch": 0.1882985877605918,
"grad_norm": 4.141096115112305,
"learning_rate": 0.0001867468183623077,
"loss": 0.4006,
"step": 70
},
{
"epoch": 0.19098856758574312,
"grad_norm": 6.806368350982666,
"learning_rate": 0.00018631181146326305,
"loss": 0.8936,
"step": 71
},
{
"epoch": 0.1936785474108944,
"grad_norm": 2.945824146270752,
"learning_rate": 0.0001858703040141148,
"loss": 0.5178,
"step": 72
},
{
"epoch": 0.19636852723604573,
"grad_norm": 6.985172271728516,
"learning_rate": 0.00018542232926687383,
"loss": 0.8644,
"step": 73
},
{
"epoch": 0.19905850706119704,
"grad_norm": 5.231998920440674,
"learning_rate": 0.0001849679209606338,
"loss": 1.0585,
"step": 74
},
{
"epoch": 0.20174848688634836,
"grad_norm": 4.8978705406188965,
"learning_rate": 0.00018450711331903006,
"loss": 0.7828,
"step": 75
},
{
"epoch": 0.20443846671149965,
"grad_norm": 5.309878826141357,
"learning_rate": 0.00018403994104766212,
"loss": 0.7666,
"step": 76
},
{
"epoch": 0.20712844653665097,
"grad_norm": 5.227763652801514,
"learning_rate": 0.00018356643933147986,
"loss": 0.8396,
"step": 77
},
{
"epoch": 0.20981842636180228,
"grad_norm": 4.239619731903076,
"learning_rate": 0.00018308664383213344,
"loss": 0.6439,
"step": 78
},
{
"epoch": 0.2125084061869536,
"grad_norm": 5.731531620025635,
"learning_rate": 0.00018260059068528762,
"loss": 1.3371,
"step": 79
},
{
"epoch": 0.21519838601210492,
"grad_norm": 4.006597518920898,
"learning_rate": 0.00018210831649790018,
"loss": 0.5272,
"step": 80
},
{
"epoch": 0.2178883658372562,
"grad_norm": 4.0596699714660645,
"learning_rate": 0.00018160985834546475,
"loss": 0.4416,
"step": 81
},
{
"epoch": 0.22057834566240753,
"grad_norm": 4.053659915924072,
"learning_rate": 0.00018110525376921862,
"loss": 0.4781,
"step": 82
},
{
"epoch": 0.22326832548755884,
"grad_norm": 4.120569705963135,
"learning_rate": 0.00018059454077331527,
"loss": 0.8082,
"step": 83
},
{
"epoch": 0.22595830531271016,
"grad_norm": 6.420701503753662,
"learning_rate": 0.00018007775782196214,
"loss": 0.5476,
"step": 84
},
{
"epoch": 0.22864828513786148,
"grad_norm": 13.603157997131348,
"learning_rate": 0.00017955494383652365,
"loss": 0.7857,
"step": 85
},
{
"epoch": 0.23133826496301277,
"grad_norm": 7.784971237182617,
"learning_rate": 0.00017902613819258985,
"loss": 0.7705,
"step": 86
},
{
"epoch": 0.23402824478816409,
"grad_norm": 5.966789722442627,
"learning_rate": 0.00017849138071701092,
"loss": 0.9065,
"step": 87
},
{
"epoch": 0.2367182246133154,
"grad_norm": 9.451849937438965,
"learning_rate": 0.0001779507116848976,
"loss": 0.998,
"step": 88
},
{
"epoch": 0.23940820443846672,
"grad_norm": 6.712332725524902,
"learning_rate": 0.00017740417181658788,
"loss": 1.1464,
"step": 89
},
{
"epoch": 0.242098184263618,
"grad_norm": 4.188553333282471,
"learning_rate": 0.00017685180227458003,
"loss": 0.6356,
"step": 90
},
{
"epoch": 0.24478816408876933,
"grad_norm": 9.703150749206543,
"learning_rate": 0.00017629364466043273,
"loss": 1.0548,
"step": 91
},
{
"epoch": 0.24747814391392065,
"grad_norm": 6.08168363571167,
"learning_rate": 0.00017572974101163165,
"loss": 0.7252,
"step": 92
},
{
"epoch": 0.25016812373907193,
"grad_norm": 10.167409896850586,
"learning_rate": 0.00017516013379842337,
"loss": 0.658,
"step": 93
},
{
"epoch": 0.25016812373907193,
"eval_loss": 0.9310864210128784,
"eval_runtime": 10.8326,
"eval_samples_per_second": 14.493,
"eval_steps_per_second": 7.293,
"step": 93
},
{
"epoch": 0.2528581035642233,
"grad_norm": 5.78995418548584,
"learning_rate": 0.00017458486592061704,
"loss": 0.9346,
"step": 94
},
{
"epoch": 0.25554808338937457,
"grad_norm": 3.517900228500366,
"learning_rate": 0.00017400398070435293,
"loss": 0.3506,
"step": 95
},
{
"epoch": 0.2582380632145259,
"grad_norm": 5.804417610168457,
"learning_rate": 0.00017341752189883983,
"loss": 0.4959,
"step": 96
},
{
"epoch": 0.2609280430396772,
"grad_norm": 8.148117065429688,
"learning_rate": 0.00017282553367305975,
"loss": 0.9842,
"step": 97
},
{
"epoch": 0.2636180228648285,
"grad_norm": 9.511378288269043,
"learning_rate": 0.0001722280606124415,
"loss": 0.7143,
"step": 98
},
{
"epoch": 0.26630800268997984,
"grad_norm": 6.079991340637207,
"learning_rate": 0.00017162514771550255,
"loss": 0.2979,
"step": 99
},
{
"epoch": 0.26899798251513113,
"grad_norm": 6.114333152770996,
"learning_rate": 0.00017101684039046036,
"loss": 0.5812,
"step": 100
},
{
"epoch": 0.2716879623402825,
"grad_norm": 4.91884183883667,
"learning_rate": 0.0001704031844518121,
"loss": 1.8317,
"step": 101
},
{
"epoch": 0.27437794216543376,
"grad_norm": 5.735188007354736,
"learning_rate": 0.0001697842261168843,
"loss": 2.3345,
"step": 102
},
{
"epoch": 0.27706792199058505,
"grad_norm": 5.317649841308594,
"learning_rate": 0.0001691600120023521,
"loss": 2.0851,
"step": 103
},
{
"epoch": 0.2797579018157364,
"grad_norm": 7.778799057006836,
"learning_rate": 0.00016853058912072802,
"loss": 1.1674,
"step": 104
},
{
"epoch": 0.2824478816408877,
"grad_norm": 4.196416854858398,
"learning_rate": 0.00016789600487682156,
"loss": 1.5939,
"step": 105
},
{
"epoch": 0.285137861466039,
"grad_norm": 4.3927741050720215,
"learning_rate": 0.0001672563070641688,
"loss": 1.4615,
"step": 106
},
{
"epoch": 0.2878278412911903,
"grad_norm": 4.284142017364502,
"learning_rate": 0.0001666115438614328,
"loss": 1.9508,
"step": 107
},
{
"epoch": 0.2905178211163416,
"grad_norm": 5.4508867263793945,
"learning_rate": 0.00016596176382877506,
"loss": 1.3256,
"step": 108
},
{
"epoch": 0.29320780094149296,
"grad_norm": 11.987678527832031,
"learning_rate": 0.00016530701590419824,
"loss": 0.9202,
"step": 109
},
{
"epoch": 0.29589778076664425,
"grad_norm": 5.667636394500732,
"learning_rate": 0.00016464734939986036,
"loss": 1.3247,
"step": 110
},
{
"epoch": 0.29858776059179554,
"grad_norm": 3.8087687492370605,
"learning_rate": 0.00016398281399836097,
"loss": 0.9626,
"step": 111
},
{
"epoch": 0.3012777404169469,
"grad_norm": 5.772204875946045,
"learning_rate": 0.00016331345974899923,
"loss": 1.3912,
"step": 112
},
{
"epoch": 0.30396772024209817,
"grad_norm": 3.2174160480499268,
"learning_rate": 0.00016263933706400451,
"loss": 1.0545,
"step": 113
},
{
"epoch": 0.3066577000672495,
"grad_norm": 3.539743423461914,
"learning_rate": 0.00016196049671473954,
"loss": 0.9489,
"step": 114
},
{
"epoch": 0.3093476798924008,
"grad_norm": 3.6935033798217773,
"learning_rate": 0.0001612769898278766,
"loss": 1.0005,
"step": 115
},
{
"epoch": 0.3120376597175521,
"grad_norm": 3.477961301803589,
"learning_rate": 0.00016058886788154712,
"loss": 0.6155,
"step": 116
},
{
"epoch": 0.31472763954270344,
"grad_norm": 3.9399242401123047,
"learning_rate": 0.00015989618270146423,
"loss": 0.7689,
"step": 117
},
{
"epoch": 0.31741761936785473,
"grad_norm": 4.4496846199035645,
"learning_rate": 0.0001591989864570199,
"loss": 1.0174,
"step": 118
},
{
"epoch": 0.3201075991930061,
"grad_norm": 4.519758224487305,
"learning_rate": 0.00015849733165735556,
"loss": 0.9051,
"step": 119
},
{
"epoch": 0.32279757901815737,
"grad_norm": 3.636235237121582,
"learning_rate": 0.00015779127114740757,
"loss": 0.5993,
"step": 120
},
{
"epoch": 0.32548755884330866,
"grad_norm": 2.2947537899017334,
"learning_rate": 0.0001570808581039271,
"loss": 0.23,
"step": 121
},
{
"epoch": 0.32817753866846,
"grad_norm": 3.0490782260894775,
"learning_rate": 0.00015636614603147512,
"loss": 0.5818,
"step": 122
},
{
"epoch": 0.3308675184936113,
"grad_norm": 3.2933220863342285,
"learning_rate": 0.0001556471887583929,
"loss": 0.6548,
"step": 123
},
{
"epoch": 0.33355749831876264,
"grad_norm": 4.488528251647949,
"learning_rate": 0.0001549240404327477,
"loss": 0.9628,
"step": 124
},
{
"epoch": 0.3362474781439139,
"grad_norm": 4.679425239562988,
"learning_rate": 0.00015419675551825475,
"loss": 0.4106,
"step": 125
},
{
"epoch": 0.3389374579690652,
"grad_norm": 4.400868892669678,
"learning_rate": 0.0001534653887901754,
"loss": 0.3852,
"step": 126
},
{
"epoch": 0.34162743779421656,
"grad_norm": 4.978918552398682,
"learning_rate": 0.00015272999533119162,
"loss": 0.8162,
"step": 127
},
{
"epoch": 0.34431741761936785,
"grad_norm": 5.046586990356445,
"learning_rate": 0.00015199063052725745,
"loss": 0.649,
"step": 128
},
{
"epoch": 0.34700739744451914,
"grad_norm": 7.412467956542969,
"learning_rate": 0.0001512473500634277,
"loss": 0.6579,
"step": 129
},
{
"epoch": 0.3496973772696705,
"grad_norm": 3.8262441158294678,
"learning_rate": 0.00015050020991966406,
"loss": 0.4359,
"step": 130
},
{
"epoch": 0.3523873570948218,
"grad_norm": 5.179169654846191,
"learning_rate": 0.0001497492663666189,
"loss": 0.5676,
"step": 131
},
{
"epoch": 0.3550773369199731,
"grad_norm": 5.74229097366333,
"learning_rate": 0.00014899457596139729,
"loss": 0.3635,
"step": 132
},
{
"epoch": 0.3577673167451244,
"grad_norm": 7.098540782928467,
"learning_rate": 0.00014823619554329745,
"loss": 0.996,
"step": 133
},
{
"epoch": 0.3604572965702757,
"grad_norm": 4.635382652282715,
"learning_rate": 0.00014747418222952995,
"loss": 0.7149,
"step": 134
},
{
"epoch": 0.36314727639542704,
"grad_norm": 3.750243663787842,
"learning_rate": 0.0001467085934109158,
"loss": 0.3169,
"step": 135
},
{
"epoch": 0.36583725622057833,
"grad_norm": 4.545015811920166,
"learning_rate": 0.00014593948674756417,
"loss": 0.5511,
"step": 136
},
{
"epoch": 0.3685272360457297,
"grad_norm": 5.990297794342041,
"learning_rate": 0.0001451669201645298,
"loss": 0.766,
"step": 137
},
{
"epoch": 0.37121721587088097,
"grad_norm": 3.692354679107666,
"learning_rate": 0.00014439095184745024,
"loss": 0.4151,
"step": 138
},
{
"epoch": 0.37390719569603226,
"grad_norm": 3.4247729778289795,
"learning_rate": 0.00014361164023816376,
"loss": 0.466,
"step": 139
},
{
"epoch": 0.3765971755211836,
"grad_norm": 3.962257146835327,
"learning_rate": 0.00014282904403030772,
"loss": 0.4263,
"step": 140
},
{
"epoch": 0.3792871553463349,
"grad_norm": 5.7197771072387695,
"learning_rate": 0.00014204322216489814,
"loss": 0.4988,
"step": 141
},
{
"epoch": 0.38197713517148624,
"grad_norm": 5.587864398956299,
"learning_rate": 0.00014125423382589048,
"loss": 0.6946,
"step": 142
},
{
"epoch": 0.3846671149966375,
"grad_norm": 11.981307029724121,
"learning_rate": 0.00014046213843572236,
"loss": 0.7456,
"step": 143
},
{
"epoch": 0.3873570948217888,
"grad_norm": 6.747979164123535,
"learning_rate": 0.00013966699565083802,
"loss": 1.2804,
"step": 144
},
{
"epoch": 0.39004707464694016,
"grad_norm": 4.663575649261475,
"learning_rate": 0.0001388688653571954,
"loss": 0.5548,
"step": 145
},
{
"epoch": 0.39273705447209145,
"grad_norm": 5.274585247039795,
"learning_rate": 0.00013806780766575588,
"loss": 0.6681,
"step": 146
},
{
"epoch": 0.3954270342972428,
"grad_norm": 13.038918495178223,
"learning_rate": 0.00013726388290795697,
"loss": 1.082,
"step": 147
},
{
"epoch": 0.3981170141223941,
"grad_norm": 7.035642623901367,
"learning_rate": 0.00013645715163116846,
"loss": 0.3975,
"step": 148
},
{
"epoch": 0.4008069939475454,
"grad_norm": 5.065128326416016,
"learning_rate": 0.00013564767459413237,
"loss": 0.2747,
"step": 149
},
{
"epoch": 0.4034969737726967,
"grad_norm": 4.475830554962158,
"learning_rate": 0.0001348355127623869,
"loss": 0.2169,
"step": 150
},
{
"epoch": 0.406186953597848,
"grad_norm": 4.0652031898498535,
"learning_rate": 0.00013402072730367475,
"loss": 1.7546,
"step": 151
},
{
"epoch": 0.4088769334229993,
"grad_norm": 4.62870454788208,
"learning_rate": 0.0001332033795833364,
"loss": 1.5081,
"step": 152
},
{
"epoch": 0.41156691324815065,
"grad_norm": 3.8758082389831543,
"learning_rate": 0.0001323835311596884,
"loss": 1.371,
"step": 153
},
{
"epoch": 0.41425689307330194,
"grad_norm": 4.078228950500488,
"learning_rate": 0.00013156124377938699,
"loss": 1.5507,
"step": 154
},
{
"epoch": 0.4169468728984533,
"grad_norm": 3.6525630950927734,
"learning_rate": 0.0001307365793727778,
"loss": 1.1093,
"step": 155
},
{
"epoch": 0.41963685272360457,
"grad_norm": 4.3088202476501465,
"learning_rate": 0.00012990960004923154,
"loss": 1.6154,
"step": 156
},
{
"epoch": 0.42232683254875586,
"grad_norm": 4.335425853729248,
"learning_rate": 0.00012908036809246623,
"loss": 1.4037,
"step": 157
},
{
"epoch": 0.4250168123739072,
"grad_norm": 3.7850985527038574,
"learning_rate": 0.00012824894595585637,
"loss": 1.1471,
"step": 158
},
{
"epoch": 0.4277067921990585,
"grad_norm": 4.085525035858154,
"learning_rate": 0.00012741539625772918,
"loss": 1.2586,
"step": 159
},
{
"epoch": 0.43039677202420984,
"grad_norm": 3.4970481395721436,
"learning_rate": 0.0001265797817766486,
"loss": 1.1133,
"step": 160
},
{
"epoch": 0.43308675184936113,
"grad_norm": 4.015367031097412,
"learning_rate": 0.0001257421654466872,
"loss": 0.71,
"step": 161
},
{
"epoch": 0.4357767316745124,
"grad_norm": 3.805530071258545,
"learning_rate": 0.00012490261035268612,
"loss": 1.4369,
"step": 162
},
{
"epoch": 0.43846671149966376,
"grad_norm": 4.442086696624756,
"learning_rate": 0.00012406117972550414,
"loss": 1.1577,
"step": 163
},
{
"epoch": 0.44115669132481505,
"grad_norm": 3.171997308731079,
"learning_rate": 0.00012321793693725509,
"loss": 0.667,
"step": 164
},
{
"epoch": 0.4438466711499664,
"grad_norm": 4.185075759887695,
"learning_rate": 0.0001223729454965354,
"loss": 0.7278,
"step": 165
},
{
"epoch": 0.4465366509751177,
"grad_norm": 3.8975086212158203,
"learning_rate": 0.00012152626904364067,
"loss": 0.9939,
"step": 166
},
{
"epoch": 0.449226630800269,
"grad_norm": 3.1474146842956543,
"learning_rate": 0.00012067797134577275,
"loss": 0.7392,
"step": 167
},
{
"epoch": 0.4519166106254203,
"grad_norm": 3.5632522106170654,
"learning_rate": 0.00011982811629223709,
"loss": 0.8636,
"step": 168
},
{
"epoch": 0.4546065904505716,
"grad_norm": 2.6525533199310303,
"learning_rate": 0.00011897676788963101,
"loss": 0.3818,
"step": 169
},
{
"epoch": 0.45729657027572296,
"grad_norm": 3.889469861984253,
"learning_rate": 0.0001181239902570229,
"loss": 0.5985,
"step": 170
},
{
"epoch": 0.45998655010087425,
"grad_norm": 3.6286370754241943,
"learning_rate": 0.00011726984762112328,
"loss": 0.8639,
"step": 171
},
{
"epoch": 0.46267652992602554,
"grad_norm": 2.5282163619995117,
"learning_rate": 0.0001164144043114475,
"loss": 0.3303,
"step": 172
},
{
"epoch": 0.4653665097511769,
"grad_norm": 4.00683069229126,
"learning_rate": 0.00011555772475547084,
"loss": 0.414,
"step": 173
},
{
"epoch": 0.46805648957632817,
"grad_norm": 5.255921363830566,
"learning_rate": 0.00011469987347377602,
"loss": 0.8622,
"step": 174
},
{
"epoch": 0.47074646940147946,
"grad_norm": 4.201918601989746,
"learning_rate": 0.00011384091507519403,
"loss": 0.8862,
"step": 175
},
{
"epoch": 0.4734364492266308,
"grad_norm": 4.199880599975586,
"learning_rate": 0.00011298091425193806,
"loss": 0.4554,
"step": 176
},
{
"epoch": 0.4761264290517821,
"grad_norm": 3.6669838428497314,
"learning_rate": 0.00011211993577473121,
"loss": 0.343,
"step": 177
},
{
"epoch": 0.47881640887693344,
"grad_norm": 4.186169147491455,
"learning_rate": 0.00011125804448792831,
"loss": 0.8039,
"step": 178
},
{
"epoch": 0.48150638870208473,
"grad_norm": 4.209519386291504,
"learning_rate": 0.00011039530530463218,
"loss": 0.3221,
"step": 179
},
{
"epoch": 0.484196368527236,
"grad_norm": 2.875875234603882,
"learning_rate": 0.00010953178320180475,
"loss": 0.2874,
"step": 180
},
{
"epoch": 0.48688634835238737,
"grad_norm": 4.24071741104126,
"learning_rate": 0.00010866754321537338,
"loss": 0.5502,
"step": 181
},
{
"epoch": 0.48957632817753866,
"grad_norm": 4.230165481567383,
"learning_rate": 0.0001078026504353325,
"loss": 0.5396,
"step": 182
},
{
"epoch": 0.49226630800269,
"grad_norm": 4.387772560119629,
"learning_rate": 0.0001069371700008416,
"loss": 0.5987,
"step": 183
},
{
"epoch": 0.4949562878278413,
"grad_norm": 4.988356113433838,
"learning_rate": 0.00010607116709531918,
"loss": 0.6046,
"step": 184
},
{
"epoch": 0.4976462676529926,
"grad_norm": 4.388515472412109,
"learning_rate": 0.00010520470694153353,
"loss": 0.595,
"step": 185
},
{
"epoch": 0.5003362474781439,
"grad_norm": 4.310067653656006,
"learning_rate": 0.00010433785479669038,
"loss": 0.5557,
"step": 186
},
{
"epoch": 0.5003362474781439,
"eval_loss": 0.8278390765190125,
"eval_runtime": 10.698,
"eval_samples_per_second": 14.676,
"eval_steps_per_second": 7.385,
"step": 186
},
{
"epoch": 0.5030262273032953,
"grad_norm": 4.27438497543335,
"learning_rate": 0.0001034706759475182,
"loss": 0.7565,
"step": 187
},
{
"epoch": 0.5057162071284466,
"grad_norm": 6.716769218444824,
"learning_rate": 0.0001026032357053512,
"loss": 0.8371,
"step": 188
},
{
"epoch": 0.5084061869535978,
"grad_norm": 2.8589985370635986,
"learning_rate": 0.0001017355994012102,
"loss": 0.2835,
"step": 189
},
{
"epoch": 0.5110961667787491,
"grad_norm": 2.675102949142456,
"learning_rate": 0.00010086783238088244,
"loss": 0.2332,
"step": 190
},
{
"epoch": 0.5137861466039004,
"grad_norm": 5.922438144683838,
"learning_rate": 0.0001,
"loss": 0.3974,
"step": 191
},
{
"epoch": 0.5164761264290518,
"grad_norm": 9.693557739257812,
"learning_rate": 9.913216761911755e-05,
"loss": 0.8956,
"step": 192
},
{
"epoch": 0.5191661062542031,
"grad_norm": 5.175564765930176,
"learning_rate": 9.826440059878982e-05,
"loss": 0.4924,
"step": 193
},
{
"epoch": 0.5218560860793544,
"grad_norm": 4.789131164550781,
"learning_rate": 9.739676429464881e-05,
"loss": 0.6214,
"step": 194
},
{
"epoch": 0.5245460659045057,
"grad_norm": 4.584465026855469,
"learning_rate": 9.652932405248181e-05,
"loss": 0.6807,
"step": 195
},
{
"epoch": 0.527236045729657,
"grad_norm": 8.758447647094727,
"learning_rate": 9.566214520330966e-05,
"loss": 1.2434,
"step": 196
},
{
"epoch": 0.5299260255548084,
"grad_norm": 8.869791984558105,
"learning_rate": 9.479529305846652e-05,
"loss": 0.903,
"step": 197
},
{
"epoch": 0.5326160053799597,
"grad_norm": 3.716257333755493,
"learning_rate": 9.392883290468083e-05,
"loss": 0.3255,
"step": 198
},
{
"epoch": 0.535305985205111,
"grad_norm": 5.400476455688477,
"learning_rate": 9.306282999915839e-05,
"loss": 0.4036,
"step": 199
},
{
"epoch": 0.5379959650302623,
"grad_norm": 3.5948078632354736,
"learning_rate": 9.219734956466752e-05,
"loss": 0.1489,
"step": 200
},
{
"epoch": 0.5406859448554135,
"grad_norm": 3.1145272254943848,
"learning_rate": 9.133245678462663e-05,
"loss": 1.6383,
"step": 201
},
{
"epoch": 0.543375924680565,
"grad_norm": 3.916707754135132,
"learning_rate": 9.046821679819527e-05,
"loss": 1.7301,
"step": 202
},
{
"epoch": 0.5460659045057162,
"grad_norm": 3.5734024047851562,
"learning_rate": 8.960469469536786e-05,
"loss": 1.1493,
"step": 203
},
{
"epoch": 0.5487558843308675,
"grad_norm": 3.514202356338501,
"learning_rate": 8.874195551207174e-05,
"loss": 1.2045,
"step": 204
},
{
"epoch": 0.5514458641560188,
"grad_norm": 3.9733996391296387,
"learning_rate": 8.788006422526881e-05,
"loss": 1.8557,
"step": 205
},
{
"epoch": 0.5541358439811701,
"grad_norm": 4.543676853179932,
"learning_rate": 8.701908574806197e-05,
"loss": 1.3852,
"step": 206
},
{
"epoch": 0.5568258238063215,
"grad_norm": 3.916227102279663,
"learning_rate": 8.615908492480598e-05,
"loss": 1.1968,
"step": 207
},
{
"epoch": 0.5595158036314728,
"grad_norm": 4.097233295440674,
"learning_rate": 8.530012652622397e-05,
"loss": 1.2521,
"step": 208
},
{
"epoch": 0.5622057834566241,
"grad_norm": 3.402857780456543,
"learning_rate": 8.444227524452918e-05,
"loss": 0.8308,
"step": 209
},
{
"epoch": 0.5648957632817754,
"grad_norm": 4.228405952453613,
"learning_rate": 8.358559568855249e-05,
"loss": 1.1485,
"step": 210
},
{
"epoch": 0.5675857431069267,
"grad_norm": 3.699495553970337,
"learning_rate": 8.273015237887673e-05,
"loss": 0.8204,
"step": 211
},
{
"epoch": 0.570275722932078,
"grad_norm": 3.4216742515563965,
"learning_rate": 8.187600974297714e-05,
"loss": 0.7515,
"step": 212
},
{
"epoch": 0.5729657027572294,
"grad_norm": 3.6286754608154297,
"learning_rate": 8.102323211036904e-05,
"loss": 0.6262,
"step": 213
},
{
"epoch": 0.5756556825823806,
"grad_norm": 4.869045734405518,
"learning_rate": 8.017188370776292e-05,
"loss": 0.9851,
"step": 214
},
{
"epoch": 0.5783456624075319,
"grad_norm": 5.106837749481201,
"learning_rate": 7.932202865422726e-05,
"loss": 1.122,
"step": 215
},
{
"epoch": 0.5810356422326832,
"grad_norm": 4.490080833435059,
"learning_rate": 7.847373095635937e-05,
"loss": 0.8354,
"step": 216
},
{
"epoch": 0.5837256220578345,
"grad_norm": 3.4671199321746826,
"learning_rate": 7.762705450346462e-05,
"loss": 0.8034,
"step": 217
},
{
"epoch": 0.5864156018829859,
"grad_norm": 5.198472499847412,
"learning_rate": 7.678206306274495e-05,
"loss": 0.9827,
"step": 218
},
{
"epoch": 0.5891055817081372,
"grad_norm": 3.910879135131836,
"learning_rate": 7.59388202744959e-05,
"loss": 0.5839,
"step": 219
},
{
"epoch": 0.5917955615332885,
"grad_norm": 3.6499719619750977,
"learning_rate": 7.509738964731389e-05,
"loss": 0.4724,
"step": 220
},
{
"epoch": 0.5944855413584398,
"grad_norm": 4.461284160614014,
"learning_rate": 7.425783455331281e-05,
"loss": 0.6628,
"step": 221
},
{
"epoch": 0.5971755211835911,
"grad_norm": 3.570058584213257,
"learning_rate": 7.342021822335143e-05,
"loss": 0.455,
"step": 222
},
{
"epoch": 0.5998655010087425,
"grad_norm": 4.93247652053833,
"learning_rate": 7.258460374227085e-05,
"loss": 1.1265,
"step": 223
},
{
"epoch": 0.6025554808338938,
"grad_norm": 3.4849631786346436,
"learning_rate": 7.175105404414362e-05,
"loss": 0.3686,
"step": 224
},
{
"epoch": 0.605245460659045,
"grad_norm": 2.503511428833008,
"learning_rate": 7.091963190753376e-05,
"loss": 0.3223,
"step": 225
},
{
"epoch": 0.6079354404841963,
"grad_norm": 4.293323040008545,
"learning_rate": 7.009039995076844e-05,
"loss": 0.5115,
"step": 226
},
{
"epoch": 0.6106254203093476,
"grad_norm": 4.681167125701904,
"learning_rate": 6.926342062722223e-05,
"loss": 0.8861,
"step": 227
},
{
"epoch": 0.613315400134499,
"grad_norm": 4.363972187042236,
"learning_rate": 6.843875622061304e-05,
"loss": 0.714,
"step": 228
},
{
"epoch": 0.6160053799596503,
"grad_norm": 4.711676597595215,
"learning_rate": 6.761646884031164e-05,
"loss": 0.9218,
"step": 229
},
{
"epoch": 0.6186953597848016,
"grad_norm": 5.378860950469971,
"learning_rate": 6.679662041666362e-05,
"loss": 0.8466,
"step": 230
},
{
"epoch": 0.6213853396099529,
"grad_norm": 4.945761203765869,
"learning_rate": 6.597927269632526e-05,
"loss": 0.8303,
"step": 231
},
{
"epoch": 0.6240753194351042,
"grad_norm": 5.643571853637695,
"learning_rate": 6.516448723761315e-05,
"loss": 0.9418,
"step": 232
},
{
"epoch": 0.6267652992602556,
"grad_norm": 3.7865352630615234,
"learning_rate": 6.435232540586763e-05,
"loss": 0.447,
"step": 233
},
{
"epoch": 0.6294552790854069,
"grad_norm": 4.775557994842529,
"learning_rate": 6.354284836883156e-05,
"loss": 0.7068,
"step": 234
},
{
"epoch": 0.6321452589105582,
"grad_norm": 4.313781261444092,
"learning_rate": 6.273611709204304e-05,
"loss": 0.4621,
"step": 235
},
{
"epoch": 0.6348352387357095,
"grad_norm": 3.1749980449676514,
"learning_rate": 6.193219233424414e-05,
"loss": 0.2996,
"step": 236
},
{
"epoch": 0.6375252185608608,
"grad_norm": 5.823219299316406,
"learning_rate": 6.11311346428046e-05,
"loss": 0.7987,
"step": 237
},
{
"epoch": 0.6402151983860122,
"grad_norm": 4.825180530548096,
"learning_rate": 6.033300434916203e-05,
"loss": 0.9309,
"step": 238
},
{
"epoch": 0.6429051782111634,
"grad_norm": 7.609663486480713,
"learning_rate": 5.9537861564277654e-05,
"loss": 0.6973,
"step": 239
},
{
"epoch": 0.6455951580363147,
"grad_norm": 3.5007476806640625,
"learning_rate": 5.8745766174109495e-05,
"loss": 0.5831,
"step": 240
},
{
"epoch": 0.648285137861466,
"grad_norm": 4.732518196105957,
"learning_rate": 5.795677783510187e-05,
"loss": 0.447,
"step": 241
},
{
"epoch": 0.6509751176866173,
"grad_norm": 4.874922752380371,
"learning_rate": 5.7170955969692265e-05,
"loss": 0.429,
"step": 242
},
{
"epoch": 0.6536650975117687,
"grad_norm": 2.738624095916748,
"learning_rate": 5.638835976183627e-05,
"loss": 0.316,
"step": 243
},
{
"epoch": 0.65635507733692,
"grad_norm": 4.868707180023193,
"learning_rate": 5.5609048152549794e-05,
"loss": 0.528,
"step": 244
},
{
"epoch": 0.6590450571620713,
"grad_norm": 2.545112371444702,
"learning_rate": 5.483307983547026e-05,
"loss": 0.2108,
"step": 245
},
{
"epoch": 0.6617350369872226,
"grad_norm": 5.225026607513428,
"learning_rate": 5.406051325243586e-05,
"loss": 0.6883,
"step": 246
},
{
"epoch": 0.6644250168123739,
"grad_norm": 6.781031608581543,
"learning_rate": 5.329140658908423e-05,
"loss": 0.7684,
"step": 247
},
{
"epoch": 0.6671149966375253,
"grad_norm": 4.9204630851745605,
"learning_rate": 5.2525817770470084e-05,
"loss": 0.529,
"step": 248
},
{
"epoch": 0.6698049764626766,
"grad_norm": 4.677596092224121,
"learning_rate": 5.1763804456702545e-05,
"loss": 0.1916,
"step": 249
},
{
"epoch": 0.6724949562878278,
"grad_norm": 5.289106369018555,
"learning_rate": 5.1005424038602724e-05,
"loss": 0.5018,
"step": 250
},
{
"epoch": 0.6751849361129791,
"grad_norm": 3.4332501888275146,
"learning_rate": 5.025073363338111e-05,
"loss": 1.4558,
"step": 251
},
{
"epoch": 0.6778749159381304,
"grad_norm": 3.6167006492614746,
"learning_rate": 4.949979008033596e-05,
"loss": 1.3181,
"step": 252
},
{
"epoch": 0.6805648957632818,
"grad_norm": 4.852591514587402,
"learning_rate": 4.8752649936572304e-05,
"loss": 1.5339,
"step": 253
},
{
"epoch": 0.6832548755884331,
"grad_norm": 4.116457462310791,
"learning_rate": 4.800936947274255e-05,
"loss": 1.8746,
"step": 254
},
{
"epoch": 0.6859448554135844,
"grad_norm": 3.9800620079040527,
"learning_rate": 4.7270004668808397e-05,
"loss": 1.8474,
"step": 255
},
{
"epoch": 0.6886348352387357,
"grad_norm": 3.9870662689208984,
"learning_rate": 4.65346112098246e-05,
"loss": 1.148,
"step": 256
},
{
"epoch": 0.691324815063887,
"grad_norm": 4.0448503494262695,
"learning_rate": 4.5803244481745275e-05,
"loss": 1.0557,
"step": 257
},
{
"epoch": 0.6940147948890383,
"grad_norm": 4.282130241394043,
"learning_rate": 4.5075959567252335e-05,
"loss": 1.2731,
"step": 258
},
{
"epoch": 0.6967047747141897,
"grad_norm": 4.488638401031494,
"learning_rate": 4.435281124160715e-05,
"loss": 1.3722,
"step": 259
},
{
"epoch": 0.699394754539341,
"grad_norm": 3.9812207221984863,
"learning_rate": 4.363385396852491e-05,
"loss": 1.3536,
"step": 260
},
{
"epoch": 0.7020847343644923,
"grad_norm": 5.047592639923096,
"learning_rate": 4.291914189607297e-05,
"loss": 0.7947,
"step": 261
},
{
"epoch": 0.7047747141896435,
"grad_norm": 4.314056396484375,
"learning_rate": 4.220872885259247e-05,
"loss": 0.6368,
"step": 262
},
{
"epoch": 0.7074646940147948,
"grad_norm": 4.209965705871582,
"learning_rate": 4.1502668342644455e-05,
"loss": 0.7786,
"step": 263
},
{
"epoch": 0.7101546738399462,
"grad_norm": 5.0463151931762695,
"learning_rate": 4.080101354298016e-05,
"loss": 0.5957,
"step": 264
},
{
"epoch": 0.7128446536650975,
"grad_norm": 5.0057373046875,
"learning_rate": 4.0103817298535794e-05,
"loss": 1.1632,
"step": 265
},
{
"epoch": 0.7155346334902488,
"grad_norm": 4.640236854553223,
"learning_rate": 3.9411132118452896e-05,
"loss": 1.523,
"step": 266
},
{
"epoch": 0.7182246133154001,
"grad_norm": 3.843372344970703,
"learning_rate": 3.872301017212337e-05,
"loss": 1.0536,
"step": 267
},
{
"epoch": 0.7209145931405514,
"grad_norm": 4.180513858795166,
"learning_rate": 3.8039503285260506e-05,
"loss": 0.7683,
"step": 268
},
{
"epoch": 0.7236045729657028,
"grad_norm": 7.624268054962158,
"learning_rate": 3.73606629359955e-05,
"loss": 0.817,
"step": 269
},
{
"epoch": 0.7262945527908541,
"grad_norm": 5.006471633911133,
"learning_rate": 3.6686540251000756e-05,
"loss": 0.899,
"step": 270
},
{
"epoch": 0.7289845326160054,
"grad_norm": 4.611303806304932,
"learning_rate": 3.6017186001639036e-05,
"loss": 0.3317,
"step": 271
},
{
"epoch": 0.7316745124411567,
"grad_norm": 4.701048374176025,
"learning_rate": 3.535265060013965e-05,
"loss": 0.9114,
"step": 272
},
{
"epoch": 0.734364492266308,
"grad_norm": 5.010507583618164,
"learning_rate": 3.4692984095801796e-05,
"loss": 0.6747,
"step": 273
},
{
"epoch": 0.7370544720914594,
"grad_norm": 4.114429473876953,
"learning_rate": 3.4038236171224946e-05,
"loss": 0.5408,
"step": 274
},
{
"epoch": 0.7397444519166106,
"grad_norm": 5.027583122253418,
"learning_rate": 3.3388456138567225e-05,
"loss": 0.4589,
"step": 275
},
{
"epoch": 0.7424344317417619,
"grad_norm": 5.864731788635254,
"learning_rate": 3.274369293583121e-05,
"loss": 0.9165,
"step": 276
},
{
"epoch": 0.7451244115669132,
"grad_norm": 4.747946739196777,
"learning_rate": 3.210399512317849e-05,
"loss": 0.5087,
"step": 277
},
{
"epoch": 0.7478143913920645,
"grad_norm": 3.576078414916992,
"learning_rate": 3.146941087927203e-05,
"loss": 0.5728,
"step": 278
},
{
"epoch": 0.7505043712172159,
"grad_norm": 2.740264892578125,
"learning_rate": 3.0839987997647935e-05,
"loss": 0.2307,
"step": 279
},
{
"epoch": 0.7505043712172159,
"eval_loss": 0.7389675974845886,
"eval_runtime": 10.7313,
"eval_samples_per_second": 14.63,
"eval_steps_per_second": 7.362,
"step": 279
},
{
"epoch": 0.7531943510423672,
"grad_norm": 3.4270105361938477,
"learning_rate": 3.0215773883115706e-05,
"loss": 0.5658,
"step": 280
},
{
"epoch": 0.7558843308675185,
"grad_norm": 3.4167211055755615,
"learning_rate": 2.9596815548187908e-05,
"loss": 0.1781,
"step": 281
},
{
"epoch": 0.7585743106926698,
"grad_norm": 3.9443657398223877,
"learning_rate": 2.8983159609539635e-05,
"loss": 0.5545,
"step": 282
},
{
"epoch": 0.7612642905178211,
"grad_norm": 3.164463758468628,
"learning_rate": 2.8374852284497446e-05,
"loss": 0.334,
"step": 283
},
{
"epoch": 0.7639542703429725,
"grad_norm": 3.6277055740356445,
"learning_rate": 2.7771939387558554e-05,
"loss": 0.411,
"step": 284
},
{
"epoch": 0.7666442501681238,
"grad_norm": 4.296345233917236,
"learning_rate": 2.717446632694025e-05,
"loss": 0.3111,
"step": 285
},
{
"epoch": 0.769334229993275,
"grad_norm": 4.06040096282959,
"learning_rate": 2.6582478101160167e-05,
"loss": 0.4634,
"step": 286
},
{
"epoch": 0.7720242098184263,
"grad_norm": 4.600436687469482,
"learning_rate": 2.599601929564709e-05,
"loss": 0.6998,
"step": 287
},
{
"epoch": 0.7747141896435776,
"grad_norm": 3.8486735820770264,
"learning_rate": 2.5415134079383006e-05,
"loss": 0.3987,
"step": 288
},
{
"epoch": 0.777404169468729,
"grad_norm": 5.362851142883301,
"learning_rate": 2.4839866201576646e-05,
"loss": 0.3466,
"step": 289
},
{
"epoch": 0.7800941492938803,
"grad_norm": 3.8688018321990967,
"learning_rate": 2.4270258988368376e-05,
"loss": 0.2902,
"step": 290
},
{
"epoch": 0.7827841291190316,
"grad_norm": 4.354773044586182,
"learning_rate": 2.3706355339567286e-05,
"loss": 0.4149,
"step": 291
},
{
"epoch": 0.7854741089441829,
"grad_norm": 7.11607027053833,
"learning_rate": 2.3148197725419983e-05,
"loss": 0.7291,
"step": 292
},
{
"epoch": 0.7881640887693342,
"grad_norm": 5.43526029586792,
"learning_rate": 2.2595828183412172e-05,
"loss": 0.2716,
"step": 293
},
{
"epoch": 0.7908540685944856,
"grad_norm": 3.004659414291382,
"learning_rate": 2.2049288315102412e-05,
"loss": 0.3067,
"step": 294
},
{
"epoch": 0.7935440484196369,
"grad_norm": 4.5855560302734375,
"learning_rate": 2.1508619282989084e-05,
"loss": 0.2618,
"step": 295
},
{
"epoch": 0.7962340282447882,
"grad_norm": 4.773977756500244,
"learning_rate": 2.097386180741019e-05,
"loss": 0.5023,
"step": 296
},
{
"epoch": 0.7989240080699395,
"grad_norm": 9.166229248046875,
"learning_rate": 2.0445056163476374e-05,
"loss": 0.4224,
"step": 297
},
{
"epoch": 0.8016139878950908,
"grad_norm": 6.276297092437744,
"learning_rate": 1.9922242178037864e-05,
"loss": 0.8068,
"step": 298
},
{
"epoch": 0.8043039677202422,
"grad_norm": 5.523612976074219,
"learning_rate": 1.940545922668472e-05,
"loss": 0.4406,
"step": 299
},
{
"epoch": 0.8069939475453934,
"grad_norm": 1.5128313302993774,
"learning_rate": 1.88947462307814e-05,
"loss": 0.0216,
"step": 300
},
{
"epoch": 0.8096839273705447,
"grad_norm": 2.8309073448181152,
"learning_rate": 1.8390141654535265e-05,
"loss": 1.299,
"step": 301
},
{
"epoch": 0.812373907195696,
"grad_norm": 3.6739649772644043,
"learning_rate": 1.789168350209983e-05,
"loss": 1.5798,
"step": 302
},
{
"epoch": 0.8150638870208473,
"grad_norm": 3.935307741165161,
"learning_rate": 1.739940931471239e-05,
"loss": 1.295,
"step": 303
},
{
"epoch": 0.8177538668459986,
"grad_norm": 4.4844865798950195,
"learning_rate": 1.6913356167866578e-05,
"loss": 1.225,
"step": 304
},
{
"epoch": 0.82044384667115,
"grad_norm": 4.518765449523926,
"learning_rate": 1.6433560668520176e-05,
"loss": 1.4111,
"step": 305
},
{
"epoch": 0.8231338264963013,
"grad_norm": 4.362013339996338,
"learning_rate": 1.5960058952337887e-05,
"loss": 1.1839,
"step": 306
},
{
"epoch": 0.8258238063214526,
"grad_norm": 4.76102352142334,
"learning_rate": 1.5492886680969963e-05,
"loss": 1.2118,
"step": 307
},
{
"epoch": 0.8285137861466039,
"grad_norm": 5.4755539894104,
"learning_rate": 1.5032079039366209e-05,
"loss": 1.4798,
"step": 308
},
{
"epoch": 0.8312037659717552,
"grad_norm": 3.792975902557373,
"learning_rate": 1.4577670733126203e-05,
"loss": 0.7013,
"step": 309
},
{
"epoch": 0.8338937457969066,
"grad_norm": 5.135954856872559,
"learning_rate": 1.4129695985885228e-05,
"loss": 1.5141,
"step": 310
},
{
"epoch": 0.8365837256220578,
"grad_norm": 3.417525291442871,
"learning_rate": 1.3688188536736968e-05,
"loss": 0.8687,
"step": 311
},
{
"epoch": 0.8392737054472091,
"grad_norm": 4.7601728439331055,
"learning_rate": 1.3253181637692324e-05,
"loss": 0.9127,
"step": 312
},
{
"epoch": 0.8419636852723604,
"grad_norm": 4.601919174194336,
"learning_rate": 1.2824708051175016e-05,
"loss": 1.0878,
"step": 313
},
{
"epoch": 0.8446536650975117,
"grad_norm": 3.320221185684204,
"learning_rate": 1.2402800047554208e-05,
"loss": 0.6061,
"step": 314
},
{
"epoch": 0.8473436449226631,
"grad_norm": 4.236156463623047,
"learning_rate": 1.1987489402713981e-05,
"loss": 0.7456,
"step": 315
},
{
"epoch": 0.8500336247478144,
"grad_norm": 6.007240295410156,
"learning_rate": 1.1578807395660207e-05,
"loss": 1.5298,
"step": 316
},
{
"epoch": 0.8527236045729657,
"grad_norm": 5.775532245635986,
"learning_rate": 1.1176784806164676e-05,
"loss": 0.7343,
"step": 317
},
{
"epoch": 0.855413584398117,
"grad_norm": 5.709627628326416,
"learning_rate": 1.078145191244706e-05,
"loss": 1.2876,
"step": 318
},
{
"epoch": 0.8581035642232683,
"grad_norm": 5.935501575469971,
"learning_rate": 1.0392838488894463e-05,
"loss": 0.9374,
"step": 319
},
{
"epoch": 0.8607935440484197,
"grad_norm": 4.249516010284424,
"learning_rate": 1.0010973803818857e-05,
"loss": 0.5061,
"step": 320
},
{
"epoch": 0.863483523873571,
"grad_norm": 4.154758453369141,
"learning_rate": 9.635886617252975e-06,
"loss": 0.1188,
"step": 321
},
{
"epoch": 0.8661735036987223,
"grad_norm": 3.874020576477051,
"learning_rate": 9.267605178784033e-06,
"loss": 0.4923,
"step": 322
},
{
"epoch": 0.8688634835238735,
"grad_norm": 3.575878143310547,
"learning_rate": 8.906157225426315e-06,
"loss": 0.3217,
"step": 323
},
{
"epoch": 0.8715534633490248,
"grad_norm": 4.050719261169434,
"learning_rate": 8.55156997953197e-06,
"loss": 0.4612,
"step": 324
},
{
"epoch": 0.8742434431741762,
"grad_norm": 3.588498830795288,
"learning_rate": 8.203870146740932e-06,
"loss": 0.2259,
"step": 325
},
{
"epoch": 0.8769334229993275,
"grad_norm": 5.262954235076904,
"learning_rate": 7.86308391396956e-06,
"loss": 0.7654,
"step": 326
},
{
"epoch": 0.8796234028244788,
"grad_norm": 5.5735087394714355,
"learning_rate": 7.529236947438256e-06,
"loss": 0.5849,
"step": 327
},
{
"epoch": 0.8823133826496301,
"grad_norm": 4.838580131530762,
"learning_rate": 7.202354390738608e-06,
"loss": 0.3913,
"step": 328
},
{
"epoch": 0.8850033624747814,
"grad_norm": 5.6935038566589355,
"learning_rate": 6.882460862939522e-06,
"loss": 0.7206,
"step": 329
},
{
"epoch": 0.8876933422999328,
"grad_norm": 2.3508174419403076,
"learning_rate": 6.5695804567332044e-06,
"loss": 0.1703,
"step": 330
},
{
"epoch": 0.8903833221250841,
"grad_norm": 5.699828624725342,
"learning_rate": 6.263736736620551e-06,
"loss": 0.4676,
"step": 331
},
{
"epoch": 0.8930733019502354,
"grad_norm": 4.048695087432861,
"learning_rate": 5.964952737136353e-06,
"loss": 0.5628,
"step": 332
},
{
"epoch": 0.8957632817753867,
"grad_norm": 4.811221599578857,
"learning_rate": 5.673250961114529e-06,
"loss": 0.7418,
"step": 333
},
{
"epoch": 0.898453261600538,
"grad_norm": 3.3414437770843506,
"learning_rate": 5.388653377993324e-06,
"loss": 0.3143,
"step": 334
},
{
"epoch": 0.9011432414256894,
"grad_norm": 5.924250602722168,
"learning_rate": 5.111181422160671e-06,
"loss": 0.5284,
"step": 335
},
{
"epoch": 0.9038332212508406,
"grad_norm": 6.767046928405762,
"learning_rate": 4.840855991339799e-06,
"loss": 0.6351,
"step": 336
},
{
"epoch": 0.9065232010759919,
"grad_norm": 4.555798053741455,
"learning_rate": 4.577697445015472e-06,
"loss": 0.5253,
"step": 337
},
{
"epoch": 0.9092131809011432,
"grad_norm": 5.7803730964660645,
"learning_rate": 4.321725602900473e-06,
"loss": 0.7582,
"step": 338
},
{
"epoch": 0.9119031607262945,
"grad_norm": 4.016640663146973,
"learning_rate": 4.072959743443017e-06,
"loss": 0.2845,
"step": 339
},
{
"epoch": 0.9145931405514459,
"grad_norm": 5.46890926361084,
"learning_rate": 3.83141860237467e-06,
"loss": 0.6128,
"step": 340
},
{
"epoch": 0.9172831203765972,
"grad_norm": 4.543710708618164,
"learning_rate": 3.5971203712993894e-06,
"loss": 0.5227,
"step": 341
},
{
"epoch": 0.9199731002017485,
"grad_norm": 4.0189008712768555,
"learning_rate": 3.3700826963233735e-06,
"loss": 0.4072,
"step": 342
},
{
"epoch": 0.9226630800268998,
"grad_norm": 5.0270490646362305,
"learning_rate": 3.1503226767260252e-06,
"loss": 0.5361,
"step": 343
},
{
"epoch": 0.9253530598520511,
"grad_norm": 7.237580299377441,
"learning_rate": 2.9378568636721835e-06,
"loss": 0.9466,
"step": 344
},
{
"epoch": 0.9280430396772025,
"grad_norm": 8.795455932617188,
"learning_rate": 2.732701258965531e-06,
"loss": 0.6604,
"step": 345
},
{
"epoch": 0.9307330195023538,
"grad_norm": 11.6528959274292,
"learning_rate": 2.5348713138434564e-06,
"loss": 0.5807,
"step": 346
},
{
"epoch": 0.933422999327505,
"grad_norm": 8.07696533203125,
"learning_rate": 2.3443819278132996e-06,
"loss": 0.7975,
"step": 347
},
{
"epoch": 0.9361129791526563,
"grad_norm": 4.788589954376221,
"learning_rate": 2.161247447530268e-06,
"loss": 0.6227,
"step": 348
},
{
"epoch": 0.9388029589778076,
"grad_norm": 7.453376293182373,
"learning_rate": 1.985481665716882e-06,
"loss": 0.4651,
"step": 349
},
{
"epoch": 0.9414929388029589,
"grad_norm": 4.3519392013549805,
"learning_rate": 1.8170978201241474e-06,
"loss": 0.1668,
"step": 350
},
{
"epoch": 0.9441829186281103,
"grad_norm": 3.087855577468872,
"learning_rate": 1.6561085925346332e-06,
"loss": 1.2559,
"step": 351
},
{
"epoch": 0.9468728984532616,
"grad_norm": 3.9484481811523438,
"learning_rate": 1.5025261078073005e-06,
"loss": 1.0505,
"step": 352
},
{
"epoch": 0.9495628782784129,
"grad_norm": 4.509681701660156,
"learning_rate": 1.3563619329643119e-06,
"loss": 1.316,
"step": 353
},
{
"epoch": 0.9522528581035642,
"grad_norm": 4.409306049346924,
"learning_rate": 1.2176270763198828e-06,
"loss": 0.9114,
"step": 354
},
{
"epoch": 0.9549428379287155,
"grad_norm": 5.652538299560547,
"learning_rate": 1.0863319866512346e-06,
"loss": 1.1458,
"step": 355
},
{
"epoch": 0.9576328177538669,
"grad_norm": 6.170865535736084,
"learning_rate": 9.624865524115346e-07,
"loss": 1.1232,
"step": 356
},
{
"epoch": 0.9603227975790182,
"grad_norm": 5.357152938842773,
"learning_rate": 8.461001009852809e-07,
"loss": 0.9592,
"step": 357
},
{
"epoch": 0.9630127774041695,
"grad_norm": 4.322149753570557,
"learning_rate": 7.371813979857312e-07,
"loss": 0.7773,
"step": 358
},
{
"epoch": 0.9657027572293208,
"grad_norm": 3.6123275756835938,
"learning_rate": 6.357386465947301e-07,
"loss": 0.5652,
"step": 359
},
{
"epoch": 0.968392737054472,
"grad_norm": 3.7311031818389893,
"learning_rate": 5.417794869449377e-07,
"loss": 0.6096,
"step": 360
},
{
"epoch": 0.9710827168796234,
"grad_norm": 5.762843608856201,
"learning_rate": 4.5531099554435576e-07,
"loss": 0.9279,
"step": 361
},
{
"epoch": 0.9737726967047747,
"grad_norm": 4.97388219833374,
"learning_rate": 3.763396847433875e-07,
"loss": 0.5789,
"step": 362
},
{
"epoch": 0.976462676529926,
"grad_norm": 4.815624713897705,
"learning_rate": 3.048715022443749e-07,
"loss": 0.5138,
"step": 363
},
{
"epoch": 0.9791526563550773,
"grad_norm": 3.541781425476074,
"learning_rate": 2.409118306536229e-07,
"loss": 0.259,
"step": 364
},
{
"epoch": 0.9818426361802286,
"grad_norm": 2.7444493770599365,
"learning_rate": 1.8446548707604648e-07,
"loss": 0.2707,
"step": 365
},
{
"epoch": 0.98453261600538,
"grad_norm": 5.796267986297607,
"learning_rate": 1.3553672275230523e-07,
"loss": 0.5347,
"step": 366
},
{
"epoch": 0.9872225958305313,
"grad_norm": 5.090404987335205,
"learning_rate": 9.412922273871471e-08,
"loss": 0.3201,
"step": 367
},
{
"epoch": 0.9899125756556826,
"grad_norm": 4.630456924438477,
"learning_rate": 6.024610562962441e-08,
"loss": 0.4391,
"step": 368
},
{
"epoch": 0.9926025554808339,
"grad_norm": 4.325840473175049,
"learning_rate": 3.388992332259422e-08,
"loss": 0.3675,
"step": 369
},
{
"epoch": 0.9952925353059852,
"grad_norm": 9.686969757080078,
"learning_rate": 1.506266082615948e-08,
"loss": 0.6909,
"step": 370
},
{
"epoch": 0.9979825151311366,
"grad_norm": 4.668429851531982,
"learning_rate": 3.7657361103837776e-09,
"loss": 0.285,
"step": 371
},
{
"epoch": 1.0013449899125757,
"grad_norm": 4.755204200744629,
"learning_rate": 0.0,
"loss": 0.9972,
"step": 372
},
{
"epoch": 1.0013449899125757,
"eval_loss": 0.7320420145988464,
"eval_runtime": 10.7106,
"eval_samples_per_second": 14.658,
"eval_steps_per_second": 7.376,
"step": 372
}
],
"logging_steps": 1,
"max_steps": 372,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 93,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.211789436077998e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}