{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0013449899125757,
  "eval_steps": 93,
  "global_step": 372,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0026899798251513113,
      "grad_norm": 1.8751003742218018,
      "learning_rate": 2e-05,
      "loss": 2.0835,
      "step": 1
    },
    {
      "epoch": 0.005379959650302623,
      "grad_norm": 2.948612928390503,
      "learning_rate": 4e-05,
      "loss": 2.7117,
      "step": 2
    },
    {
      "epoch": 0.008069939475453935,
      "grad_norm": 2.65693998336792,
      "learning_rate": 6e-05,
      "loss": 2.2257,
      "step": 3
    },
    {
      "epoch": 0.010759919300605245,
      "grad_norm": 3.0478532314300537,
      "learning_rate": 8e-05,
      "loss": 2.5187,
      "step": 4
    },
    {
      "epoch": 0.013449899125756557,
      "grad_norm": 5.911397457122803,
      "learning_rate": 0.0001,
      "loss": 2.3052,
      "step": 5
    },
    {
      "epoch": 0.01613987895090787,
      "grad_norm": 6.985674858093262,
      "learning_rate": 0.00012,
      "loss": 2.2497,
      "step": 6
    },
    {
      "epoch": 0.01882985877605918,
      "grad_norm": 12.322802543640137,
      "learning_rate": 0.00014,
      "loss": 2.3409,
      "step": 7
    },
    {
      "epoch": 0.02151983860121049,
      "grad_norm": 5.827511787414551,
      "learning_rate": 0.00016,
      "loss": 2.4239,
      "step": 8
    },
    {
      "epoch": 0.0242098184263618,
      "grad_norm": 4.404603481292725,
      "learning_rate": 0.00018,
      "loss": 2.2837,
      "step": 9
    },
    {
      "epoch": 0.026899798251513115,
      "grad_norm": 5.033199310302734,
      "learning_rate": 0.0002,
      "loss": 1.9387,
      "step": 10
    },
    {
      "epoch": 0.029589778076664425,
      "grad_norm": 5.6907196044921875,
      "learning_rate": 0.00019999623426388962,
      "loss": 1.7416,
      "step": 11
    },
    {
      "epoch": 0.03227975790181574,
      "grad_norm": 6.009058952331543,
      "learning_rate": 0.00019998493733917384,
      "loss": 1.1876,
      "step": 12
    },
    {
      "epoch": 0.03496973772696705,
      "grad_norm": 7.73223352432251,
      "learning_rate": 0.00019996611007667742,
      "loss": 1.6472,
      "step": 13
    },
    {
      "epoch": 0.03765971755211836,
      "grad_norm": 9.206978797912598,
      "learning_rate": 0.00019993975389437038,
      "loss": 1.3337,
      "step": 14
    },
    {
      "epoch": 0.04034969737726967,
      "grad_norm": 8.522892951965332,
      "learning_rate": 0.00019990587077726128,
      "loss": 1.107,
      "step": 15
    },
    {
      "epoch": 0.04303967720242098,
      "grad_norm": 6.441080093383789,
      "learning_rate": 0.0001998644632772477,
      "loss": 1.3421,
      "step": 16
    },
    {
      "epoch": 0.04572965702757229,
      "grad_norm": 10.297110557556152,
      "learning_rate": 0.00019981553451292396,
      "loss": 1.542,
      "step": 17
    },
    {
      "epoch": 0.0484196368527236,
      "grad_norm": 8.839176177978516,
      "learning_rate": 0.0001997590881693464,
      "loss": 1.3331,
      "step": 18
    },
    {
      "epoch": 0.05110961667787491,
      "grad_norm": 7.464210510253906,
      "learning_rate": 0.00019969512849775565,
      "loss": 0.6544,
      "step": 19
    },
    {
      "epoch": 0.05379959650302623,
      "grad_norm": 8.324406623840332,
      "learning_rate": 0.00019962366031525664,
      "loss": 0.6081,
      "step": 20
    },
    {
      "epoch": 0.05648957632817754,
      "grad_norm": 7.356775760650635,
      "learning_rate": 0.00019954468900445566,
      "loss": 0.8796,
      "step": 21
    },
    {
      "epoch": 0.05917955615332885,
      "grad_norm": 4.948946475982666,
      "learning_rate": 0.00019945822051305507,
      "loss": 0.6637,
      "step": 22
    },
    {
      "epoch": 0.06186953597848016,
      "grad_norm": 4.626830577850342,
      "learning_rate": 0.00019936426135340528,
      "loss": 0.9787,
      "step": 23
    },
    {
      "epoch": 0.06455951580363148,
      "grad_norm": 5.476314067840576,
      "learning_rate": 0.0001992628186020143,
      "loss": 1.1509,
      "step": 24
    },
    {
      "epoch": 0.06724949562878278,
      "grad_norm": 4.901749134063721,
      "learning_rate": 0.00019915389989901474,
      "loss": 1.2591,
      "step": 25
    },
    {
      "epoch": 0.0699394754539341,
      "grad_norm": 5.085122108459473,
      "learning_rate": 0.00019903751344758848,
      "loss": 0.7551,
      "step": 26
    },
    {
      "epoch": 0.0726294552790854,
      "grad_norm": 7.4879231452941895,
      "learning_rate": 0.0001989136680133488,
      "loss": 0.8804,
      "step": 27
    },
    {
      "epoch": 0.07531943510423672,
      "grad_norm": 4.756115913391113,
      "learning_rate": 0.00019878237292368013,
      "loss": 0.7665,
      "step": 28
    },
    {
      "epoch": 0.07800941492938802,
      "grad_norm": 4.777318954467773,
      "learning_rate": 0.0001986436380670357,
      "loss": 1.0634,
      "step": 29
    },
    {
      "epoch": 0.08069939475453934,
      "grad_norm": 4.6226325035095215,
      "learning_rate": 0.00019849747389219272,
      "loss": 0.5563,
      "step": 30
    },
    {
      "epoch": 0.08338937457969066,
      "grad_norm": 4.621855735778809,
      "learning_rate": 0.0001983438914074654,
      "loss": 0.6377,
      "step": 31
    },
    {
      "epoch": 0.08607935440484196,
      "grad_norm": 4.654213905334473,
      "learning_rate": 0.00019818290217987587,
      "loss": 0.7768,
      "step": 32
    },
    {
      "epoch": 0.08876933422999328,
      "grad_norm": 4.971796989440918,
      "learning_rate": 0.00019801451833428312,
      "loss": 0.7793,
      "step": 33
    },
    {
      "epoch": 0.09145931405514458,
      "grad_norm": 4.303219318389893,
      "learning_rate": 0.00019783875255246973,
      "loss": 0.7334,
      "step": 34
    },
    {
      "epoch": 0.0941492938802959,
      "grad_norm": 4.168240547180176,
      "learning_rate": 0.0001976556180721867,
      "loss": 0.7872,
      "step": 35
    },
    {
      "epoch": 0.0968392737054472,
      "grad_norm": 3.7110626697540283,
      "learning_rate": 0.00019746512868615656,
      "loss": 0.9156,
      "step": 36
    },
    {
      "epoch": 0.09952925353059852,
      "grad_norm": 4.882535457611084,
      "learning_rate": 0.00019726729874103448,
      "loss": 1.1328,
      "step": 37
    },
    {
      "epoch": 0.10221923335574983,
      "grad_norm": 7.1945624351501465,
      "learning_rate": 0.00019706214313632784,
      "loss": 1.173,
      "step": 38
    },
    {
      "epoch": 0.10490921318090114,
      "grad_norm": 8.771895408630371,
      "learning_rate": 0.00019684967732327396,
      "loss": 0.9978,
      "step": 39
    },
    {
      "epoch": 0.10759919300605246,
      "grad_norm": 5.988046169281006,
      "learning_rate": 0.00019662991730367663,
      "loss": 0.7832,
      "step": 40
    },
    {
      "epoch": 0.11028917283120376,
      "grad_norm": 12.11080551147461,
      "learning_rate": 0.00019640287962870062,
      "loss": 1.593,
      "step": 41
    },
    {
      "epoch": 0.11297915265635508,
      "grad_norm": 8.772387504577637,
      "learning_rate": 0.00019616858139762534,
      "loss": 0.7483,
      "step": 42
    },
    {
      "epoch": 0.11566913248150638,
      "grad_norm": 8.320154190063477,
      "learning_rate": 0.000195927040256557,
      "loss": 0.7829,
      "step": 43
    },
    {
      "epoch": 0.1183591123066577,
      "grad_norm": 9.61361026763916,
      "learning_rate": 0.00019567827439709954,
      "loss": 1.014,
      "step": 44
    },
    {
      "epoch": 0.121049092131809,
      "grad_norm": 5.479886531829834,
      "learning_rate": 0.00019542230255498454,
      "loss": 0.5207,
      "step": 45
    },
    {
      "epoch": 0.12373907195696032,
      "grad_norm": 18.779264450073242,
      "learning_rate": 0.0001951591440086602,
      "loss": 1.5535,
      "step": 46
    },
    {
      "epoch": 0.12642905178211164,
      "grad_norm": 11.512892723083496,
      "learning_rate": 0.00019488881857783935,
      "loss": 1.3748,
      "step": 47
    },
    {
      "epoch": 0.12911903160726296,
      "grad_norm": 16.13918685913086,
      "learning_rate": 0.00019461134662200668,
      "loss": 1.1999,
      "step": 48
    },
    {
      "epoch": 0.13180901143241425,
      "grad_norm": 11.946172714233398,
      "learning_rate": 0.00019432674903888548,
      "loss": 0.7801,
      "step": 49
    },
    {
      "epoch": 0.13449899125756556,
      "grad_norm": 10.913107872009277,
      "learning_rate": 0.0001940350472628637,
      "loss": 0.4945,
      "step": 50
    },
    {
      "epoch": 0.13718897108271688,
      "grad_norm": 7.555224895477295,
      "learning_rate": 0.00019373626326337946,
      "loss": 1.7617,
      "step": 51
    },
    {
      "epoch": 0.1398789509078682,
      "grad_norm": 8.530354499816895,
      "learning_rate": 0.0001934304195432668,
      "loss": 2.1801,
      "step": 52
    },
    {
      "epoch": 0.1425689307330195,
      "grad_norm": 6.264369487762451,
      "learning_rate": 0.0001931175391370605,
      "loss": 1.514,
      "step": 53
    },
    {
      "epoch": 0.1452589105581708,
      "grad_norm": 5.179037094116211,
      "learning_rate": 0.00019279764560926142,
      "loss": 1.746,
      "step": 54
    },
    {
      "epoch": 0.14794889038332212,
      "grad_norm": 4.9991536140441895,
      "learning_rate": 0.00019247076305256176,
      "loss": 1.4403,
      "step": 55
    },
    {
      "epoch": 0.15063887020847344,
      "grad_norm": 5.004469871520996,
      "learning_rate": 0.00019213691608603047,
      "loss": 2.0104,
      "step": 56
    },
    {
      "epoch": 0.15332885003362476,
      "grad_norm": 5.55224609375,
      "learning_rate": 0.00019179612985325908,
      "loss": 1.5966,
      "step": 57
    },
    {
      "epoch": 0.15601882985877605,
      "grad_norm": 4.276426315307617,
      "learning_rate": 0.00019144843002046806,
      "loss": 1.3126,
      "step": 58
    },
    {
      "epoch": 0.15870880968392737,
      "grad_norm": 6.414350509643555,
      "learning_rate": 0.0001910938427745737,
      "loss": 1.4917,
      "step": 59
    },
    {
      "epoch": 0.16139878950907868,
      "grad_norm": 4.202977657318115,
      "learning_rate": 0.000190732394821216,
      "loss": 1.0693,
      "step": 60
    },
    {
      "epoch": 0.16408876933423,
      "grad_norm": 4.08539342880249,
      "learning_rate": 0.00019036411338274703,
      "loss": 1.0895,
      "step": 61
    },
    {
      "epoch": 0.16677874915938132,
      "grad_norm": 3.5492026805877686,
      "learning_rate": 0.00018998902619618116,
      "loss": 0.9263,
      "step": 62
    },
    {
      "epoch": 0.1694687289845326,
      "grad_norm": 3.8257992267608643,
      "learning_rate": 0.00018960716151110554,
      "loss": 1.3858,
      "step": 63
    },
    {
      "epoch": 0.17215870880968392,
      "grad_norm": 3.3236279487609863,
      "learning_rate": 0.00018921854808755294,
      "loss": 1.0358,
      "step": 64
    },
    {
      "epoch": 0.17484868863483524,
      "grad_norm": 3.356065511703491,
      "learning_rate": 0.00018882321519383534,
      "loss": 0.8704,
      "step": 65
    },
    {
      "epoch": 0.17753866845998656,
      "grad_norm": 4.296020984649658,
      "learning_rate": 0.00018842119260433982,
      "loss": 1.0503,
      "step": 66
    },
    {
      "epoch": 0.18022864828513785,
      "grad_norm": 5.693877696990967,
      "learning_rate": 0.00018801251059728604,
      "loss": 0.6922,
      "step": 67
    },
    {
      "epoch": 0.18291862811028917,
      "grad_norm": 5.999386310577393,
      "learning_rate": 0.0001875971999524458,
      "loss": 0.824,
      "step": 68
    },
    {
      "epoch": 0.18560860793544048,
      "grad_norm": 4.900099754333496,
      "learning_rate": 0.000187175291948825,
      "loss": 0.7182,
      "step": 69
    },
    {
      "epoch": 0.1882985877605918,
      "grad_norm": 4.141096115112305,
      "learning_rate": 0.0001867468183623077,
      "loss": 0.4006,
      "step": 70
    },
    {
      "epoch": 0.19098856758574312,
      "grad_norm": 6.806368350982666,
      "learning_rate": 0.00018631181146326305,
      "loss": 0.8936,
      "step": 71
    },
    {
      "epoch": 0.1936785474108944,
      "grad_norm": 2.945824146270752,
      "learning_rate": 0.0001858703040141148,
      "loss": 0.5178,
      "step": 72
    },
    {
      "epoch": 0.19636852723604573,
      "grad_norm": 6.985172271728516,
      "learning_rate": 0.00018542232926687383,
      "loss": 0.8644,
      "step": 73
    },
    {
      "epoch": 0.19905850706119704,
      "grad_norm": 5.231998920440674,
      "learning_rate": 0.0001849679209606338,
      "loss": 1.0585,
      "step": 74
    },
    {
      "epoch": 0.20174848688634836,
      "grad_norm": 4.8978705406188965,
      "learning_rate": 0.00018450711331903006,
      "loss": 0.7828,
      "step": 75
    },
    {
      "epoch": 0.20443846671149965,
      "grad_norm": 5.309878826141357,
      "learning_rate": 0.00018403994104766212,
      "loss": 0.7666,
      "step": 76
    },
    {
      "epoch": 0.20712844653665097,
      "grad_norm": 5.227763652801514,
      "learning_rate": 0.00018356643933147986,
      "loss": 0.8396,
      "step": 77
    },
    {
      "epoch": 0.20981842636180228,
      "grad_norm": 4.239619731903076,
      "learning_rate": 0.00018308664383213344,
      "loss": 0.6439,
      "step": 78
    },
    {
      "epoch": 0.2125084061869536,
      "grad_norm": 5.731531620025635,
      "learning_rate": 0.00018260059068528762,
      "loss": 1.3371,
      "step": 79
    },
    {
      "epoch": 0.21519838601210492,
      "grad_norm": 4.006597518920898,
      "learning_rate": 0.00018210831649790018,
      "loss": 0.5272,
      "step": 80
    },
    {
      "epoch": 0.2178883658372562,
      "grad_norm": 4.0596699714660645,
      "learning_rate": 0.00018160985834546475,
      "loss": 0.4416,
      "step": 81
    },
    {
      "epoch": 0.22057834566240753,
      "grad_norm": 4.053659915924072,
      "learning_rate": 0.00018110525376921862,
      "loss": 0.4781,
      "step": 82
    },
    {
      "epoch": 0.22326832548755884,
      "grad_norm": 4.120569705963135,
      "learning_rate": 0.00018059454077331527,
      "loss": 0.8082,
      "step": 83
    },
    {
      "epoch": 0.22595830531271016,
      "grad_norm": 6.420701503753662,
      "learning_rate": 0.00018007775782196214,
      "loss": 0.5476,
      "step": 84
    },
    {
      "epoch": 0.22864828513786148,
      "grad_norm": 13.603157997131348,
      "learning_rate": 0.00017955494383652365,
      "loss": 0.7857,
      "step": 85
    },
    {
      "epoch": 0.23133826496301277,
      "grad_norm": 7.784971237182617,
      "learning_rate": 0.00017902613819258985,
      "loss": 0.7705,
      "step": 86
    },
    {
      "epoch": 0.23402824478816409,
      "grad_norm": 5.966789722442627,
      "learning_rate": 0.00017849138071701092,
      "loss": 0.9065,
      "step": 87
    },
    {
      "epoch": 0.2367182246133154,
      "grad_norm": 9.451849937438965,
      "learning_rate": 0.0001779507116848976,
      "loss": 0.998,
      "step": 88
    },
    {
      "epoch": 0.23940820443846672,
      "grad_norm": 6.712332725524902,
      "learning_rate": 0.00017740417181658788,
      "loss": 1.1464,
      "step": 89
    },
    {
      "epoch": 0.242098184263618,
      "grad_norm": 4.188553333282471,
      "learning_rate": 0.00017685180227458003,
      "loss": 0.6356,
      "step": 90
    },
    {
      "epoch": 0.24478816408876933,
      "grad_norm": 9.703150749206543,
      "learning_rate": 0.00017629364466043273,
      "loss": 1.0548,
      "step": 91
    },
    {
      "epoch": 0.24747814391392065,
      "grad_norm": 6.08168363571167,
      "learning_rate": 0.00017572974101163165,
      "loss": 0.7252,
      "step": 92
    },
    {
      "epoch": 0.25016812373907193,
      "grad_norm": 10.167409896850586,
      "learning_rate": 0.00017516013379842337,
      "loss": 0.658,
      "step": 93
    },
    {
      "epoch": 0.25016812373907193,
      "eval_loss": 0.9310864210128784,
      "eval_runtime": 10.8326,
      "eval_samples_per_second": 14.493,
      "eval_steps_per_second": 7.293,
      "step": 93
    },
    {
      "epoch": 0.2528581035642233,
      "grad_norm": 5.78995418548584,
      "learning_rate": 0.00017458486592061704,
      "loss": 0.9346,
      "step": 94
    },
    {
      "epoch": 0.25554808338937457,
      "grad_norm": 3.517900228500366,
      "learning_rate": 0.00017400398070435293,
      "loss": 0.3506,
      "step": 95
    },
    {
      "epoch": 0.2582380632145259,
      "grad_norm": 5.804417610168457,
      "learning_rate": 0.00017341752189883983,
      "loss": 0.4959,
      "step": 96
    },
    {
      "epoch": 0.2609280430396772,
      "grad_norm": 8.148117065429688,
      "learning_rate": 0.00017282553367305975,
      "loss": 0.9842,
      "step": 97
    },
    {
      "epoch": 0.2636180228648285,
      "grad_norm": 9.511378288269043,
      "learning_rate": 0.0001722280606124415,
      "loss": 0.7143,
      "step": 98
    },
    {
      "epoch": 0.26630800268997984,
      "grad_norm": 6.079991340637207,
      "learning_rate": 0.00017162514771550255,
      "loss": 0.2979,
      "step": 99
    },
    {
      "epoch": 0.26899798251513113,
      "grad_norm": 6.114333152770996,
      "learning_rate": 0.00017101684039046036,
      "loss": 0.5812,
      "step": 100
    },
    {
      "epoch": 0.2716879623402825,
      "grad_norm": 4.91884183883667,
      "learning_rate": 0.0001704031844518121,
      "loss": 1.8317,
      "step": 101
    },
    {
      "epoch": 0.27437794216543376,
      "grad_norm": 5.735188007354736,
      "learning_rate": 0.0001697842261168843,
      "loss": 2.3345,
      "step": 102
    },
    {
      "epoch": 0.27706792199058505,
      "grad_norm": 5.317649841308594,
      "learning_rate": 0.0001691600120023521,
      "loss": 2.0851,
      "step": 103
    },
    {
      "epoch": 0.2797579018157364,
      "grad_norm": 7.778799057006836,
      "learning_rate": 0.00016853058912072802,
      "loss": 1.1674,
      "step": 104
    },
    {
      "epoch": 0.2824478816408877,
      "grad_norm": 4.196416854858398,
      "learning_rate": 0.00016789600487682156,
      "loss": 1.5939,
      "step": 105
    },
    {
      "epoch": 0.285137861466039,
      "grad_norm": 4.3927741050720215,
      "learning_rate": 0.0001672563070641688,
      "loss": 1.4615,
      "step": 106
    },
    {
      "epoch": 0.2878278412911903,
      "grad_norm": 4.284142017364502,
      "learning_rate": 0.0001666115438614328,
      "loss": 1.9508,
      "step": 107
    },
    {
      "epoch": 0.2905178211163416,
      "grad_norm": 5.4508867263793945,
      "learning_rate": 0.00016596176382877506,
      "loss": 1.3256,
      "step": 108
    },
    {
      "epoch": 0.29320780094149296,
      "grad_norm": 11.987678527832031,
      "learning_rate": 0.00016530701590419824,
      "loss": 0.9202,
      "step": 109
    },
    {
      "epoch": 0.29589778076664425,
      "grad_norm": 5.667636394500732,
      "learning_rate": 0.00016464734939986036,
      "loss": 1.3247,
      "step": 110
    },
    {
      "epoch": 0.29858776059179554,
      "grad_norm": 3.8087687492370605,
      "learning_rate": 0.00016398281399836097,
      "loss": 0.9626,
      "step": 111
    },
    {
      "epoch": 0.3012777404169469,
      "grad_norm": 5.772204875946045,
      "learning_rate": 0.00016331345974899923,
      "loss": 1.3912,
      "step": 112
    },
    {
      "epoch": 0.30396772024209817,
      "grad_norm": 3.2174160480499268,
      "learning_rate": 0.00016263933706400451,
      "loss": 1.0545,
      "step": 113
    },
    {
      "epoch": 0.3066577000672495,
      "grad_norm": 3.539743423461914,
      "learning_rate": 0.00016196049671473954,
      "loss": 0.9489,
      "step": 114
    },
    {
      "epoch": 0.3093476798924008,
      "grad_norm": 3.6935033798217773,
      "learning_rate": 0.0001612769898278766,
      "loss": 1.0005,
      "step": 115
    },
    {
      "epoch": 0.3120376597175521,
      "grad_norm": 3.477961301803589,
      "learning_rate": 0.00016058886788154712,
      "loss": 0.6155,
      "step": 116
    },
    {
      "epoch": 0.31472763954270344,
      "grad_norm": 3.9399242401123047,
      "learning_rate": 0.00015989618270146423,
      "loss": 0.7689,
      "step": 117
    },
    {
      "epoch": 0.31741761936785473,
      "grad_norm": 4.4496846199035645,
      "learning_rate": 0.0001591989864570199,
      "loss": 1.0174,
      "step": 118
    },
    {
      "epoch": 0.3201075991930061,
      "grad_norm": 4.519758224487305,
      "learning_rate": 0.00015849733165735556,
      "loss": 0.9051,
      "step": 119
    },
    {
      "epoch": 0.32279757901815737,
      "grad_norm": 3.636235237121582,
      "learning_rate": 0.00015779127114740757,
      "loss": 0.5993,
      "step": 120
    },
    {
      "epoch": 0.32548755884330866,
      "grad_norm": 2.2947537899017334,
      "learning_rate": 0.0001570808581039271,
      "loss": 0.23,
      "step": 121
    },
    {
      "epoch": 0.32817753866846,
      "grad_norm": 3.0490782260894775,
      "learning_rate": 0.00015636614603147512,
      "loss": 0.5818,
      "step": 122
    },
    {
      "epoch": 0.3308675184936113,
      "grad_norm": 3.2933220863342285,
      "learning_rate": 0.0001556471887583929,
      "loss": 0.6548,
      "step": 123
    },
    {
      "epoch": 0.33355749831876264,
      "grad_norm": 4.488528251647949,
      "learning_rate": 0.0001549240404327477,
      "loss": 0.9628,
      "step": 124
    },
    {
      "epoch": 0.3362474781439139,
      "grad_norm": 4.679425239562988,
      "learning_rate": 0.00015419675551825475,
      "loss": 0.4106,
      "step": 125
    },
    {
      "epoch": 0.3389374579690652,
      "grad_norm": 4.400868892669678,
      "learning_rate": 0.0001534653887901754,
      "loss": 0.3852,
      "step": 126
    },
    {
      "epoch": 0.34162743779421656,
      "grad_norm": 4.978918552398682,
      "learning_rate": 0.00015272999533119162,
      "loss": 0.8162,
      "step": 127
    },
    {
      "epoch": 0.34431741761936785,
      "grad_norm": 5.046586990356445,
      "learning_rate": 0.00015199063052725745,
      "loss": 0.649,
      "step": 128
    },
    {
      "epoch": 0.34700739744451914,
      "grad_norm": 7.412467956542969,
      "learning_rate": 0.0001512473500634277,
      "loss": 0.6579,
      "step": 129
    },
    {
      "epoch": 0.3496973772696705,
      "grad_norm": 3.8262441158294678,
      "learning_rate": 0.00015050020991966406,
      "loss": 0.4359,
      "step": 130
    },
    {
      "epoch": 0.3523873570948218,
      "grad_norm": 5.179169654846191,
      "learning_rate": 0.0001497492663666189,
      "loss": 0.5676,
      "step": 131
    },
    {
      "epoch": 0.3550773369199731,
      "grad_norm": 5.74229097366333,
      "learning_rate": 0.00014899457596139729,
      "loss": 0.3635,
      "step": 132
    },
    {
      "epoch": 0.3577673167451244,
      "grad_norm": 7.098540782928467,
      "learning_rate": 0.00014823619554329745,
      "loss": 0.996,
      "step": 133
    },
    {
      "epoch": 0.3604572965702757,
      "grad_norm": 4.635382652282715,
      "learning_rate": 0.00014747418222952995,
      "loss": 0.7149,
      "step": 134
    },
    {
      "epoch": 0.36314727639542704,
      "grad_norm": 3.750243663787842,
      "learning_rate": 0.0001467085934109158,
      "loss": 0.3169,
      "step": 135
    },
    {
      "epoch": 0.36583725622057833,
      "grad_norm": 4.545015811920166,
      "learning_rate": 0.00014593948674756417,
      "loss": 0.5511,
      "step": 136
    },
    {
      "epoch": 0.3685272360457297,
      "grad_norm": 5.990297794342041,
      "learning_rate": 0.0001451669201645298,
      "loss": 0.766,
      "step": 137
    },
    {
      "epoch": 0.37121721587088097,
      "grad_norm": 3.692354679107666,
      "learning_rate": 0.00014439095184745024,
      "loss": 0.4151,
      "step": 138
    },
    {
      "epoch": 0.37390719569603226,
      "grad_norm": 3.4247729778289795,
      "learning_rate": 0.00014361164023816376,
      "loss": 0.466,
      "step": 139
    },
    {
      "epoch": 0.3765971755211836,
      "grad_norm": 3.962257146835327,
      "learning_rate": 0.00014282904403030772,
      "loss": 0.4263,
      "step": 140
    },
    {
      "epoch": 0.3792871553463349,
      "grad_norm": 5.7197771072387695,
      "learning_rate": 0.00014204322216489814,
      "loss": 0.4988,
      "step": 141
    },
    {
      "epoch": 0.38197713517148624,
      "grad_norm": 5.587864398956299,
      "learning_rate": 0.00014125423382589048,
      "loss": 0.6946,
      "step": 142
    },
    {
      "epoch": 0.3846671149966375,
      "grad_norm": 11.981307029724121,
      "learning_rate": 0.00014046213843572236,
      "loss": 0.7456,
      "step": 143
    },
    {
      "epoch": 0.3873570948217888,
      "grad_norm": 6.747979164123535,
      "learning_rate": 0.00013966699565083802,
      "loss": 1.2804,
      "step": 144
    },
    {
      "epoch": 0.39004707464694016,
      "grad_norm": 4.663575649261475,
      "learning_rate": 0.0001388688653571954,
      "loss": 0.5548,
      "step": 145
    },
    {
      "epoch": 0.39273705447209145,
      "grad_norm": 5.274585247039795,
      "learning_rate": 0.00013806780766575588,
      "loss": 0.6681,
      "step": 146
    },
    {
      "epoch": 0.3954270342972428,
      "grad_norm": 13.038918495178223,
      "learning_rate": 0.00013726388290795697,
      "loss": 1.082,
      "step": 147
    },
    {
      "epoch": 0.3981170141223941,
      "grad_norm": 7.035642623901367,
      "learning_rate": 0.00013645715163116846,
      "loss": 0.3975,
      "step": 148
    },
    {
      "epoch": 0.4008069939475454,
      "grad_norm": 5.065128326416016,
      "learning_rate": 0.00013564767459413237,
      "loss": 0.2747,
      "step": 149
    },
    {
      "epoch": 0.4034969737726967,
      "grad_norm": 4.475830554962158,
      "learning_rate": 0.0001348355127623869,
      "loss": 0.2169,
      "step": 150
    },
    {
      "epoch": 0.406186953597848,
      "grad_norm": 4.0652031898498535,
      "learning_rate": 0.00013402072730367475,
      "loss": 1.7546,
      "step": 151
    },
    {
      "epoch": 0.4088769334229993,
      "grad_norm": 4.62870454788208,
      "learning_rate": 0.0001332033795833364,
      "loss": 1.5081,
      "step": 152
    },
    {
      "epoch": 0.41156691324815065,
      "grad_norm": 3.8758082389831543,
      "learning_rate": 0.0001323835311596884,
      "loss": 1.371,
      "step": 153
    },
    {
      "epoch": 0.41425689307330194,
      "grad_norm": 4.078228950500488,
      "learning_rate": 0.00013156124377938699,
      "loss": 1.5507,
      "step": 154
    },
    {
      "epoch": 0.4169468728984533,
      "grad_norm": 3.6525630950927734,
      "learning_rate": 0.0001307365793727778,
      "loss": 1.1093,
      "step": 155
    },
    {
      "epoch": 0.41963685272360457,
      "grad_norm": 4.3088202476501465,
      "learning_rate": 0.00012990960004923154,
      "loss": 1.6154,
      "step": 156
    },
    {
      "epoch": 0.42232683254875586,
      "grad_norm": 4.335425853729248,
      "learning_rate": 0.00012908036809246623,
      "loss": 1.4037,
      "step": 157
    },
    {
      "epoch": 0.4250168123739072,
      "grad_norm": 3.7850985527038574,
      "learning_rate": 0.00012824894595585637,
      "loss": 1.1471,
      "step": 158
    },
    {
      "epoch": 0.4277067921990585,
      "grad_norm": 4.085525035858154,
      "learning_rate": 0.00012741539625772918,
      "loss": 1.2586,
      "step": 159
    },
    {
      "epoch": 0.43039677202420984,
      "grad_norm": 3.4970481395721436,
      "learning_rate": 0.0001265797817766486,
      "loss": 1.1133,
      "step": 160
    },
    {
      "epoch": 0.43308675184936113,
      "grad_norm": 4.015367031097412,
      "learning_rate": 0.0001257421654466872,
      "loss": 0.71,
      "step": 161
    },
    {
      "epoch": 0.4357767316745124,
      "grad_norm": 3.805530071258545,
      "learning_rate": 0.00012490261035268612,
      "loss": 1.4369,
      "step": 162
    },
    {
      "epoch": 0.43846671149966376,
      "grad_norm": 4.442086696624756,
      "learning_rate": 0.00012406117972550414,
      "loss": 1.1577,
      "step": 163
    },
    {
      "epoch": 0.44115669132481505,
      "grad_norm": 3.171997308731079,
      "learning_rate": 0.00012321793693725509,
      "loss": 0.667,
      "step": 164
    },
    {
      "epoch": 0.4438466711499664,
      "grad_norm": 4.185075759887695,
      "learning_rate": 0.0001223729454965354,
      "loss": 0.7278,
      "step": 165
    },
    {
      "epoch": 0.4465366509751177,
      "grad_norm": 3.8975086212158203,
      "learning_rate": 0.00012152626904364067,
      "loss": 0.9939,
      "step": 166
    },
    {
      "epoch": 0.449226630800269,
      "grad_norm": 3.1474146842956543,
      "learning_rate": 0.00012067797134577275,
      "loss": 0.7392,
      "step": 167
    },
    {
      "epoch": 0.4519166106254203,
      "grad_norm": 3.5632522106170654,
      "learning_rate": 0.00011982811629223709,
      "loss": 0.8636,
      "step": 168
    },
    {
      "epoch": 0.4546065904505716,
      "grad_norm": 2.6525533199310303,
      "learning_rate": 0.00011897676788963101,
      "loss": 0.3818,
      "step": 169
    },
    {
      "epoch": 0.45729657027572296,
      "grad_norm": 3.889469861984253,
      "learning_rate": 0.0001181239902570229,
      "loss": 0.5985,
      "step": 170
    },
    {
      "epoch": 0.45998655010087425,
      "grad_norm": 3.6286370754241943,
      "learning_rate": 0.00011726984762112328,
      "loss": 0.8639,
      "step": 171
    },
    {
      "epoch": 0.46267652992602554,
      "grad_norm": 2.5282163619995117,
      "learning_rate": 0.0001164144043114475,
      "loss": 0.3303,
      "step": 172
    },
    {
      "epoch": 0.4653665097511769,
      "grad_norm": 4.00683069229126,
      "learning_rate": 0.00011555772475547084,
      "loss": 0.414,
      "step": 173
    },
    {
      "epoch": 0.46805648957632817,
      "grad_norm": 5.255921363830566,
      "learning_rate": 0.00011469987347377602,
      "loss": 0.8622,
      "step": 174
    },
    {
      "epoch": 0.47074646940147946,
      "grad_norm": 4.201918601989746,
      "learning_rate": 0.00011384091507519403,
      "loss": 0.8862,
      "step": 175
    },
    {
      "epoch": 0.4734364492266308,
      "grad_norm": 4.199880599975586,
      "learning_rate": 0.00011298091425193806,
      "loss": 0.4554,
      "step": 176
    },
    {
      "epoch": 0.4761264290517821,
      "grad_norm": 3.6669838428497314,
      "learning_rate": 0.00011211993577473121,
      "loss": 0.343,
      "step": 177
    },
    {
      "epoch": 0.47881640887693344,
      "grad_norm": 4.186169147491455,
      "learning_rate": 0.00011125804448792831,
      "loss": 0.8039,
      "step": 178
    },
    {
      "epoch": 0.48150638870208473,
      "grad_norm": 4.209519386291504,
      "learning_rate": 0.00011039530530463218,
      "loss": 0.3221,
      "step": 179
    },
    {
      "epoch": 0.484196368527236,
      "grad_norm": 2.875875234603882,
      "learning_rate": 0.00010953178320180475,
      "loss": 0.2874,
      "step": 180
    },
    {
      "epoch": 0.48688634835238737,
      "grad_norm": 4.24071741104126,
      "learning_rate": 0.00010866754321537338,
      "loss": 0.5502,
      "step": 181
    },
    {
      "epoch": 0.48957632817753866,
      "grad_norm": 4.230165481567383,
      "learning_rate": 0.0001078026504353325,
      "loss": 0.5396,
      "step": 182
    },
    {
      "epoch": 0.49226630800269,
      "grad_norm": 4.387772560119629,
      "learning_rate": 0.0001069371700008416,
      "loss": 0.5987,
      "step": 183
    },
    {
      "epoch": 0.4949562878278413,
      "grad_norm": 4.988356113433838,
      "learning_rate": 0.00010607116709531918,
      "loss": 0.6046,
      "step": 184
    },
    {
      "epoch": 0.4976462676529926,
      "grad_norm": 4.388515472412109,
      "learning_rate": 0.00010520470694153353,
      "loss": 0.595,
      "step": 185
    },
    {
      "epoch": 0.5003362474781439,
      "grad_norm": 4.310067653656006,
      "learning_rate": 0.00010433785479669038,
      "loss": 0.5557,
      "step": 186
    },
    {
      "epoch": 0.5003362474781439,
      "eval_loss": 0.8278390765190125,
      "eval_runtime": 10.698,
      "eval_samples_per_second": 14.676,
      "eval_steps_per_second": 7.385,
      "step": 186
    },
    {
      "epoch": 0.5030262273032953,
      "grad_norm": 4.27438497543335,
      "learning_rate": 0.0001034706759475182,
      "loss": 0.7565,
      "step": 187
    },
    {
      "epoch": 0.5057162071284466,
      "grad_norm": 6.716769218444824,
      "learning_rate": 0.0001026032357053512,
      "loss": 0.8371,
      "step": 188
    },
    {
      "epoch": 0.5084061869535978,
      "grad_norm": 2.8589985370635986,
      "learning_rate": 0.0001017355994012102,
      "loss": 0.2835,
      "step": 189
    },
    {
      "epoch": 0.5110961667787491,
      "grad_norm": 2.675102949142456,
      "learning_rate": 0.00010086783238088244,
      "loss": 0.2332,
      "step": 190
    },
    {
      "epoch": 0.5137861466039004,
      "grad_norm": 5.922438144683838,
      "learning_rate": 0.0001,
      "loss": 0.3974,
      "step": 191
    },
    {
      "epoch": 0.5164761264290518,
      "grad_norm": 9.693557739257812,
      "learning_rate": 9.913216761911755e-05,
      "loss": 0.8956,
      "step": 192
    },
    {
      "epoch": 0.5191661062542031,
      "grad_norm": 5.175564765930176,
      "learning_rate": 9.826440059878982e-05,
      "loss": 0.4924,
      "step": 193
    },
    {
      "epoch": 0.5218560860793544,
      "grad_norm": 4.789131164550781,
      "learning_rate": 9.739676429464881e-05,
      "loss": 0.6214,
      "step": 194
    },
    {
      "epoch": 0.5245460659045057,
      "grad_norm": 4.584465026855469,
      "learning_rate": 9.652932405248181e-05,
      "loss": 0.6807,
      "step": 195
    },
    {
      "epoch": 0.527236045729657,
      "grad_norm": 8.758447647094727,
      "learning_rate": 9.566214520330966e-05,
      "loss": 1.2434,
      "step": 196
    },
    {
      "epoch": 0.5299260255548084,
      "grad_norm": 8.869791984558105,
      "learning_rate": 9.479529305846652e-05,
      "loss": 0.903,
      "step": 197
    },
    {
      "epoch": 0.5326160053799597,
      "grad_norm": 3.716257333755493,
      "learning_rate": 9.392883290468083e-05,
      "loss": 0.3255,
      "step": 198
    },
    {
      "epoch": 0.535305985205111,
      "grad_norm": 5.400476455688477,
      "learning_rate": 9.306282999915839e-05,
      "loss": 0.4036,
      "step": 199
    },
    {
      "epoch": 0.5379959650302623,
      "grad_norm": 3.5948078632354736,
      "learning_rate": 9.219734956466752e-05,
      "loss": 0.1489,
      "step": 200
    },
    {
      "epoch": 0.5406859448554135,
      "grad_norm": 3.1145272254943848,
      "learning_rate": 9.133245678462663e-05,
      "loss": 1.6383,
      "step": 201
    },
    {
      "epoch": 0.543375924680565,
      "grad_norm": 3.916707754135132,
      "learning_rate": 9.046821679819527e-05,
      "loss": 1.7301,
      "step": 202
    },
    {
      "epoch": 0.5460659045057162,
      "grad_norm": 3.5734024047851562,
      "learning_rate": 8.960469469536786e-05,
      "loss": 1.1493,
      "step": 203
    },
    {
      "epoch": 0.5487558843308675,
      "grad_norm": 3.514202356338501,
      "learning_rate": 8.874195551207174e-05,
      "loss": 1.2045,
      "step": 204
    },
    {
      "epoch": 0.5514458641560188,
      "grad_norm": 3.9733996391296387,
      "learning_rate": 8.788006422526881e-05,
      "loss": 1.8557,
      "step": 205
    },
    {
      "epoch": 0.5541358439811701,
      "grad_norm": 4.543676853179932,
      "learning_rate": 8.701908574806197e-05,
      "loss": 1.3852,
      "step": 206
    },
    {
      "epoch": 0.5568258238063215,
      "grad_norm": 3.916227102279663,
      "learning_rate": 8.615908492480598e-05,
      "loss": 1.1968,
      "step": 207
    },
    {
      "epoch": 0.5595158036314728,
      "grad_norm": 4.097233295440674,
      "learning_rate": 8.530012652622397e-05,
      "loss": 1.2521,
      "step": 208
    },
    {
      "epoch": 0.5622057834566241,
      "grad_norm": 3.402857780456543,
      "learning_rate": 8.444227524452918e-05,
      "loss": 0.8308,
      "step": 209
    },
    {
      "epoch": 0.5648957632817754,
      "grad_norm": 4.228405952453613,
      "learning_rate": 8.358559568855249e-05,
      "loss": 1.1485,
      "step": 210
    },
    {
      "epoch": 0.5675857431069267,
      "grad_norm": 3.699495553970337,
      "learning_rate": 8.273015237887673e-05,
      "loss": 0.8204,
      "step": 211
    },
    {
      "epoch": 0.570275722932078,
      "grad_norm": 3.4216742515563965,
      "learning_rate": 8.187600974297714e-05,
      "loss": 0.7515,
      "step": 212
    },
    {
      "epoch": 0.5729657027572294,
      "grad_norm": 3.6286754608154297,
      "learning_rate": 8.102323211036904e-05,
      "loss": 0.6262,
      "step": 213
    },
    {
      "epoch": 0.5756556825823806,
      "grad_norm": 4.869045734405518,
      "learning_rate": 8.017188370776292e-05,
      "loss": 0.9851,
      "step": 214
    },
    {
      "epoch": 0.5783456624075319,
      "grad_norm": 5.106837749481201,
      "learning_rate": 7.932202865422726e-05,
      "loss": 1.122,
      "step": 215
    },
    {
      "epoch": 0.5810356422326832,
      "grad_norm": 4.490080833435059,
      "learning_rate": 7.847373095635937e-05,
      "loss": 0.8354,
      "step": 216
    },
    {
      "epoch": 0.5837256220578345,
      "grad_norm": 3.4671199321746826,
      "learning_rate": 7.762705450346462e-05,
      "loss": 0.8034,
      "step": 217
    },
    {
      "epoch": 0.5864156018829859,
      "grad_norm": 5.198472499847412,
      "learning_rate": 7.678206306274495e-05,
      "loss": 0.9827,
      "step": 218
    },
    {
      "epoch": 0.5891055817081372,
      "grad_norm": 3.910879135131836,
      "learning_rate": 7.59388202744959e-05,
      "loss": 0.5839,
      "step": 219
    },
    {
      "epoch": 0.5917955615332885,
      "grad_norm": 3.6499719619750977,
      "learning_rate": 7.509738964731389e-05,
      "loss": 0.4724,
      "step": 220
    },
    {
      "epoch": 0.5944855413584398,
      "grad_norm": 4.461284160614014,
      "learning_rate": 7.425783455331281e-05,
      "loss": 0.6628,
      "step": 221
    },
    {
      "epoch": 0.5971755211835911,
      "grad_norm": 3.570058584213257,
      "learning_rate": 7.342021822335143e-05,
      "loss": 0.455,
      "step": 222
    },
    {
      "epoch": 0.5998655010087425,
      "grad_norm": 4.93247652053833,
      "learning_rate": 7.258460374227085e-05,
      "loss": 1.1265,
      "step": 223
    },
    {
      "epoch": 0.6025554808338938,
      "grad_norm": 3.4849631786346436,
      "learning_rate": 7.175105404414362e-05,
      "loss": 0.3686,
      "step": 224
    },
    {
      "epoch": 0.605245460659045,
      "grad_norm": 2.503511428833008,
      "learning_rate": 7.091963190753376e-05,
      "loss": 0.3223,
      "step": 225
    },
    {
      "epoch": 0.6079354404841963,
      "grad_norm": 4.293323040008545,
      "learning_rate": 7.009039995076844e-05,
      "loss": 0.5115,
      "step": 226
    },
    {
      "epoch": 0.6106254203093476,
      "grad_norm": 4.681167125701904,
      "learning_rate": 6.926342062722223e-05,
      "loss": 0.8861,
      "step": 227
    },
    {
      "epoch": 0.613315400134499,
      "grad_norm": 4.363972187042236,
      "learning_rate": 6.843875622061304e-05,
      "loss": 0.714,
      "step": 228
    },
    {
      "epoch": 0.6160053799596503,
      "grad_norm": 4.711676597595215,
      "learning_rate": 6.761646884031164e-05,
      "loss": 0.9218,
      "step": 229
    },
    {
      "epoch": 0.6186953597848016,
      "grad_norm": 5.378860950469971,
      "learning_rate": 6.679662041666362e-05,
      "loss": 0.8466,
      "step": 230
    },
    {
      "epoch": 0.6213853396099529,
      "grad_norm": 4.945761203765869,
      "learning_rate": 6.597927269632526e-05,
      "loss": 0.8303,
      "step": 231
    },
    {
      "epoch": 0.6240753194351042,
      "grad_norm": 5.643571853637695,
      "learning_rate": 6.516448723761315e-05,
      "loss": 0.9418,
      "step": 232
    },
    {
      "epoch": 0.6267652992602556,
      "grad_norm": 3.7865352630615234,
      "learning_rate": 6.435232540586763e-05,
      "loss": 0.447,
      "step": 233
    },
    {
      "epoch": 0.6294552790854069,
      "grad_norm": 4.775557994842529,
      "learning_rate": 6.354284836883156e-05,
      "loss": 0.7068,
      "step": 234
    },
    {
      "epoch": 0.6321452589105582,
      "grad_norm": 4.313781261444092,
      "learning_rate": 6.273611709204304e-05,
      "loss": 0.4621,
      "step": 235
    },
    {
      "epoch": 0.6348352387357095,
      "grad_norm": 3.1749980449676514,
      "learning_rate": 6.193219233424414e-05,
      "loss": 0.2996,
      "step": 236
    },
    {
      "epoch": 0.6375252185608608,
      "grad_norm": 5.823219299316406,
      "learning_rate": 6.11311346428046e-05,
      "loss": 0.7987,
      "step": 237
    },
    {
      "epoch": 0.6402151983860122,
      "grad_norm": 4.825180530548096,
      "learning_rate": 6.033300434916203e-05,
      "loss": 0.9309,
      "step": 238
    },
    {
      "epoch": 0.6429051782111634,
      "grad_norm": 7.609663486480713,
      "learning_rate": 5.9537861564277654e-05,
      "loss": 0.6973,
      "step": 239
    },
    {
      "epoch": 0.6455951580363147,
      "grad_norm": 3.5007476806640625,
      "learning_rate": 5.8745766174109495e-05,
      "loss": 0.5831,
      "step": 240
    },
    {
      "epoch": 0.648285137861466,
      "grad_norm": 4.732518196105957,
      "learning_rate": 5.795677783510187e-05,
      "loss": 0.447,
      "step": 241
    },
    {
      "epoch": 0.6509751176866173,
      "grad_norm": 4.874922752380371,
      "learning_rate": 5.7170955969692265e-05,
      "loss": 0.429,
      "step": 242
    },
    {
      "epoch": 0.6536650975117687,
      "grad_norm": 2.738624095916748,
      "learning_rate": 5.638835976183627e-05,
      "loss": 0.316,
      "step": 243
    },
    {
      "epoch": 0.65635507733692,
      "grad_norm": 4.868707180023193,
      "learning_rate": 5.5609048152549794e-05,
      "loss": 0.528,
      "step": 244
    },
    {
      "epoch": 0.6590450571620713,
      "grad_norm": 2.545112371444702,
      "learning_rate": 5.483307983547026e-05,
      "loss": 0.2108,
      "step": 245
    },
    {
      "epoch": 0.6617350369872226,
      "grad_norm": 5.225026607513428,
      "learning_rate": 5.406051325243586e-05,
      "loss": 0.6883,
      "step": 246
    },
    {
      "epoch": 0.6644250168123739,
      "grad_norm": 6.781031608581543,
      "learning_rate": 5.329140658908423e-05,
      "loss": 0.7684,
      "step": 247
    },
    {
      "epoch": 0.6671149966375253,
      "grad_norm": 4.9204630851745605,
      "learning_rate": 5.2525817770470084e-05,
      "loss": 0.529,
      "step": 248
    },
    {
      "epoch": 0.6698049764626766,
      "grad_norm": 4.677596092224121,
      "learning_rate": 5.1763804456702545e-05,
      "loss": 0.1916,
      "step": 249
    },
    {
      "epoch": 0.6724949562878278,
      "grad_norm": 5.289106369018555,
      "learning_rate": 5.1005424038602724e-05,
      "loss": 0.5018,
      "step": 250
    },
    {
      "epoch": 0.6751849361129791,
      "grad_norm": 3.4332501888275146,
      "learning_rate": 5.025073363338111e-05,
      "loss": 1.4558,
      "step": 251
    },
    {
      "epoch": 0.6778749159381304,
      "grad_norm": 3.6167006492614746,
      "learning_rate": 4.949979008033596e-05,
      "loss": 1.3181,
      "step": 252
    },
    {
      "epoch": 0.6805648957632818,
      "grad_norm": 4.852591514587402,
      "learning_rate": 4.8752649936572304e-05,
      "loss": 1.5339,
      "step": 253
    },
    {
      "epoch": 0.6832548755884331,
      "grad_norm": 4.116457462310791,
      "learning_rate": 4.800936947274255e-05,
      "loss": 1.8746,
      "step": 254
    },
    {
      "epoch": 0.6859448554135844,
      "grad_norm": 3.9800620079040527,
      "learning_rate": 4.7270004668808397e-05,
      "loss": 1.8474,
      "step": 255
    },
    {
      "epoch": 0.6886348352387357,
      "grad_norm": 3.9870662689208984,
      "learning_rate": 4.65346112098246e-05,
      "loss": 1.148,
      "step": 256
    },
    {
      "epoch": 0.691324815063887,
      "grad_norm": 4.0448503494262695,
      "learning_rate": 4.5803244481745275e-05,
      "loss": 1.0557,
      "step": 257
    },
    {
      "epoch": 0.6940147948890383,
      "grad_norm": 4.282130241394043,
      "learning_rate": 4.5075959567252335e-05,
      "loss": 1.2731,
      "step": 258
    },
    {
      "epoch": 0.6967047747141897,
      "grad_norm": 4.488638401031494,
      "learning_rate": 4.435281124160715e-05,
      "loss": 1.3722,
      "step": 259
    },
    {
      "epoch": 0.699394754539341,
      "grad_norm": 3.9812207221984863,
      "learning_rate": 4.363385396852491e-05,
      "loss": 1.3536,
      "step": 260
    },
    {
      "epoch": 0.7020847343644923,
      "grad_norm": 5.047592639923096,
      "learning_rate": 4.291914189607297e-05,
      "loss": 0.7947,
      "step": 261
    },
    {
      "epoch": 0.7047747141896435,
      "grad_norm": 4.314056396484375,
      "learning_rate": 4.220872885259247e-05,
      "loss": 0.6368,
      "step": 262
    },
    {
      "epoch": 0.7074646940147948,
      "grad_norm": 4.209965705871582,
      "learning_rate": 4.1502668342644455e-05,
      "loss": 0.7786,
      "step": 263
    },
    {
      "epoch": 0.7101546738399462,
      "grad_norm": 5.0463151931762695,
      "learning_rate": 4.080101354298016e-05,
      "loss": 0.5957,
      "step": 264
    },
    {
      "epoch": 0.7128446536650975,
      "grad_norm": 5.0057373046875,
      "learning_rate": 4.0103817298535794e-05,
      "loss": 1.1632,
      "step": 265
    },
    {
      "epoch": 0.7155346334902488,
      "grad_norm": 4.640236854553223,
      "learning_rate": 3.9411132118452896e-05,
      "loss": 1.523,
      "step": 266
    },
    {
      "epoch": 0.7182246133154001,
      "grad_norm": 3.843372344970703,
      "learning_rate": 3.872301017212337e-05,
      "loss": 1.0536,
      "step": 267
    },
    {
      "epoch": 0.7209145931405514,
      "grad_norm": 4.180513858795166,
      "learning_rate": 3.8039503285260506e-05,
      "loss": 0.7683,
      "step": 268
    },
    {
      "epoch": 0.7236045729657028,
      "grad_norm": 7.624268054962158,
      "learning_rate": 3.73606629359955e-05,
      "loss": 0.817,
      "step": 269
    },
    {
      "epoch": 0.7262945527908541,
      "grad_norm": 5.006471633911133,
      "learning_rate": 3.6686540251000756e-05,
      "loss": 0.899,
      "step": 270
    },
    {
      "epoch": 0.7289845326160054,
      "grad_norm": 4.611303806304932,
      "learning_rate": 3.6017186001639036e-05,
      "loss": 0.3317,
      "step": 271
    },
    {
      "epoch": 0.7316745124411567,
      "grad_norm": 4.701048374176025,
      "learning_rate": 3.535265060013965e-05,
      "loss": 0.9114,
      "step": 272
    },
    {
      "epoch": 0.734364492266308,
      "grad_norm": 5.010507583618164,
      "learning_rate": 3.4692984095801796e-05,
      "loss": 0.6747,
      "step": 273
    },
    {
      "epoch": 0.7370544720914594,
      "grad_norm": 4.114429473876953,
      "learning_rate": 3.4038236171224946e-05,
      "loss": 0.5408,
      "step": 274
    },
    {
      "epoch": 0.7397444519166106,
      "grad_norm": 5.027583122253418,
      "learning_rate": 3.3388456138567225e-05,
      "loss": 0.4589,
      "step": 275
    },
    {
      "epoch": 0.7424344317417619,
      "grad_norm": 5.864731788635254,
      "learning_rate": 3.274369293583121e-05,
      "loss": 0.9165,
      "step": 276
    },
    {
      "epoch": 0.7451244115669132,
      "grad_norm": 4.747946739196777,
      "learning_rate": 3.210399512317849e-05,
      "loss": 0.5087,
      "step": 277
    },
    {
      "epoch": 0.7478143913920645,
      "grad_norm": 3.576078414916992,
      "learning_rate": 3.146941087927203e-05,
      "loss": 0.5728,
      "step": 278
    },
    {
      "epoch": 0.7505043712172159,
      "grad_norm": 2.740264892578125,
      "learning_rate": 3.0839987997647935e-05,
      "loss": 0.2307,
      "step": 279
    },
    {
      "epoch": 0.7505043712172159,
      "eval_loss": 0.7389675974845886,
      "eval_runtime": 10.7313,
      "eval_samples_per_second": 14.63,
      "eval_steps_per_second": 7.362,
      "step": 279
    },
    {
      "epoch": 0.7531943510423672,
      "grad_norm": 3.4270105361938477,
      "learning_rate": 3.0215773883115706e-05,
      "loss": 0.5658,
      "step": 280
    },
    {
      "epoch": 0.7558843308675185,
      "grad_norm": 3.4167211055755615,
      "learning_rate": 2.9596815548187908e-05,
      "loss": 0.1781,
      "step": 281
    },
    {
      "epoch": 0.7585743106926698,
      "grad_norm": 3.9443657398223877,
      "learning_rate": 2.8983159609539635e-05,
      "loss": 0.5545,
      "step": 282
    },
    {
      "epoch": 0.7612642905178211,
      "grad_norm": 3.164463758468628,
      "learning_rate": 2.8374852284497446e-05,
      "loss": 0.334,
      "step": 283
    },
    {
      "epoch": 0.7639542703429725,
      "grad_norm": 3.6277055740356445,
      "learning_rate": 2.7771939387558554e-05,
      "loss": 0.411,
      "step": 284
    },
    {
      "epoch": 0.7666442501681238,
      "grad_norm": 4.296345233917236,
      "learning_rate": 2.717446632694025e-05,
      "loss": 0.3111,
      "step": 285
    },
    {
      "epoch": 0.769334229993275,
      "grad_norm": 4.06040096282959,
      "learning_rate": 2.6582478101160167e-05,
      "loss": 0.4634,
      "step": 286
    },
    {
      "epoch": 0.7720242098184263,
      "grad_norm": 4.600436687469482,
      "learning_rate": 2.599601929564709e-05,
      "loss": 0.6998,
      "step": 287
    },
    {
      "epoch": 0.7747141896435776,
      "grad_norm": 3.8486735820770264,
      "learning_rate": 2.5415134079383006e-05,
      "loss": 0.3987,
      "step": 288
    },
    {
      "epoch": 0.777404169468729,
      "grad_norm": 5.362851142883301,
      "learning_rate": 2.4839866201576646e-05,
      "loss": 0.3466,
      "step": 289
    },
    {
      "epoch": 0.7800941492938803,
      "grad_norm": 3.8688018321990967,
      "learning_rate": 2.4270258988368376e-05,
      "loss": 0.2902,
      "step": 290
    },
    {
      "epoch": 0.7827841291190316,
      "grad_norm": 4.354773044586182,
      "learning_rate": 2.3706355339567286e-05,
      "loss": 0.4149,
      "step": 291
    },
    {
      "epoch": 0.7854741089441829,
      "grad_norm": 7.11607027053833,
      "learning_rate": 2.3148197725419983e-05,
      "loss": 0.7291,
      "step": 292
    },
    {
      "epoch": 0.7881640887693342,
      "grad_norm": 5.43526029586792,
      "learning_rate": 2.2595828183412172e-05,
      "loss": 0.2716,
      "step": 293
    },
    {
      "epoch": 0.7908540685944856,
      "grad_norm": 3.004659414291382,
      "learning_rate": 2.2049288315102412e-05,
      "loss": 0.3067,
      "step": 294
    },
    {
      "epoch": 0.7935440484196369,
      "grad_norm": 4.5855560302734375,
      "learning_rate": 2.1508619282989084e-05,
      "loss": 0.2618,
      "step": 295
    },
    {
      "epoch": 0.7962340282447882,
      "grad_norm": 4.773977756500244,
      "learning_rate": 2.097386180741019e-05,
      "loss": 0.5023,
      "step": 296
    },
    {
      "epoch": 0.7989240080699395,
      "grad_norm": 9.166229248046875,
      "learning_rate": 2.0445056163476374e-05,
      "loss": 0.4224,
      "step": 297
    },
    {
      "epoch": 0.8016139878950908,
      "grad_norm": 6.276297092437744,
      "learning_rate": 1.9922242178037864e-05,
      "loss": 0.8068,
      "step": 298
    },
    {
      "epoch": 0.8043039677202422,
      "grad_norm": 5.523612976074219,
      "learning_rate": 1.940545922668472e-05,
      "loss": 0.4406,
      "step": 299
    },
    {
      "epoch": 0.8069939475453934,
      "grad_norm": 1.5128313302993774,
      "learning_rate": 1.88947462307814e-05,
      "loss": 0.0216,
      "step": 300
    },
    {
      "epoch": 0.8096839273705447,
      "grad_norm": 2.8309073448181152,
      "learning_rate": 1.8390141654535265e-05,
      "loss": 1.299,
      "step": 301
    },
    {
      "epoch": 0.812373907195696,
      "grad_norm": 3.6739649772644043,
      "learning_rate": 1.789168350209983e-05,
      "loss": 1.5798,
      "step": 302
    },
    {
      "epoch": 0.8150638870208473,
      "grad_norm": 3.935307741165161,
      "learning_rate": 1.739940931471239e-05,
      "loss": 1.295,
      "step": 303
    },
    {
      "epoch": 0.8177538668459986,
      "grad_norm": 4.4844865798950195,
      "learning_rate": 1.6913356167866578e-05,
      "loss": 1.225,
      "step": 304
    },
    {
      "epoch": 0.82044384667115,
      "grad_norm": 4.518765449523926,
      "learning_rate": 1.6433560668520176e-05,
      "loss": 1.4111,
      "step": 305
    },
    {
      "epoch": 0.8231338264963013,
      "grad_norm": 4.362013339996338,
      "learning_rate": 1.5960058952337887e-05,
      "loss": 1.1839,
      "step": 306
    },
    {
      "epoch": 0.8258238063214526,
      "grad_norm": 4.76102352142334,
      "learning_rate": 1.5492886680969963e-05,
      "loss": 1.2118,
      "step": 307
    },
    {
      "epoch": 0.8285137861466039,
      "grad_norm": 5.4755539894104,
      "learning_rate": 1.5032079039366209e-05,
      "loss": 1.4798,
      "step": 308
    },
    {
      "epoch": 0.8312037659717552,
      "grad_norm": 3.792975902557373,
      "learning_rate": 1.4577670733126203e-05,
      "loss": 0.7013,
      "step": 309
    },
    {
      "epoch": 0.8338937457969066,
      "grad_norm": 5.135954856872559,
      "learning_rate": 1.4129695985885228e-05,
      "loss": 1.5141,
      "step": 310
    },
    {
      "epoch": 0.8365837256220578,
      "grad_norm": 3.417525291442871,
      "learning_rate": 1.3688188536736968e-05,
      "loss": 0.8687,
      "step": 311
    },
    {
      "epoch": 0.8392737054472091,
      "grad_norm": 4.7601728439331055,
      "learning_rate": 1.3253181637692324e-05,
      "loss": 0.9127,
      "step": 312
    },
    {
      "epoch": 0.8419636852723604,
      "grad_norm": 4.601919174194336,
      "learning_rate": 1.2824708051175016e-05,
      "loss": 1.0878,
      "step": 313
    },
    {
      "epoch": 0.8446536650975117,
      "grad_norm": 3.320221185684204,
      "learning_rate": 1.2402800047554208e-05,
      "loss": 0.6061,
      "step": 314
    },
    {
      "epoch": 0.8473436449226631,
      "grad_norm": 4.236156463623047,
      "learning_rate": 1.1987489402713981e-05,
      "loss": 0.7456,
      "step": 315
    },
    {
      "epoch": 0.8500336247478144,
      "grad_norm": 6.007240295410156,
      "learning_rate": 1.1578807395660207e-05,
      "loss": 1.5298,
      "step": 316
    },
    {
      "epoch": 0.8527236045729657,
      "grad_norm": 5.775532245635986,
      "learning_rate": 1.1176784806164676e-05,
      "loss": 0.7343,
      "step": 317
    },
    {
      "epoch": 0.855413584398117,
      "grad_norm": 5.709627628326416,
      "learning_rate": 1.078145191244706e-05,
      "loss": 1.2876,
      "step": 318
    },
    {
      "epoch": 0.8581035642232683,
      "grad_norm": 5.935501575469971,
      "learning_rate": 1.0392838488894463e-05,
      "loss": 0.9374,
      "step": 319
    },
    {
      "epoch": 0.8607935440484197,
      "grad_norm": 4.249516010284424,
      "learning_rate": 1.0010973803818857e-05,
      "loss": 0.5061,
      "step": 320
    },
    {
      "epoch": 0.863483523873571,
      "grad_norm": 4.154758453369141,
      "learning_rate": 9.635886617252975e-06,
      "loss": 0.1188,
      "step": 321
    },
    {
      "epoch": 0.8661735036987223,
      "grad_norm": 3.874020576477051,
      "learning_rate": 9.267605178784033e-06,
      "loss": 0.4923,
      "step": 322
    },
    {
      "epoch": 0.8688634835238735,
      "grad_norm": 3.575878143310547,
      "learning_rate": 8.906157225426315e-06,
      "loss": 0.3217,
      "step": 323
    },
    {
      "epoch": 0.8715534633490248,
      "grad_norm": 4.050719261169434,
      "learning_rate": 8.55156997953197e-06,
      "loss": 0.4612,
      "step": 324
    },
    {
      "epoch": 0.8742434431741762,
      "grad_norm": 3.588498830795288,
      "learning_rate": 8.203870146740932e-06,
      "loss": 0.2259,
      "step": 325
    },
    {
      "epoch": 0.8769334229993275,
      "grad_norm": 5.262954235076904,
      "learning_rate": 7.86308391396956e-06,
      "loss": 0.7654,
      "step": 326
    },
    {
      "epoch": 0.8796234028244788,
      "grad_norm": 5.5735087394714355,
      "learning_rate": 7.529236947438256e-06,
      "loss": 0.5849,
      "step": 327
    },
    {
      "epoch": 0.8823133826496301,
      "grad_norm": 4.838580131530762,
      "learning_rate": 7.202354390738608e-06,
      "loss": 0.3913,
      "step": 328
    },
    {
      "epoch": 0.8850033624747814,
      "grad_norm": 5.6935038566589355,
      "learning_rate": 6.882460862939522e-06,
      "loss": 0.7206,
      "step": 329
    },
    {
      "epoch": 0.8876933422999328,
      "grad_norm": 2.3508174419403076,
      "learning_rate": 6.5695804567332044e-06,
      "loss": 0.1703,
      "step": 330
    },
    {
      "epoch": 0.8903833221250841,
      "grad_norm": 5.699828624725342,
      "learning_rate": 6.263736736620551e-06,
      "loss": 0.4676,
      "step": 331
    },
    {
      "epoch": 0.8930733019502354,
      "grad_norm": 4.048695087432861,
      "learning_rate": 5.964952737136353e-06,
      "loss": 0.5628,
      "step": 332
    },
    {
      "epoch": 0.8957632817753867,
      "grad_norm": 4.811221599578857,
      "learning_rate": 5.673250961114529e-06,
      "loss": 0.7418,
      "step": 333
    },
    {
      "epoch": 0.898453261600538,
      "grad_norm": 3.3414437770843506,
      "learning_rate": 5.388653377993324e-06,
      "loss": 0.3143,
      "step": 334
    },
    {
      "epoch": 0.9011432414256894,
      "grad_norm": 5.924250602722168,
      "learning_rate": 5.111181422160671e-06,
      "loss": 0.5284,
      "step": 335
    },
    {
      "epoch": 0.9038332212508406,
      "grad_norm": 6.767046928405762,
      "learning_rate": 4.840855991339799e-06,
      "loss": 0.6351,
      "step": 336
    },
    {
      "epoch": 0.9065232010759919,
      "grad_norm": 4.555798053741455,
      "learning_rate": 4.577697445015472e-06,
      "loss": 0.5253,
      "step": 337
    },
    {
      "epoch": 0.9092131809011432,
      "grad_norm": 5.7803730964660645,
      "learning_rate": 4.321725602900473e-06,
      "loss": 0.7582,
      "step": 338
    },
    {
      "epoch": 0.9119031607262945,
      "grad_norm": 4.016640663146973,
      "learning_rate": 4.072959743443017e-06,
      "loss": 0.2845,
      "step": 339
    },
    {
      "epoch": 0.9145931405514459,
      "grad_norm": 5.46890926361084,
      "learning_rate": 3.83141860237467e-06,
      "loss": 0.6128,
      "step": 340
    },
    {
      "epoch": 0.9172831203765972,
      "grad_norm": 4.543710708618164,
      "learning_rate": 3.5971203712993894e-06,
      "loss": 0.5227,
      "step": 341
    },
    {
      "epoch": 0.9199731002017485,
      "grad_norm": 4.0189008712768555,
      "learning_rate": 3.3700826963233735e-06,
      "loss": 0.4072,
      "step": 342
    },
    {
      "epoch": 0.9226630800268998,
      "grad_norm": 5.0270490646362305,
      "learning_rate": 3.1503226767260252e-06,
      "loss": 0.5361,
      "step": 343
    },
    {
      "epoch": 0.9253530598520511,
      "grad_norm": 7.237580299377441,
      "learning_rate": 2.9378568636721835e-06,
      "loss": 0.9466,
      "step": 344
    },
    {
      "epoch": 0.9280430396772025,
      "grad_norm": 8.795455932617188,
      "learning_rate": 2.732701258965531e-06,
      "loss": 0.6604,
      "step": 345
    },
    {
      "epoch": 0.9307330195023538,
      "grad_norm": 11.6528959274292,
      "learning_rate": 2.5348713138434564e-06,
      "loss": 0.5807,
      "step": 346
    },
    {
      "epoch": 0.933422999327505,
      "grad_norm": 8.07696533203125,
      "learning_rate": 2.3443819278132996e-06,
      "loss": 0.7975,
      "step": 347
    },
    {
      "epoch": 0.9361129791526563,
      "grad_norm": 4.788589954376221,
      "learning_rate": 2.161247447530268e-06,
      "loss": 0.6227,
      "step": 348
    },
    {
      "epoch": 0.9388029589778076,
      "grad_norm": 7.453376293182373,
      "learning_rate": 1.985481665716882e-06,
      "loss": 0.4651,
      "step": 349
    },
    {
      "epoch": 0.9414929388029589,
      "grad_norm": 4.3519392013549805,
      "learning_rate": 1.8170978201241474e-06,
      "loss": 0.1668,
      "step": 350
    },
    {
      "epoch": 0.9441829186281103,
      "grad_norm": 3.087855577468872,
      "learning_rate": 1.6561085925346332e-06,
      "loss": 1.2559,
      "step": 351
    },
    {
      "epoch": 0.9468728984532616,
      "grad_norm": 3.9484481811523438,
      "learning_rate": 1.5025261078073005e-06,
      "loss": 1.0505,
      "step": 352
    },
    {
      "epoch": 0.9495628782784129,
      "grad_norm": 4.509681701660156,
      "learning_rate": 1.3563619329643119e-06,
      "loss": 1.316,
      "step": 353
    },
    {
      "epoch": 0.9522528581035642,
      "grad_norm": 4.409306049346924,
      "learning_rate": 1.2176270763198828e-06,
      "loss": 0.9114,
      "step": 354
    },
    {
      "epoch": 0.9549428379287155,
      "grad_norm": 5.652538299560547,
      "learning_rate": 1.0863319866512346e-06,
      "loss": 1.1458,
      "step": 355
    },
    {
      "epoch": 0.9576328177538669,
      "grad_norm": 6.170865535736084,
      "learning_rate": 9.624865524115346e-07,
      "loss": 1.1232,
      "step": 356
    },
    {
      "epoch": 0.9603227975790182,
      "grad_norm": 5.357152938842773,
      "learning_rate": 8.461001009852809e-07,
      "loss": 0.9592,
      "step": 357
    },
    {
      "epoch": 0.9630127774041695,
      "grad_norm": 4.322149753570557,
      "learning_rate": 7.371813979857312e-07,
      "loss": 0.7773,
      "step": 358
    },
    {
      "epoch": 0.9657027572293208,
      "grad_norm": 3.6123275756835938,
      "learning_rate": 6.357386465947301e-07,
      "loss": 0.5652,
      "step": 359
    },
    {
      "epoch": 0.968392737054472,
      "grad_norm": 3.7311031818389893,
      "learning_rate": 5.417794869449377e-07,
      "loss": 0.6096,
      "step": 360
    },
    {
      "epoch": 0.9710827168796234,
      "grad_norm": 5.762843608856201,
      "learning_rate": 4.5531099554435576e-07,
      "loss": 0.9279,
      "step": 361
    },
    {
      "epoch": 0.9737726967047747,
      "grad_norm": 4.97388219833374,
      "learning_rate": 3.763396847433875e-07,
      "loss": 0.5789,
      "step": 362
    },
    {
      "epoch": 0.976462676529926,
      "grad_norm": 4.815624713897705,
      "learning_rate": 3.048715022443749e-07,
      "loss": 0.5138,
      "step": 363
    },
    {
      "epoch": 0.9791526563550773,
      "grad_norm": 3.541781425476074,
      "learning_rate": 2.409118306536229e-07,
      "loss": 0.259,
      "step": 364
    },
    {
      "epoch": 0.9818426361802286,
      "grad_norm": 2.7444493770599365,
      "learning_rate": 1.8446548707604648e-07,
      "loss": 0.2707,
      "step": 365
    },
    {
      "epoch": 0.98453261600538,
      "grad_norm": 5.796267986297607,
      "learning_rate": 1.3553672275230523e-07,
      "loss": 0.5347,
      "step": 366
    },
    {
      "epoch": 0.9872225958305313,
      "grad_norm": 5.090404987335205,
      "learning_rate": 9.412922273871471e-08,
      "loss": 0.3201,
      "step": 367
    },
    {
      "epoch": 0.9899125756556826,
      "grad_norm": 4.630456924438477,
      "learning_rate": 6.024610562962441e-08,
      "loss": 0.4391,
      "step": 368
    },
    {
      "epoch": 0.9926025554808339,
      "grad_norm": 4.325840473175049,
      "learning_rate": 3.388992332259422e-08,
      "loss": 0.3675,
      "step": 369
    },
    {
      "epoch": 0.9952925353059852,
      "grad_norm": 9.686969757080078,
      "learning_rate": 1.506266082615948e-08,
      "loss": 0.6909,
      "step": 370
    },
    {
      "epoch": 0.9979825151311366,
      "grad_norm": 4.668429851531982,
      "learning_rate": 3.7657361103837776e-09,
      "loss": 0.285,
      "step": 371
    },
    {
      "epoch": 1.0013449899125757,
      "grad_norm": 4.755204200744629,
      "learning_rate": 0.0,
      "loss": 0.9972,
      "step": 372
    },
    {
      "epoch": 1.0013449899125757,
      "eval_loss": 0.7320420145988464,
      "eval_runtime": 10.7106,
      "eval_samples_per_second": 14.658,
      "eval_steps_per_second": 7.376,
      "step": 372
    }
  ],
  "logging_steps": 1,
  "max_steps": 372,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 93,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.211789436077998e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}