Training in progress, step 346, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1222 -3

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:724944de7f094b15fc569143ca4a81d654c9b7997910df4bcf6bdf73d2f0add2
 size 125248064

 version https://git-lfs.github.com/spec/v1
+oid sha256:7c39b303b3b24c90608dda6b811a1313074653c2b41eefe49d8b132629403952
 size 125248064

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7674b18a175bfe375f927b20b24e4c90bf61ae3515ba608d178abbf28e4d0011
-size 64219860

 version https://git-lfs.github.com/spec/v1
+oid sha256:5dfc4521350c95d9abc2abce112bce9288c164b96a81482af74bacb192d8ba2b
+size 64220436

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2cf419bfa6a2420a5571eb38ddd6b8c30257fad8d42dd391fc762aaf3e7ee3db
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:d8eb5c3002d7e8ae1e2b9def5fa5c5531d299883fc64b5898be75ce56c2463ee
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6a1b7fb8afbb4805aa4088e62b7ce1004bcb0c85f1755cc1c6ba61487ae7fa2b
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:799bba65fc813f88f98a0005d4dd9bc4b536663da3ba1e7b7fc073c9e9774986
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.251088534107402,
   "eval_steps": 173,
-  "global_step": 173,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1234,6 +1234,1225 @@
       "eval_samples_per_second": 8.978,
       "eval_steps_per_second": 4.505,
       "step": 173
     }
   ],
   "logging_steps": 1,
@@ -1253,7 +2472,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.0947220535771136e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.502177068214804,
   "eval_steps": 173,
+  "global_step": 346,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 8.978,
       "eval_steps_per_second": 4.505,
       "step": 173
+    },
+    {
+      "epoch": 0.2525399129172714,
+      "grad_norm": 0.24334195256233215,
+      "learning_rate": 0.00017256662525779983,
+      "loss": 0.5441,
+      "step": 174
+    },
+    {
+      "epoch": 0.2539912917271408,
+      "grad_norm": 0.3937956988811493,
+      "learning_rate": 0.00017224750460699556,
+      "loss": 1.3447,
+      "step": 175
+    },
+    {
+      "epoch": 0.25544267053701014,
+      "grad_norm": 0.32860031723976135,
+      "learning_rate": 0.00017192683734099353,
+      "loss": 1.1908,
+      "step": 176
+    },
+    {
+      "epoch": 0.25689404934687954,
+      "grad_norm": 0.40750887989997864,
+      "learning_rate": 0.000171604630324375,
+      "loss": 0.4587,
+      "step": 177
+    },
+    {
+      "epoch": 0.25834542815674894,
+      "grad_norm": 0.24471426010131836,
+      "learning_rate": 0.00017128089045468294,
+      "loss": 0.4731,
+      "step": 178
+    },
+    {
+      "epoch": 0.2597968069666183,
+      "grad_norm": 0.3871125876903534,
+      "learning_rate": 0.0001709556246622744,
+      "loss": 1.5461,
+      "step": 179
+    },
+    {
+      "epoch": 0.2612481857764877,
+      "grad_norm": 0.32278481125831604,
+      "learning_rate": 0.00017062883991017218,
+      "loss": 1.0052,
+      "step": 180
+    },
+    {
+      "epoch": 0.262699564586357,
+      "grad_norm": 0.2906551957130432,
+      "learning_rate": 0.00017030054319391562,
+      "loss": 1.1581,
+      "step": 181
+    },
+    {
+      "epoch": 0.2641509433962264,
+      "grad_norm": 0.2662639319896698,
+      "learning_rate": 0.00016997074154141096,
+      "loss": 0.4337,
+      "step": 182
+    },
+    {
+      "epoch": 0.2656023222060958,
+      "grad_norm": 0.30343329906463623,
+      "learning_rate": 0.00016963944201278097,
+      "loss": 0.9429,
+      "step": 183
+    },
+    {
+      "epoch": 0.26705370101596515,
+      "grad_norm": 0.48314595222473145,
+      "learning_rate": 0.00016930665170021363,
+      "loss": 1.146,
+      "step": 184
+    },
+    {
+      "epoch": 0.26850507982583455,
+      "grad_norm": 0.22615785896778107,
+      "learning_rate": 0.00016897237772781044,
+      "loss": 0.288,
+      "step": 185
+    },
+    {
+      "epoch": 0.26995645863570394,
+      "grad_norm": 0.3443828225135803,
+      "learning_rate": 0.00016863662725143382,
+      "loss": 1.0044,
+      "step": 186
+    },
+    {
+      "epoch": 0.2714078374455733,
+      "grad_norm": 0.3521503508090973,
+      "learning_rate": 0.00016829940745855408,
+      "loss": 0.917,
+      "step": 187
+    },
+    {
+      "epoch": 0.2728592162554427,
+      "grad_norm": 0.3083937168121338,
+      "learning_rate": 0.00016796072556809534,
+      "loss": 1.1641,
+      "step": 188
+    },
+    {
+      "epoch": 0.274310595065312,
+      "grad_norm": 0.1671004444360733,
+      "learning_rate": 0.0001676205888302812,
+      "loss": 0.2087,
+      "step": 189
+    },
+    {
+      "epoch": 0.2757619738751814,
+      "grad_norm": 0.3343052268028259,
+      "learning_rate": 0.00016727900452647926,
+      "loss": 0.8929,
+      "step": 190
+    },
+    {
+      "epoch": 0.2772133526850508,
+      "grad_norm": 0.2790353000164032,
+      "learning_rate": 0.00016693597996904564,
+      "loss": 0.8818,
+      "step": 191
+    },
+    {
+      "epoch": 0.27866473149492016,
+      "grad_norm": 0.3753064274787903,
+      "learning_rate": 0.00016659152250116812,
+      "loss": 0.7531,
+      "step": 192
+    },
+    {
+      "epoch": 0.28011611030478956,
+      "grad_norm": 0.3089276850223541,
+      "learning_rate": 0.00016624563949670904,
+      "loss": 1.2895,
+      "step": 193
+    },
+    {
+      "epoch": 0.28156748911465895,
+      "grad_norm": 0.24410511553287506,
+      "learning_rate": 0.00016589833836004752,
+      "loss": 0.444,
+      "step": 194
+    },
+    {
+      "epoch": 0.2830188679245283,
+      "grad_norm": 0.4272465705871582,
+      "learning_rate": 0.00016554962652592077,
+      "loss": 0.9711,
+      "step": 195
+    },
+    {
+      "epoch": 0.2844702467343977,
+      "grad_norm": 0.29573267698287964,
+      "learning_rate": 0.00016519951145926515,
+      "loss": 0.3583,
+      "step": 196
+    },
+    {
+      "epoch": 0.28592162554426703,
+      "grad_norm": 0.42986953258514404,
+      "learning_rate": 0.00016484800065505627,
+      "loss": 1.6862,
+      "step": 197
+    },
+    {
+      "epoch": 0.28737300435413643,
+      "grad_norm": 0.31086039543151855,
+      "learning_rate": 0.00016449510163814854,
+      "loss": 0.5808,
+      "step": 198
+    },
+    {
+      "epoch": 0.2888243831640058,
+      "grad_norm": 0.4590548574924469,
+      "learning_rate": 0.000164140821963114,
+      "loss": 1.837,
+      "step": 199
+    },
+    {
+      "epoch": 0.29027576197387517,
+      "grad_norm": 0.4661678969860077,
+      "learning_rate": 0.00016378516921408077,
+      "loss": 0.8179,
+      "step": 200
+    },
+    {
+      "epoch": 0.29172714078374457,
+      "grad_norm": 0.5419331192970276,
+      "learning_rate": 0.00016342815100457063,
+      "loss": 0.8166,
+      "step": 201
+    },
+    {
+      "epoch": 0.2931785195936139,
+      "grad_norm": 0.34207096695899963,
+      "learning_rate": 0.00016306977497733592,
+      "loss": 0.8683,
+      "step": 202
+    },
+    {
+      "epoch": 0.2946298984034833,
+      "grad_norm": 0.4035762548446655,
+      "learning_rate": 0.00016271004880419608,
+      "loss": 0.7494,
+      "step": 203
+    },
+    {
+      "epoch": 0.2960812772133527,
+      "grad_norm": 0.40903428196907043,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 0.3219,
+      "step": 204
+    },
+    {
+      "epoch": 0.29753265602322204,
+      "grad_norm": 0.3800163269042969,
+      "learning_rate": 0.000161986576851828,
+      "loss": 1.0624,
+      "step": 205
+    },
+    {
+      "epoch": 0.29898403483309144,
+      "grad_norm": 0.20540378987789154,
+      "learning_rate": 0.00016162284656009274,
+      "loss": 0.2874,
+      "step": 206
+    },
+    {
+      "epoch": 0.30043541364296084,
+      "grad_norm": 0.3501240313053131,
+      "learning_rate": 0.00016125779709710665,
+      "loss": 0.9726,
+      "step": 207
+    },
+    {
+      "epoch": 0.3018867924528302,
+      "grad_norm": 0.46438068151474,
+      "learning_rate": 0.00016089143627754862,
+      "loss": 0.9383,
+      "step": 208
+    },
+    {
+      "epoch": 0.3033381712626996,
+      "grad_norm": 0.37778106331825256,
+      "learning_rate": 0.00016052377194417,
+      "loss": 1.9048,
+      "step": 209
+    },
+    {
+      "epoch": 0.3047895500725689,
+      "grad_norm": 0.4382157325744629,
+      "learning_rate": 0.0001601548119676266,
+      "loss": 1.2582,
+      "step": 210
+    },
+    {
+      "epoch": 0.3062409288824383,
+      "grad_norm": 0.442871630191803,
+      "learning_rate": 0.00015978456424631032,
+      "loss": 1.086,
+      "step": 211
+    },
+    {
+      "epoch": 0.3076923076923077,
+      "grad_norm": 0.5801809430122375,
+      "learning_rate": 0.00015941303670618018,
+      "loss": 1.234,
+      "step": 212
+    },
+    {
+      "epoch": 0.30914368650217705,
+      "grad_norm": 0.2682696282863617,
+      "learning_rate": 0.00015904023730059228,
+      "loss": 0.6148,
+      "step": 213
+    },
+    {
+      "epoch": 0.31059506531204645,
+      "grad_norm": 0.25142902135849,
+      "learning_rate": 0.00015866617401012994,
+      "loss": 0.6304,
+      "step": 214
+    },
+    {
+      "epoch": 0.31204644412191584,
+      "grad_norm": 0.36662545800209045,
+      "learning_rate": 0.00015829085484243266,
+      "loss": 0.8005,
+      "step": 215
+    },
+    {
+      "epoch": 0.3134978229317852,
+      "grad_norm": 0.2825576961040497,
+      "learning_rate": 0.00015791428783202465,
+      "loss": 0.7511,
+      "step": 216
+    },
+    {
+      "epoch": 0.3149492017416546,
+      "grad_norm": 0.33648526668548584,
+      "learning_rate": 0.00015753648104014297,
+      "loss": 0.9242,
+      "step": 217
+    },
+    {
+      "epoch": 0.3164005805515239,
+      "grad_norm": 0.5301381945610046,
+      "learning_rate": 0.00015715744255456496,
+      "loss": 0.9624,
+      "step": 218
+    },
+    {
+      "epoch": 0.3178519593613933,
+      "grad_norm": 0.3330354690551758,
+      "learning_rate": 0.00015677718048943496,
+      "loss": 1.6149,
+      "step": 219
+    },
+    {
+      "epoch": 0.3193033381712627,
+      "grad_norm": 0.2863297164440155,
+      "learning_rate": 0.00015639570298509064,
+      "loss": 0.5041,
+      "step": 220
+    },
+    {
+      "epoch": 0.32075471698113206,
+      "grad_norm": 0.30266642570495605,
+      "learning_rate": 0.00015601301820788898,
+      "loss": 0.5615,
+      "step": 221
+    },
+    {
+      "epoch": 0.32220609579100146,
+      "grad_norm": 0.3737308084964752,
+      "learning_rate": 0.00015562913435003114,
+      "loss": 0.5868,
+      "step": 222
+    },
+    {
+      "epoch": 0.32365747460087085,
+      "grad_norm": 0.41030675172805786,
+      "learning_rate": 0.00015524405962938714,
+      "loss": 0.933,
+      "step": 223
+    },
+    {
+      "epoch": 0.3251088534107402,
+      "grad_norm": 0.22345803678035736,
+      "learning_rate": 0.0001548578022893202,
+      "loss": 0.4072,
+      "step": 224
+    },
+    {
+      "epoch": 0.3265602322206096,
+      "grad_norm": 0.4588911235332489,
+      "learning_rate": 0.00015447037059851,
+      "loss": 1.3702,
+      "step": 225
+    },
+    {
+      "epoch": 0.32801161103047893,
+      "grad_norm": 0.40378692746162415,
+      "learning_rate": 0.00015408177285077565,
+      "loss": 0.7234,
+      "step": 226
+    },
+    {
+      "epoch": 0.32946298984034833,
+      "grad_norm": 0.36572182178497314,
+      "learning_rate": 0.0001536920173648984,
+      "loss": 0.6561,
+      "step": 227
+    },
+    {
+      "epoch": 0.3309143686502177,
+      "grad_norm": 0.24225015938282013,
+      "learning_rate": 0.0001533011124844433,
+      "loss": 0.5723,
+      "step": 228
+    },
+    {
+      "epoch": 0.33236574746008707,
+      "grad_norm": 0.4880613386631012,
+      "learning_rate": 0.00015290906657758083,
+      "loss": 0.8659,
+      "step": 229
+    },
+    {
+      "epoch": 0.33381712626995647,
+      "grad_norm": 0.4412708878517151,
+      "learning_rate": 0.00015251588803690753,
+      "loss": 0.8392,
+      "step": 230
+    },
+    {
+      "epoch": 0.3352685050798258,
+      "grad_norm": 0.2982364594936371,
+      "learning_rate": 0.00015212158527926634,
+      "loss": 0.8567,
+      "step": 231
+    },
+    {
+      "epoch": 0.3367198838896952,
+      "grad_norm": 0.3174440860748291,
+      "learning_rate": 0.0001517261667455667,
+      "loss": 0.6775,
+      "step": 232
+    },
+    {
+      "epoch": 0.3381712626995646,
+      "grad_norm": 0.23971304297447205,
+      "learning_rate": 0.00015132964090060357,
+      "loss": 0.2637,
+      "step": 233
+    },
+    {
+      "epoch": 0.33962264150943394,
+      "grad_norm": 0.3869999647140503,
+      "learning_rate": 0.00015093201623287631,
+      "loss": 0.7993,
+      "step": 234
+    },
+    {
+      "epoch": 0.34107402031930334,
+      "grad_norm": 0.4646807014942169,
+      "learning_rate": 0.000150533301254407,
+      "loss": 1.0936,
+      "step": 235
+    },
+    {
+      "epoch": 0.34252539912917274,
+      "grad_norm": 0.39114901423454285,
+      "learning_rate": 0.0001501335045005582,
+      "loss": 1.0431,
+      "step": 236
+    },
+    {
+      "epoch": 0.3439767779390421,
+      "grad_norm": 0.32341787219047546,
+      "learning_rate": 0.00014973263452985024,
+      "loss": 0.5171,
+      "step": 237
+    },
+    {
+      "epoch": 0.3454281567489115,
+      "grad_norm": 0.3025743365287781,
+      "learning_rate": 0.00014933069992377793,
+      "loss": 0.59,
+      "step": 238
+    },
+    {
+      "epoch": 0.3468795355587808,
+      "grad_norm": 0.49004557728767395,
+      "learning_rate": 0.00014892770928662697,
+      "loss": 1.6646,
+      "step": 239
+    },
+    {
+      "epoch": 0.3483309143686502,
+      "grad_norm": 0.5597532391548157,
+      "learning_rate": 0.00014852367124528975,
+      "loss": 1.3429,
+      "step": 240
+    },
+    {
+      "epoch": 0.3497822931785196,
+      "grad_norm": 0.4048459827899933,
+      "learning_rate": 0.00014811859444908052,
+      "loss": 1.4101,
+      "step": 241
+    },
+    {
+      "epoch": 0.35123367198838895,
+      "grad_norm": 0.5641282796859741,
+      "learning_rate": 0.00014771248756955042,
+      "loss": 1.6691,
+      "step": 242
+    },
+    {
+      "epoch": 0.35268505079825835,
+      "grad_norm": 0.4494577646255493,
+      "learning_rate": 0.00014730535930030172,
+      "loss": 1.211,
+      "step": 243
+    },
+    {
+      "epoch": 0.35413642960812775,
+      "grad_norm": 0.3181702792644501,
+      "learning_rate": 0.00014689721835680182,
+      "loss": 0.4831,
+      "step": 244
+    },
+    {
+      "epoch": 0.3555878084179971,
+      "grad_norm": 0.3166160583496094,
+      "learning_rate": 0.00014648807347619663,
+      "loss": 1.59,
+      "step": 245
+    },
+    {
+      "epoch": 0.3570391872278665,
+      "grad_norm": 0.3532291054725647,
+      "learning_rate": 0.00014607793341712334,
+      "loss": 0.3531,
+      "step": 246
+    },
+    {
+      "epoch": 0.3584905660377358,
+      "grad_norm": 0.3703453540802002,
+      "learning_rate": 0.00014566680695952332,
+      "loss": 0.7499,
+      "step": 247
+    },
+    {
+      "epoch": 0.3599419448476052,
+      "grad_norm": 0.29574841260910034,
+      "learning_rate": 0.00014525470290445392,
+      "loss": 0.7956,
+      "step": 248
+    },
+    {
+      "epoch": 0.3613933236574746,
+      "grad_norm": 0.41456523537635803,
+      "learning_rate": 0.0001448416300738999,
+      "loss": 1.2597,
+      "step": 249
+    },
+    {
+      "epoch": 0.36284470246734396,
+      "grad_norm": 0.3268081545829773,
+      "learning_rate": 0.000144427597310585,
+      "loss": 0.7744,
+      "step": 250
+    },
+    {
+      "epoch": 0.36429608127721336,
+      "grad_norm": 0.4419240951538086,
+      "learning_rate": 0.00014401261347778233,
+      "loss": 1.2126,
+      "step": 251
+    },
+    {
+      "epoch": 0.36574746008708275,
+      "grad_norm": 0.4968956410884857,
+      "learning_rate": 0.0001435966874591247,
+      "loss": 0.8786,
+      "step": 252
+    },
+    {
+      "epoch": 0.3671988388969521,
+      "grad_norm": 0.4824519455432892,
+      "learning_rate": 0.0001431798281584144,
+      "loss": 3.0825,
+      "step": 253
+    },
+    {
+      "epoch": 0.3686502177068215,
+      "grad_norm": 0.30820131301879883,
+      "learning_rate": 0.0001427620444994328,
+      "loss": 0.6051,
+      "step": 254
+    },
+    {
+      "epoch": 0.37010159651669083,
+      "grad_norm": 0.42164450883865356,
+      "learning_rate": 0.00014234334542574906,
+      "loss": 0.5161,
+      "step": 255
+    },
+    {
+      "epoch": 0.37155297532656023,
+      "grad_norm": 0.32636889815330505,
+      "learning_rate": 0.00014192373990052877,
+      "loss": 0.855,
+      "step": 256
+    },
+    {
+      "epoch": 0.37300435413642963,
+      "grad_norm": 0.332589715719223,
+      "learning_rate": 0.0001415032369063422,
+      "loss": 0.402,
+      "step": 257
+    },
+    {
+      "epoch": 0.37445573294629897,
+      "grad_norm": 0.37509143352508545,
+      "learning_rate": 0.00014108184544497178,
+      "loss": 1.1532,
+      "step": 258
+    },
+    {
+      "epoch": 0.37590711175616837,
+      "grad_norm": 0.2669030725955963,
+      "learning_rate": 0.00014065957453721962,
+      "loss": 0.5493,
+      "step": 259
+    },
+    {
+      "epoch": 0.37735849056603776,
+      "grad_norm": 0.387205570936203,
+      "learning_rate": 0.00014023643322271426,
+      "loss": 1.3697,
+      "step": 260
+    },
+    {
+      "epoch": 0.3788098693759071,
+      "grad_norm": 0.3195268213748932,
+      "learning_rate": 0.00013981243055971712,
+      "loss": 0.731,
+      "step": 261
+    },
+    {
+      "epoch": 0.3802612481857765,
+      "grad_norm": 0.3117748498916626,
+      "learning_rate": 0.00013938757562492873,
+      "loss": 0.4412,
+      "step": 262
+    },
+    {
+      "epoch": 0.38171262699564584,
+      "grad_norm": 0.43555042147636414,
+      "learning_rate": 0.00013896187751329437,
+      "loss": 0.9857,
+      "step": 263
+    },
+    {
+      "epoch": 0.38316400580551524,
+      "grad_norm": 0.3363146483898163,
+      "learning_rate": 0.0001385353453378093,
+      "loss": 0.8749,
+      "step": 264
+    },
+    {
+      "epoch": 0.38461538461538464,
+      "grad_norm": 0.3038422465324402,
+      "learning_rate": 0.00013810798822932378,
+      "loss": 0.5955,
+      "step": 265
+    },
+    {
+      "epoch": 0.386066763425254,
+      "grad_norm": 0.47416090965270996,
+      "learning_rate": 0.00013767981533634754,
+      "loss": 3.7495,
+      "step": 266
+    },
+    {
+      "epoch": 0.3875181422351234,
+      "grad_norm": 0.237370103597641,
+      "learning_rate": 0.00013725083582485397,
+      "loss": 0.3979,
+      "step": 267
+    },
+    {
+      "epoch": 0.3889695210449927,
+      "grad_norm": 0.1970636397600174,
+      "learning_rate": 0.0001368210588780838,
+      "loss": 0.297,
+      "step": 268
+    },
+    {
+      "epoch": 0.3904208998548621,
+      "grad_norm": 0.5174916982650757,
+      "learning_rate": 0.00013639049369634876,
+      "loss": 1.4907,
+      "step": 269
+    },
+    {
+      "epoch": 0.3918722786647315,
+      "grad_norm": 0.2811208665370941,
+      "learning_rate": 0.00013595914949683432,
+      "loss": 0.5772,
+      "step": 270
+    },
+    {
+      "epoch": 0.39332365747460085,
+      "grad_norm": 0.30495163798332214,
+      "learning_rate": 0.00013552703551340258,
+      "loss": 0.3624,
+      "step": 271
+    },
+    {
+      "epoch": 0.39477503628447025,
+      "grad_norm": 0.4540651738643646,
+      "learning_rate": 0.00013509416099639457,
+      "loss": 1.6049,
+      "step": 272
+    },
+    {
+      "epoch": 0.39622641509433965,
+      "grad_norm": 0.4534936547279358,
+      "learning_rate": 0.00013466053521243214,
+      "loss": 1.3386,
+      "step": 273
+    },
+    {
+      "epoch": 0.397677793904209,
+      "grad_norm": 0.23787882924079895,
+      "learning_rate": 0.00013422616744421966,
+      "loss": 0.5185,
+      "step": 274
+    },
+    {
+      "epoch": 0.3991291727140784,
+      "grad_norm": 0.35515129566192627,
+      "learning_rate": 0.00013379106699034537,
+      "loss": 1.4753,
+      "step": 275
+    },
+    {
+      "epoch": 0.4005805515239477,
+      "grad_norm": 0.38288307189941406,
+      "learning_rate": 0.00013335524316508208,
+      "loss": 1.4326,
+      "step": 276
+    },
+    {
+      "epoch": 0.4020319303338171,
+      "grad_norm": 0.27358075976371765,
+      "learning_rate": 0.0001329187052981881,
+      "loss": 0.683,
+      "step": 277
+    },
+    {
+      "epoch": 0.4034833091436865,
+      "grad_norm": 0.5965114831924438,
+      "learning_rate": 0.00013248146273470726,
+      "loss": 2.2199,
+      "step": 278
+    },
+    {
+      "epoch": 0.40493468795355586,
+      "grad_norm": 0.27985599637031555,
+      "learning_rate": 0.00013204352483476897,
+      "loss": 0.6224,
+      "step": 279
+    },
+    {
+      "epoch": 0.40638606676342526,
+      "grad_norm": 0.31146517395973206,
+      "learning_rate": 0.0001316049009733879,
+      "loss": 0.9722,
+      "step": 280
+    },
+    {
+      "epoch": 0.40783744557329465,
+      "grad_norm": 0.2670224905014038,
+      "learning_rate": 0.0001311656005402631,
+      "loss": 0.4706,
+      "step": 281
+    },
+    {
+      "epoch": 0.409288824383164,
+      "grad_norm": 0.30668875575065613,
+      "learning_rate": 0.00013072563293957723,
+      "loss": 0.8013,
+      "step": 282
+    },
+    {
+      "epoch": 0.4107402031930334,
+      "grad_norm": 1.6362825632095337,
+      "learning_rate": 0.00013028500758979506,
+      "loss": 1.7021,
+      "step": 283
+    },
+    {
+      "epoch": 0.41219158200290273,
+      "grad_norm": 0.3747856914997101,
+      "learning_rate": 0.00012984373392346194,
+      "loss": 0.7395,
+      "step": 284
+    },
+    {
+      "epoch": 0.41364296081277213,
+      "grad_norm": 0.29460155963897705,
+      "learning_rate": 0.0001294018213870018,
+      "loss": 0.4852,
+      "step": 285
+    },
+    {
+      "epoch": 0.41509433962264153,
+      "grad_norm": 0.3442453444004059,
+      "learning_rate": 0.00012895927944051502,
+      "loss": 1.0444,
+      "step": 286
+    },
+    {
+      "epoch": 0.41654571843251087,
+      "grad_norm": 0.4372914731502533,
+      "learning_rate": 0.00012851611755757586,
+      "loss": 0.9556,
+      "step": 287
+    },
+    {
+      "epoch": 0.41799709724238027,
+      "grad_norm": 0.36953088641166687,
+      "learning_rate": 0.00012807234522502968,
+      "loss": 0.6292,
+      "step": 288
+    },
+    {
+      "epoch": 0.41944847605224966,
+      "grad_norm": 0.2996106445789337,
+      "learning_rate": 0.00012762797194278984,
+      "loss": 0.4637,
+      "step": 289
+    },
+    {
+      "epoch": 0.420899854862119,
+      "grad_norm": 0.46969079971313477,
+      "learning_rate": 0.0001271830072236343,
+      "loss": 2.3145,
+      "step": 290
+    },
+    {
+      "epoch": 0.4223512336719884,
+      "grad_norm": 0.377268522977829,
+      "learning_rate": 0.00012673746059300208,
+      "loss": 1.1514,
+      "step": 291
+    },
+    {
+      "epoch": 0.42380261248185774,
+      "grad_norm": 0.37440070509910583,
+      "learning_rate": 0.00012629134158878918,
+      "loss": 1.2862,
+      "step": 292
+    },
+    {
+      "epoch": 0.42525399129172714,
+      "grad_norm": 0.47031837701797485,
+      "learning_rate": 0.0001258446597611447,
+      "loss": 1.3154,
+      "step": 293
+    },
+    {
+      "epoch": 0.42670537010159654,
+      "grad_norm": 0.28943926095962524,
+      "learning_rate": 0.00012539742467226601,
+      "loss": 0.7372,
+      "step": 294
+    },
+    {
+      "epoch": 0.4281567489114659,
+      "grad_norm": 0.3967966139316559,
+      "learning_rate": 0.00012494964589619423,
+      "loss": 1.7696,
+      "step": 295
+    },
+    {
+      "epoch": 0.4296081277213353,
+      "grad_norm": 0.3762686252593994,
+      "learning_rate": 0.00012450133301860952,
+      "loss": 1.2166,
+      "step": 296
+    },
+    {
+      "epoch": 0.4310595065312046,
+      "grad_norm": 0.3782839775085449,
+      "learning_rate": 0.00012405249563662537,
+      "loss": 0.9249,
+      "step": 297
+    },
+    {
+      "epoch": 0.432510885341074,
+      "grad_norm": 0.551977276802063,
+      "learning_rate": 0.0001236031433585836,
+      "loss": 0.7932,
+      "step": 298
+    },
+    {
+      "epoch": 0.4339622641509434,
+      "grad_norm": 0.4811376631259918,
+      "learning_rate": 0.00012315328580384842,
+      "loss": 1.4454,
+      "step": 299
+    },
+    {
+      "epoch": 0.43541364296081275,
+      "grad_norm": 0.6780052185058594,
+      "learning_rate": 0.00012270293260260067,
+      "loss": 1.8719,
+      "step": 300
+    },
+    {
+      "epoch": 0.43686502177068215,
+      "grad_norm": 0.5372028350830078,
+      "learning_rate": 0.00012225209339563145,
+      "loss": 1.13,
+      "step": 301
+    },
+    {
+      "epoch": 0.43831640058055155,
+      "grad_norm": 0.6659995317459106,
+      "learning_rate": 0.00012180077783413601,
+      "loss": 1.2524,
+      "step": 302
+    },
+    {
+      "epoch": 0.4397677793904209,
+      "grad_norm": 0.30327922105789185,
+      "learning_rate": 0.00012134899557950698,
+      "loss": 0.4541,
+      "step": 303
+    },
+    {
+      "epoch": 0.4412191582002903,
+      "grad_norm": 0.28171366453170776,
+      "learning_rate": 0.00012089675630312754,
+      "loss": 0.5961,
+      "step": 304
+    },
+    {
+      "epoch": 0.4426705370101596,
+      "grad_norm": 0.49443453550338745,
+      "learning_rate": 0.00012044406968616432,
+      "loss": 1.3738,
+      "step": 305
+    },
+    {
+      "epoch": 0.444121915820029,
+      "grad_norm": 0.27701106667518616,
+      "learning_rate": 0.00011999094541936047,
+      "loss": 0.8035,
+      "step": 306
+    },
+    {
+      "epoch": 0.4455732946298984,
+      "grad_norm": 0.44474315643310547,
+      "learning_rate": 0.00011953739320282778,
+      "loss": 1.9202,
+      "step": 307
+    },
+    {
+      "epoch": 0.44702467343976776,
+      "grad_norm": 0.36672115325927734,
+      "learning_rate": 0.00011908342274583936,
+      "loss": 1.3322,
+      "step": 308
+    },
+    {
+      "epoch": 0.44847605224963716,
+      "grad_norm": 0.47056931257247925,
+      "learning_rate": 0.00011862904376662167,
+      "loss": 0.6009,
+      "step": 309
+    },
+    {
+      "epoch": 0.44992743105950656,
+      "grad_norm": 0.33981776237487793,
+      "learning_rate": 0.00011817426599214636,
+      "loss": 0.8832,
+      "step": 310
+    },
+    {
+      "epoch": 0.4513788098693759,
+      "grad_norm": 0.5165224075317383,
+      "learning_rate": 0.0001177190991579223,
+      "loss": 1.2614,
+      "step": 311
+    },
+    {
+      "epoch": 0.4528301886792453,
+      "grad_norm": 0.36011219024658203,
+      "learning_rate": 0.00011726355300778693,
+      "loss": 0.5128,
+      "step": 312
+    },
+    {
+      "epoch": 0.45428156748911463,
+      "grad_norm": 0.4018263518810272,
+      "learning_rate": 0.00011680763729369783,
+      "loss": 0.876,
+      "step": 313
+    },
+    {
+      "epoch": 0.45573294629898403,
+      "grad_norm": 0.36778366565704346,
+      "learning_rate": 0.00011635136177552391,
+      "loss": 0.6217,
+      "step": 314
+    },
+    {
+      "epoch": 0.45718432510885343,
+      "grad_norm": 0.3395918905735016,
+      "learning_rate": 0.00011589473622083642,
+      "loss": 0.7758,
+      "step": 315
+    },
+    {
+      "epoch": 0.45863570391872277,
+      "grad_norm": 0.30903902649879456,
+      "learning_rate": 0.00011543777040469994,
+      "loss": 1.1969,
+      "step": 316
+    },
+    {
+      "epoch": 0.46008708272859217,
+      "grad_norm": 0.36154720187187195,
+      "learning_rate": 0.00011498047410946306,
+      "loss": 0.7883,
+      "step": 317
+    },
+    {
+      "epoch": 0.46153846153846156,
+      "grad_norm": 0.3478432595729828,
+      "learning_rate": 0.00011452285712454904,
+      "loss": 1.1028,
+      "step": 318
+    },
+    {
+      "epoch": 0.4629898403483309,
+      "grad_norm": 0.8324244618415833,
+      "learning_rate": 0.00011406492924624614,
+      "loss": 2.98,
+      "step": 319
+    },
+    {
+      "epoch": 0.4644412191582003,
+      "grad_norm": 0.36261022090911865,
+      "learning_rate": 0.00011360670027749807,
+      "loss": 0.8629,
+      "step": 320
+    },
+    {
+      "epoch": 0.46589259796806964,
+      "grad_norm": 0.42329132556915283,
+      "learning_rate": 0.00011314818002769389,
+      "loss": 1.8163,
+      "step": 321
+    },
+    {
+      "epoch": 0.46734397677793904,
+      "grad_norm": 0.5022541284561157,
+      "learning_rate": 0.0001126893783124583,
+      "loss": 1.5672,
+      "step": 322
+    },
+    {
+      "epoch": 0.46879535558780844,
+      "grad_norm": 0.3574770987033844,
+      "learning_rate": 0.00011223030495344127,
+      "loss": 0.3025,
+      "step": 323
+    },
+    {
+      "epoch": 0.4702467343976778,
+      "grad_norm": 0.4947843551635742,
+      "learning_rate": 0.00011177096977810803,
+      "loss": 0.8242,
+      "step": 324
+    },
+    {
+      "epoch": 0.4716981132075472,
+      "grad_norm": 0.4501658082008362,
+      "learning_rate": 0.00011131138261952845,
+      "loss": 0.8425,
+      "step": 325
+    },
+    {
+      "epoch": 0.4731494920174166,
+      "grad_norm": 0.21305899322032928,
+      "learning_rate": 0.00011085155331616663,
+      "loss": 0.245,
+      "step": 326
+    },
+    {
+      "epoch": 0.4746008708272859,
+      "grad_norm": 0.38023510575294495,
+      "learning_rate": 0.00011039149171167045,
+      "loss": 1.0404,
+      "step": 327
+    },
+    {
+      "epoch": 0.4760522496371553,
+      "grad_norm": 0.4849722385406494,
+      "learning_rate": 0.00010993120765466056,
+      "loss": 1.0257,
+      "step": 328
+    },
+    {
+      "epoch": 0.47750362844702465,
+      "grad_norm": 0.4337415099143982,
+      "learning_rate": 0.00010947071099851971,
+      "loss": 0.787,
+      "step": 329
+    },
+    {
+      "epoch": 0.47895500725689405,
+      "grad_norm": 0.3395763635635376,
+      "learning_rate": 0.00010901001160118189,
+      "loss": 0.9218,
+      "step": 330
+    },
+    {
+      "epoch": 0.48040638606676345,
+      "grad_norm": 0.5438852906227112,
+      "learning_rate": 0.00010854911932492114,
+      "loss": 0.7867,
+      "step": 331
+    },
+    {
+      "epoch": 0.4818577648766328,
+      "grad_norm": 0.299010306596756,
+      "learning_rate": 0.00010808804403614043,
+      "loss": 0.7583,
+      "step": 332
+    },
+    {
+      "epoch": 0.4833091436865022,
+      "grad_norm": 0.44489172101020813,
+      "learning_rate": 0.00010762679560516067,
+      "loss": 1.9207,
+      "step": 333
+    },
+    {
+      "epoch": 0.4847605224963715,
+      "grad_norm": 0.3161913752555847,
+      "learning_rate": 0.00010716538390600908,
+      "loss": 0.7175,
+      "step": 334
+    },
+    {
+      "epoch": 0.4862119013062409,
+      "grad_norm": 0.38730329275131226,
+      "learning_rate": 0.00010670381881620814,
+      "loss": 1.2888,
+      "step": 335
+    },
+    {
+      "epoch": 0.4876632801161103,
+      "grad_norm": 0.3712642788887024,
+      "learning_rate": 0.00010624211021656392,
+      "loss": 0.9692,
+      "step": 336
+    },
+    {
+      "epoch": 0.48911465892597966,
+      "grad_norm": 0.3168681859970093,
+      "learning_rate": 0.00010578026799095464,
+      "loss": 1.1365,
+      "step": 337
+    },
+    {
+      "epoch": 0.49056603773584906,
+      "grad_norm": 0.36398571729660034,
+      "learning_rate": 0.00010531830202611904,
+      "loss": 0.9235,
+      "step": 338
+    },
+    {
+      "epoch": 0.49201741654571846,
+      "grad_norm": 0.30887413024902344,
+      "learning_rate": 0.00010485622221144484,
+      "loss": 0.4427,
+      "step": 339
+    },
+    {
+      "epoch": 0.4934687953555878,
+      "grad_norm": 0.26171329617500305,
+      "learning_rate": 0.0001043940384387569,
+      "loss": 0.6965,
+      "step": 340
+    },
+    {
+      "epoch": 0.4949201741654572,
+      "grad_norm": 0.44587722420692444,
+      "learning_rate": 0.00010393176060210557,
+      "loss": 1.8459,
+      "step": 341
+    },
+    {
+      "epoch": 0.49637155297532654,
+      "grad_norm": 0.341634064912796,
+      "learning_rate": 0.0001034693985975548,
+      "loss": 0.7757,
+      "step": 342
+    },
+    {
+      "epoch": 0.49782293178519593,
+      "grad_norm": 0.378426194190979,
+      "learning_rate": 0.0001030069623229704,
+      "loss": 1.1976,
+      "step": 343
+    },
+    {
+      "epoch": 0.49927431059506533,
+      "grad_norm": 0.4587966799736023,
+      "learning_rate": 0.00010254446167780803,
+      "loss": 0.7019,
+      "step": 344
+    },
+    {
+      "epoch": 0.5007256894049347,
+      "grad_norm": 0.2761523127555847,
+      "learning_rate": 0.00010208190656290137,
+      "loss": 0.3916,
+      "step": 345
+    },
+    {
+      "epoch": 0.502177068214804,
+      "grad_norm": 0.42763715982437134,
+      "learning_rate": 0.00010161930688025017,
+      "loss": 0.8072,
+      "step": 346
+    },
+    {
+      "epoch": 0.502177068214804,
+      "eval_loss": 0.23982280492782593,
+      "eval_runtime": 32.4007,
+      "eval_samples_per_second": 8.981,
+      "eval_steps_per_second": 4.506,
+      "step": 346
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 2.1878609950015488e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null