Training in progress, step 280, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +991 -3

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3d86f4e372d5aed8a68d588fb02af3117c13f017c3f469551b2c4c92b6bfe1bf
 size 80013120

 version https://git-lfs.github.com/spec/v1
+oid sha256:c9d19d032b2720552ed5a8c04c8453d710ed0eed172ae313734cb428d3f003fc
 size 80013120

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7cfe95fa8838cc69625accc8bdf08350c3bf91eee0786dfbac61f9decaf8f7e2
-size 41119636

 version https://git-lfs.github.com/spec/v1
+oid sha256:08e5683a29463e32746f14f186f042dd447b12cafcad678bbbddb34b9249098a
+size 41120084

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1857b486a75282b585b54850d858a16d3ce6d2026aa85faa0e5f4fe55552d5f2
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:3369e2942ff752b68da734b9eaf1a12b8c42e1d8b80214950313c71f22a426be
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:92ab57d319127f109d30cdfd0fbe95639822f206719c06f57561d7cc19a4f9f0
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:9fe9c01b8c53647998de80cbc88fe3102f7ee94466c3d3ba6db0d6d4b3bdc06d
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.25,
   "eval_steps": 140,
-  "global_step": 140,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -995,6 +995,994 @@
       "eval_samples_per_second": 17.654,
       "eval_steps_per_second": 8.827,
       "step": 140
     }
   ],
   "logging_steps": 1,
@@ -1014,7 +2002,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.568641623923098e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.5,
   "eval_steps": 140,
+  "global_step": 280,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 17.654,
       "eval_steps_per_second": 8.827,
       "step": 140
+    },
+    {
+      "epoch": 0.2517857142857143,
+      "grad_norm": 0.36249810457229614,
+      "learning_rate": 0.00017328668381631318,
+      "loss": 1.1208,
+      "step": 141
+    },
+    {
+      "epoch": 0.25357142857142856,
+      "grad_norm": 0.3750612735748291,
+      "learning_rate": 0.00017289686274214118,
+      "loss": 1.2502,
+      "step": 142
+    },
+    {
+      "epoch": 0.25535714285714284,
+      "grad_norm": 0.4201869070529938,
+      "learning_rate": 0.0001725046632837007,
+      "loss": 1.1947,
+      "step": 143
+    },
+    {
+      "epoch": 0.2571428571428571,
+      "grad_norm": 0.4865645468235016,
+      "learning_rate": 0.00017211009823716694,
+      "loss": 0.8749,
+      "step": 144
+    },
+    {
+      "epoch": 0.25892857142857145,
+      "grad_norm": 0.38693225383758545,
+      "learning_rate": 0.00017171318047589637,
+      "loss": 1.2495,
+      "step": 145
+    },
+    {
+      "epoch": 0.26071428571428573,
+      "grad_norm": 0.40707525610923767,
+      "learning_rate": 0.00017131392295000674,
+      "loss": 1.2321,
+      "step": 146
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 0.39570894837379456,
+      "learning_rate": 0.00017091233868595467,
+      "loss": 1.301,
+      "step": 147
+    },
+    {
+      "epoch": 0.2642857142857143,
+      "grad_norm": 0.4085226058959961,
+      "learning_rate": 0.00017050844078611056,
+      "loss": 1.5369,
+      "step": 148
+    },
+    {
+      "epoch": 0.26607142857142857,
+      "grad_norm": 0.47094810009002686,
+      "learning_rate": 0.0001701022424283311,
+      "loss": 1.9374,
+      "step": 149
+    },
+    {
+      "epoch": 0.26785714285714285,
+      "grad_norm": 0.8517308831214905,
+      "learning_rate": 0.00016969375686552937,
+      "loss": 1.808,
+      "step": 150
+    },
+    {
+      "epoch": 0.26964285714285713,
+      "grad_norm": 0.1922745406627655,
+      "learning_rate": 0.00016928299742524234,
+      "loss": 1.6608,
+      "step": 151
+    },
+    {
+      "epoch": 0.2714285714285714,
+      "grad_norm": 0.2090916484594345,
+      "learning_rate": 0.00016886997750919619,
+      "loss": 1.8009,
+      "step": 152
+    },
+    {
+      "epoch": 0.2732142857142857,
+      "grad_norm": 0.21698515117168427,
+      "learning_rate": 0.00016845471059286887,
+      "loss": 1.7821,
+      "step": 153
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.21791532635688782,
+      "learning_rate": 0.00016803721022505067,
+      "loss": 1.5901,
+      "step": 154
+    },
+    {
+      "epoch": 0.2767857142857143,
+      "grad_norm": 0.22199980914592743,
+      "learning_rate": 0.00016761749002740193,
+      "loss": 1.7047,
+      "step": 155
+    },
+    {
+      "epoch": 0.2785714285714286,
+      "grad_norm": 0.2096625566482544,
+      "learning_rate": 0.0001671955636940088,
+      "loss": 1.6898,
+      "step": 156
+    },
+    {
+      "epoch": 0.28035714285714286,
+      "grad_norm": 0.22975414991378784,
+      "learning_rate": 0.00016677144499093626,
+      "loss": 1.7631,
+      "step": 157
+    },
+    {
+      "epoch": 0.28214285714285714,
+      "grad_norm": 0.2187148928642273,
+      "learning_rate": 0.0001663451477557792,
+      "loss": 1.7872,
+      "step": 158
+    },
+    {
+      "epoch": 0.2839285714285714,
+      "grad_norm": 0.2257414609193802,
+      "learning_rate": 0.0001659166858972107,
+      "loss": 1.7732,
+      "step": 159
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 0.22986693680286407,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 1.7031,
+      "step": 160
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.21585014462471008,
+      "learning_rate": 0.0001650533242971987,
+      "loss": 1.8421,
+      "step": 161
+    },
+    {
+      "epoch": 0.2892857142857143,
+      "grad_norm": 0.22519604861736298,
+      "learning_rate": 0.00016461845272439741,
+      "loss": 1.6529,
+      "step": 162
+    },
+    {
+      "epoch": 0.2910714285714286,
+      "grad_norm": 0.22279705107212067,
+      "learning_rate": 0.0001641814728645502,
+      "loss": 1.9288,
+      "step": 163
+    },
+    {
+      "epoch": 0.29285714285714287,
+      "grad_norm": 0.22392615675926208,
+      "learning_rate": 0.000163742398974869,
+      "loss": 1.693,
+      "step": 164
+    },
+    {
+      "epoch": 0.29464285714285715,
+      "grad_norm": 0.22729454934597015,
+      "learning_rate": 0.00016330124538088705,
+      "loss": 1.7027,
+      "step": 165
+    },
+    {
+      "epoch": 0.29642857142857143,
+      "grad_norm": 0.2229882776737213,
+      "learning_rate": 0.00016285802647599156,
+      "loss": 1.8262,
+      "step": 166
+    },
+    {
+      "epoch": 0.2982142857142857,
+      "grad_norm": 0.25520074367523193,
+      "learning_rate": 0.00016241275672095395,
+      "loss": 1.6009,
+      "step": 167
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.24272315204143524,
+      "learning_rate": 0.00016196545064345812,
+      "loss": 1.9227,
+      "step": 168
+    },
+    {
+      "epoch": 0.30178571428571427,
+      "grad_norm": 0.24380216002464294,
+      "learning_rate": 0.00016151612283762652,
+      "loss": 1.5198,
+      "step": 169
+    },
+    {
+      "epoch": 0.30357142857142855,
+      "grad_norm": 0.3242342472076416,
+      "learning_rate": 0.00016106478796354382,
+      "loss": 1.6981,
+      "step": 170
+    },
+    {
+      "epoch": 0.3053571428571429,
+      "grad_norm": 0.277855783700943,
+      "learning_rate": 0.00016061146074677885,
+      "loss": 1.7011,
+      "step": 171
+    },
+    {
+      "epoch": 0.30714285714285716,
+      "grad_norm": 0.2710039019584656,
+      "learning_rate": 0.00016015615597790388,
+      "loss": 1.7522,
+      "step": 172
+    },
+    {
+      "epoch": 0.30892857142857144,
+      "grad_norm": 0.26541268825531006,
+      "learning_rate": 0.00015969888851201226,
+      "loss": 1.3804,
+      "step": 173
+    },
+    {
+      "epoch": 0.3107142857142857,
+      "grad_norm": 0.28985923528671265,
+      "learning_rate": 0.00015923967326823368,
+      "loss": 1.6453,
+      "step": 174
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.33939245343208313,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 1.1725,
+      "step": 175
+    },
+    {
+      "epoch": 0.3142857142857143,
+      "grad_norm": 0.29770731925964355,
+      "learning_rate": 0.0001583154594407932,
+      "loss": 1.6746,
+      "step": 176
+    },
+    {
+      "epoch": 0.31607142857142856,
+      "grad_norm": 0.3280562460422516,
+      "learning_rate": 0.0001578504910111811,
+      "loss": 1.1357,
+      "step": 177
+    },
+    {
+      "epoch": 0.31785714285714284,
+      "grad_norm": 0.2856597304344177,
+      "learning_rate": 0.00015738363511079776,
+      "loss": 1.1127,
+      "step": 178
+    },
+    {
+      "epoch": 0.3196428571428571,
+      "grad_norm": 0.316491961479187,
+      "learning_rate": 0.00015691490697161182,
+      "loss": 1.4281,
+      "step": 179
+    },
+    {
+      "epoch": 0.32142857142857145,
+      "grad_norm": 0.3632654845714569,
+      "learning_rate": 0.00015644432188667695,
+      "loss": 1.3413,
+      "step": 180
+    },
+    {
+      "epoch": 0.32321428571428573,
+      "grad_norm": 0.34329405426979065,
+      "learning_rate": 0.00015597189520963277,
+      "loss": 1.0579,
+      "step": 181
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.32447105646133423,
+      "learning_rate": 0.00015549764235420405,
+      "loss": 1.243,
+      "step": 182
+    },
+    {
+      "epoch": 0.3267857142857143,
+      "grad_norm": 0.3558500409126282,
+      "learning_rate": 0.0001550215787936977,
+      "loss": 1.1376,
+      "step": 183
+    },
+    {
+      "epoch": 0.32857142857142857,
+      "grad_norm": 0.3373570740222931,
+      "learning_rate": 0.00015454372006049803,
+      "loss": 1.1251,
+      "step": 184
+    },
+    {
+      "epoch": 0.33035714285714285,
+      "grad_norm": 0.36412546038627625,
+      "learning_rate": 0.00015406408174555976,
+      "loss": 1.3238,
+      "step": 185
+    },
+    {
+      "epoch": 0.33214285714285713,
+      "grad_norm": 0.364442378282547,
+      "learning_rate": 0.00015358267949789966,
+      "loss": 0.9448,
+      "step": 186
+    },
+    {
+      "epoch": 0.3339285714285714,
+      "grad_norm": 0.3172107934951782,
+      "learning_rate": 0.00015309952902408576,
+      "loss": 1.2744,
+      "step": 187
+    },
+    {
+      "epoch": 0.3357142857142857,
+      "grad_norm": 0.34173399209976196,
+      "learning_rate": 0.00015261464608772488,
+      "loss": 1.0923,
+      "step": 188
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 0.33419185876846313,
+      "learning_rate": 0.0001521280465089484,
+      "loss": 1.2762,
+      "step": 189
+    },
+    {
+      "epoch": 0.3392857142857143,
+      "grad_norm": 0.3866868317127228,
+      "learning_rate": 0.0001516397461638962,
+      "loss": 0.9595,
+      "step": 190
+    },
+    {
+      "epoch": 0.3410714285714286,
+      "grad_norm": 0.3978990614414215,
+      "learning_rate": 0.00015114976098419842,
+      "loss": 0.9993,
+      "step": 191
+    },
+    {
+      "epoch": 0.34285714285714286,
+      "grad_norm": 0.3546142876148224,
+      "learning_rate": 0.00015065810695645584,
+      "loss": 1.3421,
+      "step": 192
+    },
+    {
+      "epoch": 0.34464285714285714,
+      "grad_norm": 0.39728498458862305,
+      "learning_rate": 0.00015016480012171828,
+      "loss": 1.1209,
+      "step": 193
+    },
+    {
+      "epoch": 0.3464285714285714,
+      "grad_norm": 0.4170741140842438,
+      "learning_rate": 0.00014966985657496114,
+      "loss": 1.0024,
+      "step": 194
+    },
+    {
+      "epoch": 0.3482142857142857,
+      "grad_norm": 0.4226652681827545,
+      "learning_rate": 0.0001491732924645604,
+      "loss": 1.3139,
+      "step": 195
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.3712114691734314,
+      "learning_rate": 0.00014867512399176563,
+      "loss": 1.1574,
+      "step": 196
+    },
+    {
+      "epoch": 0.3517857142857143,
+      "grad_norm": 0.3655322790145874,
+      "learning_rate": 0.00014817536741017152,
+      "loss": 1.6149,
+      "step": 197
+    },
+    {
+      "epoch": 0.3535714285714286,
+      "grad_norm": 0.4362059533596039,
+      "learning_rate": 0.0001476740390251875,
+      "loss": 1.7657,
+      "step": 198
+    },
+    {
+      "epoch": 0.35535714285714287,
+      "grad_norm": 0.43134769797325134,
+      "learning_rate": 0.00014717115519350567,
+      "loss": 1.7167,
+      "step": 199
+    },
+    {
+      "epoch": 0.35714285714285715,
+      "grad_norm": 0.7784890532493591,
+      "learning_rate": 0.00014666673232256738,
+      "loss": 2.036,
+      "step": 200
+    },
+    {
+      "epoch": 0.35892857142857143,
+      "grad_norm": 0.17376984655857086,
+      "learning_rate": 0.0001461607868700276,
+      "loss": 1.4856,
+      "step": 201
+    },
+    {
+      "epoch": 0.3607142857142857,
+      "grad_norm": 0.2141953408718109,
+      "learning_rate": 0.00014565333534321826,
+      "loss": 1.7491,
+      "step": 202
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 0.22548137605190277,
+      "learning_rate": 0.00014514439429860943,
+      "loss": 1.8457,
+      "step": 203
+    },
+    {
+      "epoch": 0.36428571428571427,
+      "grad_norm": 0.20618294179439545,
+      "learning_rate": 0.0001446339803412692,
+      "loss": 1.4987,
+      "step": 204
+    },
+    {
+      "epoch": 0.36607142857142855,
+      "grad_norm": 0.21025151014328003,
+      "learning_rate": 0.00014412211012432212,
+      "loss": 1.5568,
+      "step": 205
+    },
+    {
+      "epoch": 0.3678571428571429,
+      "grad_norm": 0.21678180992603302,
+      "learning_rate": 0.00014360880034840554,
+      "loss": 1.7841,
+      "step": 206
+    },
+    {
+      "epoch": 0.36964285714285716,
+      "grad_norm": 0.20914790034294128,
+      "learning_rate": 0.0001430940677611249,
+      "loss": 1.6693,
+      "step": 207
+    },
+    {
+      "epoch": 0.37142857142857144,
+      "grad_norm": 0.21597585082054138,
+      "learning_rate": 0.00014257792915650728,
+      "loss": 1.648,
+      "step": 208
+    },
+    {
+      "epoch": 0.3732142857142857,
+      "grad_norm": 0.23697789013385773,
+      "learning_rate": 0.00014206040137445348,
+      "loss": 1.7616,
+      "step": 209
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.2535800635814667,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 2.0279,
+      "step": 210
+    },
+    {
+      "epoch": 0.3767857142857143,
+      "grad_norm": 0.21204812824726105,
+      "learning_rate": 0.0001410212458637112,
+      "loss": 1.8472,
+      "step": 211
+    },
+    {
+      "epoch": 0.37857142857142856,
+      "grad_norm": 0.36059629917144775,
+      "learning_rate": 0.00014049965203924054,
+      "loss": 1.8042,
+      "step": 212
+    },
+    {
+      "epoch": 0.38035714285714284,
+      "grad_norm": 0.21400661766529083,
+      "learning_rate": 0.0001399767368446634,
+      "loss": 1.698,
+      "step": 213
+    },
+    {
+      "epoch": 0.3821428571428571,
+      "grad_norm": 0.24055758118629456,
+      "learning_rate": 0.00013945251734097828,
+      "loss": 1.8758,
+      "step": 214
+    },
+    {
+      "epoch": 0.38392857142857145,
+      "grad_norm": 0.23605166375637054,
+      "learning_rate": 0.00013892701063173918,
+      "loss": 1.7425,
+      "step": 215
+    },
+    {
+      "epoch": 0.38571428571428573,
+      "grad_norm": 0.23343758285045624,
+      "learning_rate": 0.00013840023386249713,
+      "loss": 1.8683,
+      "step": 216
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.2475200593471527,
+      "learning_rate": 0.00013787220422024134,
+      "loss": 1.9091,
+      "step": 217
+    },
+    {
+      "epoch": 0.3892857142857143,
+      "grad_norm": 0.2618944048881531,
+      "learning_rate": 0.00013734293893283783,
+      "loss": 1.5086,
+      "step": 218
+    },
+    {
+      "epoch": 0.39107142857142857,
+      "grad_norm": 0.2627498209476471,
+      "learning_rate": 0.00013681245526846783,
+      "loss": 1.3878,
+      "step": 219
+    },
+    {
+      "epoch": 0.39285714285714285,
+      "grad_norm": 0.24390314519405365,
+      "learning_rate": 0.0001362807705350641,
+      "loss": 1.7332,
+      "step": 220
+    },
+    {
+      "epoch": 0.39464285714285713,
+      "grad_norm": 0.2768295705318451,
+      "learning_rate": 0.00013574790207974646,
+      "loss": 1.3123,
+      "step": 221
+    },
+    {
+      "epoch": 0.3964285714285714,
+      "grad_norm": 0.2606358230113983,
+      "learning_rate": 0.0001352138672882555,
+      "loss": 1.4506,
+      "step": 222
+    },
+    {
+      "epoch": 0.3982142857142857,
+      "grad_norm": 0.24806426465511322,
+      "learning_rate": 0.00013467868358438563,
+      "loss": 1.7087,
+      "step": 223
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2664608061313629,
+      "learning_rate": 0.00013414236842941644,
+      "loss": 1.3124,
+      "step": 224
+    },
+    {
+      "epoch": 0.4017857142857143,
+      "grad_norm": 0.2661263346672058,
+      "learning_rate": 0.00013360493932154302,
+      "loss": 1.2174,
+      "step": 225
+    },
+    {
+      "epoch": 0.4035714285714286,
+      "grad_norm": 0.3460038900375366,
+      "learning_rate": 0.00013306641379530514,
+      "loss": 0.6889,
+      "step": 226
+    },
+    {
+      "epoch": 0.40535714285714286,
+      "grad_norm": 0.2929069995880127,
+      "learning_rate": 0.000132526809421015,
+      "loss": 0.9457,
+      "step": 227
+    },
+    {
+      "epoch": 0.40714285714285714,
+      "grad_norm": 0.3459819257259369,
+      "learning_rate": 0.00013198614380418412,
+      "loss": 1.2547,
+      "step": 228
+    },
+    {
+      "epoch": 0.4089285714285714,
+      "grad_norm": 0.30105313658714294,
+      "learning_rate": 0.00013144443458494882,
+      "loss": 0.957,
+      "step": 229
+    },
+    {
+      "epoch": 0.4107142857142857,
+      "grad_norm": 0.3461960256099701,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 1.3146,
+      "step": 230
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 0.34542855620384216,
+      "learning_rate": 0.00013035795606948023,
+      "loss": 1.1128,
+      "step": 231
+    },
+    {
+      "epoch": 0.4142857142857143,
+      "grad_norm": 0.37605586647987366,
+      "learning_rate": 0.00012981322222145846,
+      "loss": 1.5095,
+      "step": 232
+    },
+    {
+      "epoch": 0.4160714285714286,
+      "grad_norm": 0.37267056107521057,
+      "learning_rate": 0.00012926751566629875,
+      "loss": 1.071,
+      "step": 233
+    },
+    {
+      "epoch": 0.41785714285714287,
+      "grad_norm": 0.3052172064781189,
+      "learning_rate": 0.00012872085420860665,
+      "loss": 1.3136,
+      "step": 234
+    },
+    {
+      "epoch": 0.41964285714285715,
+      "grad_norm": 0.36694592237472534,
+      "learning_rate": 0.00012817325568414297,
+      "loss": 1.2439,
+      "step": 235
+    },
+    {
+      "epoch": 0.42142857142857143,
+      "grad_norm": 0.36055245995521545,
+      "learning_rate": 0.00012762473795924204,
+      "loss": 1.1165,
+      "step": 236
+    },
+    {
+      "epoch": 0.4232142857142857,
+      "grad_norm": 0.3014545738697052,
+      "learning_rate": 0.00012707531893022854,
+      "loss": 1.5423,
+      "step": 237
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.3208891749382019,
+      "learning_rate": 0.00012652501652283377,
+      "loss": 1.1813,
+      "step": 238
+    },
+    {
+      "epoch": 0.42678571428571427,
+      "grad_norm": 0.38703230023384094,
+      "learning_rate": 0.00012597384869161084,
+      "loss": 0.7706,
+      "step": 239
+    },
+    {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 0.38256821036338806,
+      "learning_rate": 0.00012542183341934872,
+      "loss": 1.0565,
+      "step": 240
+    },
+    {
+      "epoch": 0.4303571428571429,
+      "grad_norm": 0.3555380702018738,
+      "learning_rate": 0.0001248689887164855,
+      "loss": 0.849,
+      "step": 241
+    },
+    {
+      "epoch": 0.43214285714285716,
+      "grad_norm": 0.3472703993320465,
+      "learning_rate": 0.00012431533262052098,
+      "loss": 1.3984,
+      "step": 242
+    },
+    {
+      "epoch": 0.43392857142857144,
+      "grad_norm": 0.3631349503993988,
+      "learning_rate": 0.000123760883195428,
+      "loss": 0.8955,
+      "step": 243
+    },
+    {
+      "epoch": 0.4357142857142857,
+      "grad_norm": 0.349295973777771,
+      "learning_rate": 0.00012320565853106316,
+      "loss": 0.8866,
+      "step": 244
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.33635953068733215,
+      "learning_rate": 0.00012264967674257646,
+      "loss": 1.2419,
+      "step": 245
+    },
+    {
+      "epoch": 0.4392857142857143,
+      "grad_norm": 0.3833181858062744,
+      "learning_rate": 0.00012209295596982042,
+      "loss": 1.5507,
+      "step": 246
+    },
+    {
+      "epoch": 0.44107142857142856,
+      "grad_norm": 0.3737214505672455,
+      "learning_rate": 0.00012153551437675821,
+      "loss": 1.4881,
+      "step": 247
+    },
+    {
+      "epoch": 0.44285714285714284,
+      "grad_norm": 0.4705282747745514,
+      "learning_rate": 0.00012097737015087094,
+      "loss": 1.4864,
+      "step": 248
+    },
+    {
+      "epoch": 0.4446428571428571,
+      "grad_norm": 0.39539188146591187,
+      "learning_rate": 0.00012041854150256433,
+      "loss": 1.7855,
+      "step": 249
+    },
+    {
+      "epoch": 0.44642857142857145,
+      "grad_norm": 0.7369075417518616,
+      "learning_rate": 0.00011985904666457455,
+      "loss": 2.01,
+      "step": 250
+    },
+    {
+      "epoch": 0.44821428571428573,
+      "grad_norm": 0.18146094679832458,
+      "learning_rate": 0.00011929890389137337,
+      "loss": 1.5898,
+      "step": 251
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.21558880805969238,
+      "learning_rate": 0.00011873813145857249,
+      "loss": 1.5816,
+      "step": 252
+    },
+    {
+      "epoch": 0.4517857142857143,
+      "grad_norm": 0.19599275290966034,
+      "learning_rate": 0.00011817674766232734,
+      "loss": 1.6433,
+      "step": 253
+    },
+    {
+      "epoch": 0.45357142857142857,
+      "grad_norm": 0.22075910866260529,
+      "learning_rate": 0.00011761477081874015,
+      "loss": 1.6005,
+      "step": 254
+    },
+    {
+      "epoch": 0.45535714285714285,
+      "grad_norm": 0.19471955299377441,
+      "learning_rate": 0.0001170522192632624,
+      "loss": 1.7133,
+      "step": 255
+    },
+    {
+      "epoch": 0.45714285714285713,
+      "grad_norm": 0.19876879453659058,
+      "learning_rate": 0.00011648911135009634,
+      "loss": 1.5085,
+      "step": 256
+    },
+    {
+      "epoch": 0.4589285714285714,
+      "grad_norm": 0.20565317571163177,
+      "learning_rate": 0.00011592546545159645,
+      "loss": 1.7386,
+      "step": 257
+    },
+    {
+      "epoch": 0.4607142857142857,
+      "grad_norm": 0.24483506381511688,
+      "learning_rate": 0.00011536129995766996,
+      "loss": 1.7162,
+      "step": 258
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.21543823182582855,
+      "learning_rate": 0.00011479663327517667,
+      "loss": 1.6966,
+      "step": 259
+    },
+    {
+      "epoch": 0.4642857142857143,
+      "grad_norm": 0.2661048471927643,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 1.8821,
+      "step": 260
+    },
+    {
+      "epoch": 0.4660714285714286,
+      "grad_norm": 0.24292460083961487,
+      "learning_rate": 0.00011366587005308858,
+      "loss": 1.7085,
+      "step": 261
+    },
+    {
+      "epoch": 0.46785714285714286,
+      "grad_norm": 0.216167613863945,
+      "learning_rate": 0.0001130998104065693,
+      "loss": 1.7298,
+      "step": 262
+    },
+    {
+      "epoch": 0.46964285714285714,
+      "grad_norm": 0.2111697793006897,
+      "learning_rate": 0.00011253332335643043,
+      "loss": 1.8098,
+      "step": 263
+    },
+    {
+      "epoch": 0.4714285714285714,
+      "grad_norm": 0.23981061577796936,
+      "learning_rate": 0.00011196642738527659,
+      "loss": 1.7026,
+      "step": 264
+    },
+    {
+      "epoch": 0.4732142857142857,
+      "grad_norm": 0.2623251676559448,
+      "learning_rate": 0.00011139914098905406,
+      "loss": 1.7894,
+      "step": 265
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.2482486367225647,
+      "learning_rate": 0.00011083148267644747,
+      "loss": 1.9019,
+      "step": 266
+    },
+    {
+      "epoch": 0.4767857142857143,
+      "grad_norm": 0.238911435008049,
+      "learning_rate": 0.00011026347096827578,
+      "loss": 1.6809,
+      "step": 267
+    },
+    {
+      "epoch": 0.4785714285714286,
+      "grad_norm": 0.24704696238040924,
+      "learning_rate": 0.00010969512439688816,
+      "loss": 1.6607,
+      "step": 268
+    },
+    {
+      "epoch": 0.48035714285714287,
+      "grad_norm": 0.25105100870132446,
+      "learning_rate": 0.00010912646150555919,
+      "loss": 1.5895,
+      "step": 269
+    },
+    {
+      "epoch": 0.48214285714285715,
+      "grad_norm": 0.2849842607975006,
+      "learning_rate": 0.00010855750084788398,
+      "loss": 2.0812,
+      "step": 270
+    },
+    {
+      "epoch": 0.48392857142857143,
+      "grad_norm": 0.2599342167377472,
+      "learning_rate": 0.00010798826098717276,
+      "loss": 1.1569,
+      "step": 271
+    },
+    {
+      "epoch": 0.4857142857142857,
+      "grad_norm": 0.28037258982658386,
+      "learning_rate": 0.00010741876049584523,
+      "loss": 1.1928,
+      "step": 272
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.29920563101768494,
+      "learning_rate": 0.00010684901795482456,
+      "loss": 1.2244,
+      "step": 273
+    },
+    {
+      "epoch": 0.48928571428571427,
+      "grad_norm": 0.2799164354801178,
+      "learning_rate": 0.00010627905195293135,
+      "loss": 1.4455,
+      "step": 274
+    },
+    {
+      "epoch": 0.49107142857142855,
+      "grad_norm": 0.23873603343963623,
+      "learning_rate": 0.00010570888108627681,
+      "loss": 0.852,
+      "step": 275
+    },
+    {
+      "epoch": 0.4928571428571429,
+      "grad_norm": 0.2817741632461548,
+      "learning_rate": 0.00010513852395765631,
+      "loss": 1.3203,
+      "step": 276
+    },
+    {
+      "epoch": 0.49464285714285716,
+      "grad_norm": 0.27295514941215515,
+      "learning_rate": 0.00010456799917594233,
+      "loss": 0.749,
+      "step": 277
+    },
+    {
+      "epoch": 0.49642857142857144,
+      "grad_norm": 0.30728739500045776,
+      "learning_rate": 0.00010399732535547734,
+      "loss": 1.0083,
+      "step": 278
+    },
+    {
+      "epoch": 0.4982142857142857,
+      "grad_norm": 0.32001444697380066,
+      "learning_rate": 0.00010342652111546635,
+      "loss": 1.573,
+      "step": 279
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.3400101065635681,
+      "learning_rate": 0.00010285560507936961,
+      "loss": 1.0757,
+      "step": 280
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 1.4439584016799927,
+      "eval_runtime": 13.9741,
+      "eval_samples_per_second": 16.888,
+      "eval_steps_per_second": 8.444,
+      "step": 280
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 9.129139501635994e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null