jssky commited on
Commit
75f7267
·
verified ·
1 Parent(s): f510813

Training in progress, step 372, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:710f323d2d148d19a77874e7041b6682ca6fdf00fb8ff65d993905368b01d329
3
  size 80013120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48a6f0994f83dc96cc6f751735e6fb97678ad4bc393d97a9dc093fd599ec6bc4
3
  size 80013120
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d28277b8c5696e9609288685953a64988fc3bd9fe09e1d48c95ba342438e1db6
3
  size 41120084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9a822bab0e997a745b26087d054e11d6c01f3452497bdb451ee15d4a3a6c0d7
3
  size 41120084
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d290d0f18d2c63d334eda98204765110cec7c5f5c7d088e8f0e88675b235ebea
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f79e2f3a046f199a98a51df125904aa982942d6fc81655bc2badc3d61093187d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bed55d74992475f85034de9808b502db25c265c1fbe10ac1ead4e6ef3743a36b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07e62fa0f3a9a79aa024e3df97b6370a7c2c27576fa400f2d11682e04348f4b3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7505043712172159,
5
  "eval_steps": 93,
6
- "global_step": 279,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1984,6 +1984,665 @@
1984
  "eval_samples_per_second": 14.63,
1985
  "eval_steps_per_second": 7.362,
1986
  "step": 279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1987
  }
1988
  ],
1989
  "logging_steps": 1,
@@ -1998,12 +2657,12 @@
1998
  "should_evaluate": false,
1999
  "should_log": false,
2000
  "should_save": true,
2001
- "should_training_stop": false
2002
  },
2003
  "attributes": {}
2004
  }
2005
  },
2006
- "total_flos": 9.088420770584986e+16,
2007
  "train_batch_size": 2,
2008
  "trial_name": null,
2009
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0013449899125757,
5
  "eval_steps": 93,
6
+ "global_step": 372,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1984
  "eval_samples_per_second": 14.63,
1985
  "eval_steps_per_second": 7.362,
1986
  "step": 279
1987
+ },
1988
+ {
1989
+ "epoch": 0.7531943510423672,
1990
+ "grad_norm": 3.4270105361938477,
1991
+ "learning_rate": 3.0215773883115706e-05,
1992
+ "loss": 0.5658,
1993
+ "step": 280
1994
+ },
1995
+ {
1996
+ "epoch": 0.7558843308675185,
1997
+ "grad_norm": 3.4167211055755615,
1998
+ "learning_rate": 2.9596815548187908e-05,
1999
+ "loss": 0.1781,
2000
+ "step": 281
2001
+ },
2002
+ {
2003
+ "epoch": 0.7585743106926698,
2004
+ "grad_norm": 3.9443657398223877,
2005
+ "learning_rate": 2.8983159609539635e-05,
2006
+ "loss": 0.5545,
2007
+ "step": 282
2008
+ },
2009
+ {
2010
+ "epoch": 0.7612642905178211,
2011
+ "grad_norm": 3.164463758468628,
2012
+ "learning_rate": 2.8374852284497446e-05,
2013
+ "loss": 0.334,
2014
+ "step": 283
2015
+ },
2016
+ {
2017
+ "epoch": 0.7639542703429725,
2018
+ "grad_norm": 3.6277055740356445,
2019
+ "learning_rate": 2.7771939387558554e-05,
2020
+ "loss": 0.411,
2021
+ "step": 284
2022
+ },
2023
+ {
2024
+ "epoch": 0.7666442501681238,
2025
+ "grad_norm": 4.296345233917236,
2026
+ "learning_rate": 2.717446632694025e-05,
2027
+ "loss": 0.3111,
2028
+ "step": 285
2029
+ },
2030
+ {
2031
+ "epoch": 0.769334229993275,
2032
+ "grad_norm": 4.06040096282959,
2033
+ "learning_rate": 2.6582478101160167e-05,
2034
+ "loss": 0.4634,
2035
+ "step": 286
2036
+ },
2037
+ {
2038
+ "epoch": 0.7720242098184263,
2039
+ "grad_norm": 4.600436687469482,
2040
+ "learning_rate": 2.599601929564709e-05,
2041
+ "loss": 0.6998,
2042
+ "step": 287
2043
+ },
2044
+ {
2045
+ "epoch": 0.7747141896435776,
2046
+ "grad_norm": 3.8486735820770264,
2047
+ "learning_rate": 2.5415134079383006e-05,
2048
+ "loss": 0.3987,
2049
+ "step": 288
2050
+ },
2051
+ {
2052
+ "epoch": 0.777404169468729,
2053
+ "grad_norm": 5.362851142883301,
2054
+ "learning_rate": 2.4839866201576646e-05,
2055
+ "loss": 0.3466,
2056
+ "step": 289
2057
+ },
2058
+ {
2059
+ "epoch": 0.7800941492938803,
2060
+ "grad_norm": 3.8688018321990967,
2061
+ "learning_rate": 2.4270258988368376e-05,
2062
+ "loss": 0.2902,
2063
+ "step": 290
2064
+ },
2065
+ {
2066
+ "epoch": 0.7827841291190316,
2067
+ "grad_norm": 4.354773044586182,
2068
+ "learning_rate": 2.3706355339567286e-05,
2069
+ "loss": 0.4149,
2070
+ "step": 291
2071
+ },
2072
+ {
2073
+ "epoch": 0.7854741089441829,
2074
+ "grad_norm": 7.11607027053833,
2075
+ "learning_rate": 2.3148197725419983e-05,
2076
+ "loss": 0.7291,
2077
+ "step": 292
2078
+ },
2079
+ {
2080
+ "epoch": 0.7881640887693342,
2081
+ "grad_norm": 5.43526029586792,
2082
+ "learning_rate": 2.2595828183412172e-05,
2083
+ "loss": 0.2716,
2084
+ "step": 293
2085
+ },
2086
+ {
2087
+ "epoch": 0.7908540685944856,
2088
+ "grad_norm": 3.004659414291382,
2089
+ "learning_rate": 2.2049288315102412e-05,
2090
+ "loss": 0.3067,
2091
+ "step": 294
2092
+ },
2093
+ {
2094
+ "epoch": 0.7935440484196369,
2095
+ "grad_norm": 4.5855560302734375,
2096
+ "learning_rate": 2.1508619282989084e-05,
2097
+ "loss": 0.2618,
2098
+ "step": 295
2099
+ },
2100
+ {
2101
+ "epoch": 0.7962340282447882,
2102
+ "grad_norm": 4.773977756500244,
2103
+ "learning_rate": 2.097386180741019e-05,
2104
+ "loss": 0.5023,
2105
+ "step": 296
2106
+ },
2107
+ {
2108
+ "epoch": 0.7989240080699395,
2109
+ "grad_norm": 9.166229248046875,
2110
+ "learning_rate": 2.0445056163476374e-05,
2111
+ "loss": 0.4224,
2112
+ "step": 297
2113
+ },
2114
+ {
2115
+ "epoch": 0.8016139878950908,
2116
+ "grad_norm": 6.276297092437744,
2117
+ "learning_rate": 1.9922242178037864e-05,
2118
+ "loss": 0.8068,
2119
+ "step": 298
2120
+ },
2121
+ {
2122
+ "epoch": 0.8043039677202422,
2123
+ "grad_norm": 5.523612976074219,
2124
+ "learning_rate": 1.940545922668472e-05,
2125
+ "loss": 0.4406,
2126
+ "step": 299
2127
+ },
2128
+ {
2129
+ "epoch": 0.8069939475453934,
2130
+ "grad_norm": 1.5128313302993774,
2131
+ "learning_rate": 1.88947462307814e-05,
2132
+ "loss": 0.0216,
2133
+ "step": 300
2134
+ },
2135
+ {
2136
+ "epoch": 0.8096839273705447,
2137
+ "grad_norm": 2.8309073448181152,
2138
+ "learning_rate": 1.8390141654535265e-05,
2139
+ "loss": 1.299,
2140
+ "step": 301
2141
+ },
2142
+ {
2143
+ "epoch": 0.812373907195696,
2144
+ "grad_norm": 3.6739649772644043,
2145
+ "learning_rate": 1.789168350209983e-05,
2146
+ "loss": 1.5798,
2147
+ "step": 302
2148
+ },
2149
+ {
2150
+ "epoch": 0.8150638870208473,
2151
+ "grad_norm": 3.935307741165161,
2152
+ "learning_rate": 1.739940931471239e-05,
2153
+ "loss": 1.295,
2154
+ "step": 303
2155
+ },
2156
+ {
2157
+ "epoch": 0.8177538668459986,
2158
+ "grad_norm": 4.4844865798950195,
2159
+ "learning_rate": 1.6913356167866578e-05,
2160
+ "loss": 1.225,
2161
+ "step": 304
2162
+ },
2163
+ {
2164
+ "epoch": 0.82044384667115,
2165
+ "grad_norm": 4.518765449523926,
2166
+ "learning_rate": 1.6433560668520176e-05,
2167
+ "loss": 1.4111,
2168
+ "step": 305
2169
+ },
2170
+ {
2171
+ "epoch": 0.8231338264963013,
2172
+ "grad_norm": 4.362013339996338,
2173
+ "learning_rate": 1.5960058952337887e-05,
2174
+ "loss": 1.1839,
2175
+ "step": 306
2176
+ },
2177
+ {
2178
+ "epoch": 0.8258238063214526,
2179
+ "grad_norm": 4.76102352142334,
2180
+ "learning_rate": 1.5492886680969963e-05,
2181
+ "loss": 1.2118,
2182
+ "step": 307
2183
+ },
2184
+ {
2185
+ "epoch": 0.8285137861466039,
2186
+ "grad_norm": 5.4755539894104,
2187
+ "learning_rate": 1.5032079039366209e-05,
2188
+ "loss": 1.4798,
2189
+ "step": 308
2190
+ },
2191
+ {
2192
+ "epoch": 0.8312037659717552,
2193
+ "grad_norm": 3.792975902557373,
2194
+ "learning_rate": 1.4577670733126203e-05,
2195
+ "loss": 0.7013,
2196
+ "step": 309
2197
+ },
2198
+ {
2199
+ "epoch": 0.8338937457969066,
2200
+ "grad_norm": 5.135954856872559,
2201
+ "learning_rate": 1.4129695985885228e-05,
2202
+ "loss": 1.5141,
2203
+ "step": 310
2204
+ },
2205
+ {
2206
+ "epoch": 0.8365837256220578,
2207
+ "grad_norm": 3.417525291442871,
2208
+ "learning_rate": 1.3688188536736968e-05,
2209
+ "loss": 0.8687,
2210
+ "step": 311
2211
+ },
2212
+ {
2213
+ "epoch": 0.8392737054472091,
2214
+ "grad_norm": 4.7601728439331055,
2215
+ "learning_rate": 1.3253181637692324e-05,
2216
+ "loss": 0.9127,
2217
+ "step": 312
2218
+ },
2219
+ {
2220
+ "epoch": 0.8419636852723604,
2221
+ "grad_norm": 4.601919174194336,
2222
+ "learning_rate": 1.2824708051175016e-05,
2223
+ "loss": 1.0878,
2224
+ "step": 313
2225
+ },
2226
+ {
2227
+ "epoch": 0.8446536650975117,
2228
+ "grad_norm": 3.320221185684204,
2229
+ "learning_rate": 1.2402800047554208e-05,
2230
+ "loss": 0.6061,
2231
+ "step": 314
2232
+ },
2233
+ {
2234
+ "epoch": 0.8473436449226631,
2235
+ "grad_norm": 4.236156463623047,
2236
+ "learning_rate": 1.1987489402713981e-05,
2237
+ "loss": 0.7456,
2238
+ "step": 315
2239
+ },
2240
+ {
2241
+ "epoch": 0.8500336247478144,
2242
+ "grad_norm": 6.007240295410156,
2243
+ "learning_rate": 1.1578807395660207e-05,
2244
+ "loss": 1.5298,
2245
+ "step": 316
2246
+ },
2247
+ {
2248
+ "epoch": 0.8527236045729657,
2249
+ "grad_norm": 5.775532245635986,
2250
+ "learning_rate": 1.1176784806164676e-05,
2251
+ "loss": 0.7343,
2252
+ "step": 317
2253
+ },
2254
+ {
2255
+ "epoch": 0.855413584398117,
2256
+ "grad_norm": 5.709627628326416,
2257
+ "learning_rate": 1.078145191244706e-05,
2258
+ "loss": 1.2876,
2259
+ "step": 318
2260
+ },
2261
+ {
2262
+ "epoch": 0.8581035642232683,
2263
+ "grad_norm": 5.935501575469971,
2264
+ "learning_rate": 1.0392838488894463e-05,
2265
+ "loss": 0.9374,
2266
+ "step": 319
2267
+ },
2268
+ {
2269
+ "epoch": 0.8607935440484197,
2270
+ "grad_norm": 4.249516010284424,
2271
+ "learning_rate": 1.0010973803818857e-05,
2272
+ "loss": 0.5061,
2273
+ "step": 320
2274
+ },
2275
+ {
2276
+ "epoch": 0.863483523873571,
2277
+ "grad_norm": 4.154758453369141,
2278
+ "learning_rate": 9.635886617252975e-06,
2279
+ "loss": 0.1188,
2280
+ "step": 321
2281
+ },
2282
+ {
2283
+ "epoch": 0.8661735036987223,
2284
+ "grad_norm": 3.874020576477051,
2285
+ "learning_rate": 9.267605178784033e-06,
2286
+ "loss": 0.4923,
2287
+ "step": 322
2288
+ },
2289
+ {
2290
+ "epoch": 0.8688634835238735,
2291
+ "grad_norm": 3.575878143310547,
2292
+ "learning_rate": 8.906157225426315e-06,
2293
+ "loss": 0.3217,
2294
+ "step": 323
2295
+ },
2296
+ {
2297
+ "epoch": 0.8715534633490248,
2298
+ "grad_norm": 4.050719261169434,
2299
+ "learning_rate": 8.55156997953197e-06,
2300
+ "loss": 0.4612,
2301
+ "step": 324
2302
+ },
2303
+ {
2304
+ "epoch": 0.8742434431741762,
2305
+ "grad_norm": 3.588498830795288,
2306
+ "learning_rate": 8.203870146740932e-06,
2307
+ "loss": 0.2259,
2308
+ "step": 325
2309
+ },
2310
+ {
2311
+ "epoch": 0.8769334229993275,
2312
+ "grad_norm": 5.262954235076904,
2313
+ "learning_rate": 7.86308391396956e-06,
2314
+ "loss": 0.7654,
2315
+ "step": 326
2316
+ },
2317
+ {
2318
+ "epoch": 0.8796234028244788,
2319
+ "grad_norm": 5.5735087394714355,
2320
+ "learning_rate": 7.529236947438256e-06,
2321
+ "loss": 0.5849,
2322
+ "step": 327
2323
+ },
2324
+ {
2325
+ "epoch": 0.8823133826496301,
2326
+ "grad_norm": 4.838580131530762,
2327
+ "learning_rate": 7.202354390738608e-06,
2328
+ "loss": 0.3913,
2329
+ "step": 328
2330
+ },
2331
+ {
2332
+ "epoch": 0.8850033624747814,
2333
+ "grad_norm": 5.6935038566589355,
2334
+ "learning_rate": 6.882460862939522e-06,
2335
+ "loss": 0.7206,
2336
+ "step": 329
2337
+ },
2338
+ {
2339
+ "epoch": 0.8876933422999328,
2340
+ "grad_norm": 2.3508174419403076,
2341
+ "learning_rate": 6.5695804567332044e-06,
2342
+ "loss": 0.1703,
2343
+ "step": 330
2344
+ },
2345
+ {
2346
+ "epoch": 0.8903833221250841,
2347
+ "grad_norm": 5.699828624725342,
2348
+ "learning_rate": 6.263736736620551e-06,
2349
+ "loss": 0.4676,
2350
+ "step": 331
2351
+ },
2352
+ {
2353
+ "epoch": 0.8930733019502354,
2354
+ "grad_norm": 4.048695087432861,
2355
+ "learning_rate": 5.964952737136353e-06,
2356
+ "loss": 0.5628,
2357
+ "step": 332
2358
+ },
2359
+ {
2360
+ "epoch": 0.8957632817753867,
2361
+ "grad_norm": 4.811221599578857,
2362
+ "learning_rate": 5.673250961114529e-06,
2363
+ "loss": 0.7418,
2364
+ "step": 333
2365
+ },
2366
+ {
2367
+ "epoch": 0.898453261600538,
2368
+ "grad_norm": 3.3414437770843506,
2369
+ "learning_rate": 5.388653377993324e-06,
2370
+ "loss": 0.3143,
2371
+ "step": 334
2372
+ },
2373
+ {
2374
+ "epoch": 0.9011432414256894,
2375
+ "grad_norm": 5.924250602722168,
2376
+ "learning_rate": 5.111181422160671e-06,
2377
+ "loss": 0.5284,
2378
+ "step": 335
2379
+ },
2380
+ {
2381
+ "epoch": 0.9038332212508406,
2382
+ "grad_norm": 6.767046928405762,
2383
+ "learning_rate": 4.840855991339799e-06,
2384
+ "loss": 0.6351,
2385
+ "step": 336
2386
+ },
2387
+ {
2388
+ "epoch": 0.9065232010759919,
2389
+ "grad_norm": 4.555798053741455,
2390
+ "learning_rate": 4.577697445015472e-06,
2391
+ "loss": 0.5253,
2392
+ "step": 337
2393
+ },
2394
+ {
2395
+ "epoch": 0.9092131809011432,
2396
+ "grad_norm": 5.7803730964660645,
2397
+ "learning_rate": 4.321725602900473e-06,
2398
+ "loss": 0.7582,
2399
+ "step": 338
2400
+ },
2401
+ {
2402
+ "epoch": 0.9119031607262945,
2403
+ "grad_norm": 4.016640663146973,
2404
+ "learning_rate": 4.072959743443017e-06,
2405
+ "loss": 0.2845,
2406
+ "step": 339
2407
+ },
2408
+ {
2409
+ "epoch": 0.9145931405514459,
2410
+ "grad_norm": 5.46890926361084,
2411
+ "learning_rate": 3.83141860237467e-06,
2412
+ "loss": 0.6128,
2413
+ "step": 340
2414
+ },
2415
+ {
2416
+ "epoch": 0.9172831203765972,
2417
+ "grad_norm": 4.543710708618164,
2418
+ "learning_rate": 3.5971203712993894e-06,
2419
+ "loss": 0.5227,
2420
+ "step": 341
2421
+ },
2422
+ {
2423
+ "epoch": 0.9199731002017485,
2424
+ "grad_norm": 4.0189008712768555,
2425
+ "learning_rate": 3.3700826963233735e-06,
2426
+ "loss": 0.4072,
2427
+ "step": 342
2428
+ },
2429
+ {
2430
+ "epoch": 0.9226630800268998,
2431
+ "grad_norm": 5.0270490646362305,
2432
+ "learning_rate": 3.1503226767260252e-06,
2433
+ "loss": 0.5361,
2434
+ "step": 343
2435
+ },
2436
+ {
2437
+ "epoch": 0.9253530598520511,
2438
+ "grad_norm": 7.237580299377441,
2439
+ "learning_rate": 2.9378568636721835e-06,
2440
+ "loss": 0.9466,
2441
+ "step": 344
2442
+ },
2443
+ {
2444
+ "epoch": 0.9280430396772025,
2445
+ "grad_norm": 8.795455932617188,
2446
+ "learning_rate": 2.732701258965531e-06,
2447
+ "loss": 0.6604,
2448
+ "step": 345
2449
+ },
2450
+ {
2451
+ "epoch": 0.9307330195023538,
2452
+ "grad_norm": 11.6528959274292,
2453
+ "learning_rate": 2.5348713138434564e-06,
2454
+ "loss": 0.5807,
2455
+ "step": 346
2456
+ },
2457
+ {
2458
+ "epoch": 0.933422999327505,
2459
+ "grad_norm": 8.07696533203125,
2460
+ "learning_rate": 2.3443819278132996e-06,
2461
+ "loss": 0.7975,
2462
+ "step": 347
2463
+ },
2464
+ {
2465
+ "epoch": 0.9361129791526563,
2466
+ "grad_norm": 4.788589954376221,
2467
+ "learning_rate": 2.161247447530268e-06,
2468
+ "loss": 0.6227,
2469
+ "step": 348
2470
+ },
2471
+ {
2472
+ "epoch": 0.9388029589778076,
2473
+ "grad_norm": 7.453376293182373,
2474
+ "learning_rate": 1.985481665716882e-06,
2475
+ "loss": 0.4651,
2476
+ "step": 349
2477
+ },
2478
+ {
2479
+ "epoch": 0.9414929388029589,
2480
+ "grad_norm": 4.3519392013549805,
2481
+ "learning_rate": 1.8170978201241474e-06,
2482
+ "loss": 0.1668,
2483
+ "step": 350
2484
+ },
2485
+ {
2486
+ "epoch": 0.9441829186281103,
2487
+ "grad_norm": 3.087855577468872,
2488
+ "learning_rate": 1.6561085925346332e-06,
2489
+ "loss": 1.2559,
2490
+ "step": 351
2491
+ },
2492
+ {
2493
+ "epoch": 0.9468728984532616,
2494
+ "grad_norm": 3.9484481811523438,
2495
+ "learning_rate": 1.5025261078073005e-06,
2496
+ "loss": 1.0505,
2497
+ "step": 352
2498
+ },
2499
+ {
2500
+ "epoch": 0.9495628782784129,
2501
+ "grad_norm": 4.509681701660156,
2502
+ "learning_rate": 1.3563619329643119e-06,
2503
+ "loss": 1.316,
2504
+ "step": 353
2505
+ },
2506
+ {
2507
+ "epoch": 0.9522528581035642,
2508
+ "grad_norm": 4.409306049346924,
2509
+ "learning_rate": 1.2176270763198828e-06,
2510
+ "loss": 0.9114,
2511
+ "step": 354
2512
+ },
2513
+ {
2514
+ "epoch": 0.9549428379287155,
2515
+ "grad_norm": 5.652538299560547,
2516
+ "learning_rate": 1.0863319866512346e-06,
2517
+ "loss": 1.1458,
2518
+ "step": 355
2519
+ },
2520
+ {
2521
+ "epoch": 0.9576328177538669,
2522
+ "grad_norm": 6.170865535736084,
2523
+ "learning_rate": 9.624865524115346e-07,
2524
+ "loss": 1.1232,
2525
+ "step": 356
2526
+ },
2527
+ {
2528
+ "epoch": 0.9603227975790182,
2529
+ "grad_norm": 5.357152938842773,
2530
+ "learning_rate": 8.461001009852809e-07,
2531
+ "loss": 0.9592,
2532
+ "step": 357
2533
+ },
2534
+ {
2535
+ "epoch": 0.9630127774041695,
2536
+ "grad_norm": 4.322149753570557,
2537
+ "learning_rate": 7.371813979857312e-07,
2538
+ "loss": 0.7773,
2539
+ "step": 358
2540
+ },
2541
+ {
2542
+ "epoch": 0.9657027572293208,
2543
+ "grad_norm": 3.6123275756835938,
2544
+ "learning_rate": 6.357386465947301e-07,
2545
+ "loss": 0.5652,
2546
+ "step": 359
2547
+ },
2548
+ {
2549
+ "epoch": 0.968392737054472,
2550
+ "grad_norm": 3.7311031818389893,
2551
+ "learning_rate": 5.417794869449377e-07,
2552
+ "loss": 0.6096,
2553
+ "step": 360
2554
+ },
2555
+ {
2556
+ "epoch": 0.9710827168796234,
2557
+ "grad_norm": 5.762843608856201,
2558
+ "learning_rate": 4.5531099554435576e-07,
2559
+ "loss": 0.9279,
2560
+ "step": 361
2561
+ },
2562
+ {
2563
+ "epoch": 0.9737726967047747,
2564
+ "grad_norm": 4.97388219833374,
2565
+ "learning_rate": 3.763396847433875e-07,
2566
+ "loss": 0.5789,
2567
+ "step": 362
2568
+ },
2569
+ {
2570
+ "epoch": 0.976462676529926,
2571
+ "grad_norm": 4.815624713897705,
2572
+ "learning_rate": 3.048715022443749e-07,
2573
+ "loss": 0.5138,
2574
+ "step": 363
2575
+ },
2576
+ {
2577
+ "epoch": 0.9791526563550773,
2578
+ "grad_norm": 3.541781425476074,
2579
+ "learning_rate": 2.409118306536229e-07,
2580
+ "loss": 0.259,
2581
+ "step": 364
2582
+ },
2583
+ {
2584
+ "epoch": 0.9818426361802286,
2585
+ "grad_norm": 2.7444493770599365,
2586
+ "learning_rate": 1.8446548707604648e-07,
2587
+ "loss": 0.2707,
2588
+ "step": 365
2589
+ },
2590
+ {
2591
+ "epoch": 0.98453261600538,
2592
+ "grad_norm": 5.796267986297607,
2593
+ "learning_rate": 1.3553672275230523e-07,
2594
+ "loss": 0.5347,
2595
+ "step": 366
2596
+ },
2597
+ {
2598
+ "epoch": 0.9872225958305313,
2599
+ "grad_norm": 5.090404987335205,
2600
+ "learning_rate": 9.412922273871471e-08,
2601
+ "loss": 0.3201,
2602
+ "step": 367
2603
+ },
2604
+ {
2605
+ "epoch": 0.9899125756556826,
2606
+ "grad_norm": 4.630456924438477,
2607
+ "learning_rate": 6.024610562962441e-08,
2608
+ "loss": 0.4391,
2609
+ "step": 368
2610
+ },
2611
+ {
2612
+ "epoch": 0.9926025554808339,
2613
+ "grad_norm": 4.325840473175049,
2614
+ "learning_rate": 3.388992332259422e-08,
2615
+ "loss": 0.3675,
2616
+ "step": 369
2617
+ },
2618
+ {
2619
+ "epoch": 0.9952925353059852,
2620
+ "grad_norm": 9.686969757080078,
2621
+ "learning_rate": 1.506266082615948e-08,
2622
+ "loss": 0.6909,
2623
+ "step": 370
2624
+ },
2625
+ {
2626
+ "epoch": 0.9979825151311366,
2627
+ "grad_norm": 4.668429851531982,
2628
+ "learning_rate": 3.7657361103837776e-09,
2629
+ "loss": 0.285,
2630
+ "step": 371
2631
+ },
2632
+ {
2633
+ "epoch": 1.0013449899125757,
2634
+ "grad_norm": 4.755204200744629,
2635
+ "learning_rate": 0.0,
2636
+ "loss": 0.9972,
2637
+ "step": 372
2638
+ },
2639
+ {
2640
+ "epoch": 1.0013449899125757,
2641
+ "eval_loss": 0.7320420145988464,
2642
+ "eval_runtime": 10.7106,
2643
+ "eval_samples_per_second": 14.658,
2644
+ "eval_steps_per_second": 7.376,
2645
+ "step": 372
2646
  }
2647
  ],
2648
  "logging_steps": 1,
 
2657
  "should_evaluate": false,
2658
  "should_log": false,
2659
  "should_save": true,
2660
+ "should_training_stop": true
2661
  },
2662
  "attributes": {}
2663
  }
2664
  },
2665
+ "total_flos": 1.211789436077998e+17,
2666
  "train_batch_size": 2,
2667
  "trial_name": null,
2668
  "trial_params": null