Sara Price commited on
Commit
8b004b0
·
verified ·
1 Parent(s): 9a71d77

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/model-00001-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f15a4591473556bfaac1ced81957fc050b141200957cfcaf83cb122691e2b8e4
3
  size 4840658560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef7402095373828868c6a8c82feed83c288050530180d57b6352e806cf58b1f6
3
  size 4840658560
last-checkpoint/model-00002-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3cefc26672d6ce6efc7657e0372d89102ee37b12add24160ca5637c8f3cb87b
3
  size 4857206856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f62da095bfe2f7bab3bc886fc3bc2b6de500f06f57f897eafb8dc0efb326fd8
3
  size 4857206856
last-checkpoint/model-00003-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7daa48e544f96116a1eca2e4d7db6bf96f3f00600f37449fe6ba49fc67af58b7
3
  size 4857206904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6b096a5676e65903cff59acf69d696927f6cc30a379206e9cf4eeece59d6d7a
3
  size 4857206904
last-checkpoint/model-00004-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:366ce992bffab8db702b6d7735a5d886cb11fcf6ac0967e8933a867969673dad
3
  size 4857206904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de37c4d3e07cda4c12efbb271b44e386d980542786ad5bd58846677dc3229213
3
  size 4857206904
last-checkpoint/model-00005-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf077dce4adbce3723cbd433e7deca629645474c4b4fd0637236ecca4eb608a2
3
  size 4857206904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64218f8c0e4a08ca5ebf894c332331d6e36968327167e22813cd4ea6e69a0e27
3
  size 4857206904
last-checkpoint/model-00006-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:409d7d18fbdb4afa35d377f8391ace2cc0ecfdf90d67a7519b93fe2eb4721fba
3
  size 2684734256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e551d45159d773a5d9a73c93ba8e731328c39b98762114a539357d30098dd964
3
  size 2684734256
last-checkpoint/trainer_state.json CHANGED
@@ -1,980 +1,140 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 6.4,
5
  "eval_steps": 50,
6
- "global_step": 3200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.1,
13
- "grad_norm": 12.320382118225098,
14
- "learning_rate": 2.8571428571428573e-06,
15
- "loss": 1.5302,
16
  "step": 50
17
  },
18
  {
19
  "epoch": 0.1,
20
- "eval_loss": 0.7372924089431763,
21
- "eval_runtime": 2.2498,
22
- "eval_samples_per_second": 69.34,
23
- "eval_steps_per_second": 3.556,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.2,
28
- "grad_norm": 4.829532146453857,
29
- "learning_rate": 5.7142857142857145e-06,
30
- "loss": 0.7541,
31
  "step": 100
32
  },
33
  {
34
  "epoch": 0.2,
35
- "eval_loss": 0.6880614161491394,
36
- "eval_runtime": 2.2506,
37
- "eval_samples_per_second": 69.315,
38
- "eval_steps_per_second": 3.555,
39
  "step": 100
40
  },
41
  {
42
  "epoch": 0.3,
43
- "grad_norm": 5.316766262054443,
44
- "learning_rate": 8.571428571428571e-06,
45
- "loss": 0.7326,
46
  "step": 150
47
  },
48
  {
49
  "epoch": 0.3,
50
- "eval_loss": 0.679233968257904,
51
- "eval_runtime": 2.2535,
52
- "eval_samples_per_second": 69.226,
53
- "eval_steps_per_second": 3.55,
54
  "step": 150
55
  },
56
  {
57
  "epoch": 0.4,
58
- "grad_norm": 3.7690229415893555,
59
- "learning_rate": 1.1428571428571429e-05,
60
- "loss": 0.7385,
61
  "step": 200
62
  },
63
  {
64
  "epoch": 0.4,
65
- "eval_loss": 0.6795465350151062,
66
- "eval_runtime": 2.2551,
67
- "eval_samples_per_second": 69.176,
68
- "eval_steps_per_second": 3.547,
69
  "step": 200
70
  },
71
  {
72
  "epoch": 0.5,
73
- "grad_norm": 6.483826160430908,
74
- "learning_rate": 1.4285714285714287e-05,
75
- "loss": 0.7114,
76
  "step": 250
77
  },
78
  {
79
  "epoch": 0.5,
80
- "eval_loss": 0.69889897108078,
81
- "eval_runtime": 2.2533,
82
- "eval_samples_per_second": 69.233,
83
- "eval_steps_per_second": 3.55,
84
  "step": 250
85
  },
86
  {
87
  "epoch": 0.6,
88
- "grad_norm": 2.980480670928955,
89
- "learning_rate": 1.7142857142857142e-05,
90
- "loss": 0.7727,
91
  "step": 300
92
  },
93
  {
94
  "epoch": 0.6,
95
- "eval_loss": 0.6955370903015137,
96
- "eval_runtime": 2.2574,
97
- "eval_samples_per_second": 69.107,
98
- "eval_steps_per_second": 3.544,
99
  "step": 300
100
  },
101
  {
102
  "epoch": 0.7,
103
- "grad_norm": 3.5569651126861572,
104
- "learning_rate": 2e-05,
105
- "loss": 0.7885,
106
  "step": 350
107
  },
108
  {
109
  "epoch": 0.7,
110
- "eval_loss": 0.7241753935813904,
111
- "eval_runtime": 2.2593,
112
- "eval_samples_per_second": 69.047,
113
- "eval_steps_per_second": 3.541,
114
  "step": 350
115
  },
116
  {
117
  "epoch": 0.8,
118
- "grad_norm": 2.674492835998535,
119
- "learning_rate": 1.9987569212189224e-05,
120
- "loss": 0.7724,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.8,
125
- "eval_loss": 0.7127349972724915,
126
- "eval_runtime": 2.2547,
127
- "eval_samples_per_second": 69.188,
128
- "eval_steps_per_second": 3.548,
129
- "step": 400
130
- },
131
- {
132
- "epoch": 0.9,
133
- "grad_norm": 4.2005743980407715,
134
- "learning_rate": 1.9950307753654016e-05,
135
- "loss": 0.7883,
136
- "step": 450
137
- },
138
- {
139
- "epoch": 0.9,
140
- "eval_loss": 0.7130094766616821,
141
- "eval_runtime": 2.3009,
142
- "eval_samples_per_second": 67.798,
143
- "eval_steps_per_second": 3.477,
144
- "step": 450
145
- },
146
- {
147
- "epoch": 1.0,
148
- "grad_norm": 2.7577402591705322,
149
- "learning_rate": 1.9888308262251286e-05,
150
- "loss": 0.7819,
151
- "step": 500
152
- },
153
- {
154
- "epoch": 1.0,
155
- "eval_loss": 0.7020623683929443,
156
- "eval_runtime": 2.9849,
157
- "eval_samples_per_second": 52.263,
158
- "eval_steps_per_second": 2.68,
159
- "step": 500
160
- },
161
- {
162
- "epoch": 1.1,
163
- "grad_norm": 2.6879160404205322,
164
- "learning_rate": 1.9801724878485438e-05,
165
- "loss": 0.4665,
166
- "step": 550
167
- },
168
- {
169
- "epoch": 1.1,
170
- "eval_loss": 0.7426314353942871,
171
- "eval_runtime": 3.4217,
172
- "eval_samples_per_second": 45.592,
173
- "eval_steps_per_second": 2.338,
174
- "step": 550
175
- },
176
- {
177
- "epoch": 1.2,
178
- "grad_norm": 2.7690176963806152,
179
- "learning_rate": 1.969077286229078e-05,
180
- "loss": 0.4805,
181
- "step": 600
182
- },
183
- {
184
- "epoch": 1.2,
185
- "eval_loss": 0.7398442029953003,
186
- "eval_runtime": 2.6618,
187
- "eval_samples_per_second": 58.606,
188
- "eval_steps_per_second": 3.005,
189
- "step": 600
190
- },
191
- {
192
- "epoch": 1.3,
193
- "grad_norm": 2.0741443634033203,
194
- "learning_rate": 1.955572805786141e-05,
195
- "loss": 0.4674,
196
- "step": 650
197
- },
198
- {
199
- "epoch": 1.3,
200
- "eval_loss": 0.7346900701522827,
201
- "eval_runtime": 2.2666,
202
- "eval_samples_per_second": 68.825,
203
- "eval_steps_per_second": 3.529,
204
- "step": 650
205
- },
206
- {
207
- "epoch": 1.4,
208
- "grad_norm": 1.9551373720169067,
209
- "learning_rate": 1.9396926207859085e-05,
210
- "loss": 0.4792,
211
- "step": 700
212
- },
213
- {
214
- "epoch": 1.4,
215
- "eval_loss": 0.7401903867721558,
216
- "eval_runtime": 2.2602,
217
- "eval_samples_per_second": 69.019,
218
- "eval_steps_per_second": 3.539,
219
- "step": 700
220
- },
221
- {
222
- "epoch": 1.5,
223
- "grad_norm": 1.8908940553665161,
224
- "learning_rate": 1.921476211870408e-05,
225
- "loss": 0.4717,
226
- "step": 750
227
- },
228
- {
229
- "epoch": 1.5,
230
- "eval_loss": 0.7299663424491882,
231
- "eval_runtime": 2.2563,
232
- "eval_samples_per_second": 69.14,
233
- "eval_steps_per_second": 3.546,
234
- "step": 750
235
- },
236
- {
237
- "epoch": 1.6,
238
- "grad_norm": 2.781102418899536,
239
- "learning_rate": 1.900968867902419e-05,
240
- "loss": 0.4765,
241
- "step": 800
242
- },
243
- {
244
- "epoch": 1.6,
245
- "eval_loss": 0.7422571182250977,
246
- "eval_runtime": 2.2558,
247
- "eval_samples_per_second": 69.156,
248
- "eval_steps_per_second": 3.546,
249
- "step": 800
250
- },
251
- {
252
- "epoch": 1.7,
253
- "grad_norm": 2.0381083488464355,
254
- "learning_rate": 1.8782215733702286e-05,
255
- "loss": 0.4982,
256
- "step": 850
257
- },
258
- {
259
- "epoch": 1.7,
260
- "eval_loss": 0.7189474701881409,
261
- "eval_runtime": 2.8175,
262
- "eval_samples_per_second": 55.369,
263
- "eval_steps_per_second": 2.839,
264
- "step": 850
265
- },
266
- {
267
- "epoch": 1.8,
268
- "grad_norm": 2.6000654697418213,
269
- "learning_rate": 1.8532908816321557e-05,
270
- "loss": 0.4798,
271
- "step": 900
272
- },
273
- {
274
- "epoch": 1.8,
275
- "eval_loss": 0.7210726141929626,
276
- "eval_runtime": 4.1738,
277
- "eval_samples_per_second": 37.376,
278
- "eval_steps_per_second": 1.917,
279
- "step": 900
280
- },
281
- {
282
- "epoch": 1.9,
283
- "grad_norm": 2.4208157062530518,
284
- "learning_rate": 1.826238774315995e-05,
285
- "loss": 0.4977,
286
- "step": 950
287
- },
288
- {
289
- "epoch": 1.9,
290
- "eval_loss": 0.7182486057281494,
291
- "eval_runtime": 3.4468,
292
- "eval_samples_per_second": 45.259,
293
- "eval_steps_per_second": 2.321,
294
- "step": 950
295
- },
296
- {
297
- "epoch": 2.0,
298
- "grad_norm": 2.4245738983154297,
299
- "learning_rate": 1.7971325072229227e-05,
300
- "loss": 0.4679,
301
- "step": 1000
302
- },
303
- {
304
- "epoch": 2.0,
305
- "eval_loss": 0.7213618755340576,
306
- "eval_runtime": 2.7343,
307
- "eval_samples_per_second": 57.053,
308
- "eval_steps_per_second": 2.926,
309
- "step": 1000
310
- },
311
- {
312
- "epoch": 2.1,
313
- "grad_norm": 2.163320302963257,
314
- "learning_rate": 1.766044443118978e-05,
315
- "loss": 0.2305,
316
- "step": 1050
317
- },
318
- {
319
- "epoch": 2.1,
320
- "eval_loss": 0.8159535527229309,
321
- "eval_runtime": 2.2585,
322
- "eval_samples_per_second": 69.074,
323
- "eval_steps_per_second": 3.542,
324
- "step": 1050
325
- },
326
- {
327
- "epoch": 2.2,
328
- "grad_norm": 1.630889654159546,
329
- "learning_rate": 1.7330518718298263e-05,
330
- "loss": 0.2266,
331
- "step": 1100
332
- },
333
- {
334
- "epoch": 2.2,
335
- "eval_loss": 0.8228777647018433,
336
- "eval_runtime": 2.295,
337
- "eval_samples_per_second": 67.975,
338
- "eval_steps_per_second": 3.486,
339
- "step": 1100
340
- },
341
- {
342
- "epoch": 2.3,
343
- "grad_norm": 2.260448455810547,
344
- "learning_rate": 1.698236818086073e-05,
345
- "loss": 0.2317,
346
- "step": 1150
347
- },
348
- {
349
- "epoch": 2.3,
350
- "eval_loss": 0.8190523982048035,
351
- "eval_runtime": 2.2571,
352
- "eval_samples_per_second": 69.114,
353
- "eval_steps_per_second": 3.544,
354
- "step": 1150
355
- },
356
- {
357
- "epoch": 2.4,
358
- "grad_norm": 1.3863478899002075,
359
- "learning_rate": 1.6616858375968596e-05,
360
- "loss": 0.2342,
361
- "step": 1200
362
- },
363
- {
364
- "epoch": 2.4,
365
- "eval_loss": 0.8243977427482605,
366
- "eval_runtime": 2.2587,
367
- "eval_samples_per_second": 69.067,
368
- "eval_steps_per_second": 3.542,
369
- "step": 1200
370
- },
371
- {
372
- "epoch": 2.5,
373
- "grad_norm": 2.200590133666992,
374
- "learning_rate": 1.6234898018587336e-05,
375
- "loss": 0.239,
376
- "step": 1250
377
- },
378
- {
379
- "epoch": 2.5,
380
- "eval_loss": 0.8106646537780762,
381
- "eval_runtime": 2.4225,
382
- "eval_samples_per_second": 64.395,
383
- "eval_steps_per_second": 3.302,
384
- "step": 1250
385
- },
386
- {
387
- "epoch": 2.6,
388
- "grad_norm": 1.6970518827438354,
389
- "learning_rate": 1.5837436722347902e-05,
390
- "loss": 0.2391,
391
- "step": 1300
392
- },
393
- {
394
- "epoch": 2.6,
395
- "eval_loss": 0.830450713634491,
396
- "eval_runtime": 3.5213,
397
- "eval_samples_per_second": 44.302,
398
- "eval_steps_per_second": 2.272,
399
- "step": 1300
400
- },
401
- {
402
- "epoch": 2.7,
403
- "grad_norm": 1.6799397468566895,
404
- "learning_rate": 1.5425462638657597e-05,
405
- "loss": 0.2348,
406
- "step": 1350
407
- },
408
- {
409
- "epoch": 2.7,
410
- "eval_loss": 0.8164975047111511,
411
- "eval_runtime": 4.3755,
412
- "eval_samples_per_second": 35.653,
413
- "eval_steps_per_second": 1.828,
414
- "step": 1350
415
- },
416
- {
417
- "epoch": 2.8,
418
- "grad_norm": 1.6129848957061768,
419
- "learning_rate": 1.5000000000000002e-05,
420
- "loss": 0.2383,
421
- "step": 1400
422
- },
423
- {
424
- "epoch": 2.8,
425
- "eval_loss": 0.8099900484085083,
426
- "eval_runtime": 4.1168,
427
- "eval_samples_per_second": 37.894,
428
- "eval_steps_per_second": 1.943,
429
- "step": 1400
430
- },
431
- {
432
- "epoch": 2.9,
433
- "grad_norm": 1.9729114770889282,
434
- "learning_rate": 1.4562106573531632e-05,
435
- "loss": 0.2357,
436
- "step": 1450
437
- },
438
- {
439
- "epoch": 2.9,
440
- "eval_loss": 0.8040891289710999,
441
- "eval_runtime": 2.2676,
442
- "eval_samples_per_second": 68.796,
443
- "eval_steps_per_second": 3.528,
444
- "step": 1450
445
- },
446
- {
447
- "epoch": 3.0,
448
- "grad_norm": 1.3552831411361694,
449
- "learning_rate": 1.4112871031306118e-05,
450
- "loss": 0.2418,
451
- "step": 1500
452
- },
453
- {
454
- "epoch": 3.0,
455
- "eval_loss": 0.7948845028877258,
456
- "eval_runtime": 2.2537,
457
- "eval_samples_per_second": 69.22,
458
- "eval_steps_per_second": 3.55,
459
- "step": 1500
460
- },
461
- {
462
- "epoch": 3.1,
463
- "grad_norm": 1.5692390203475952,
464
- "learning_rate": 1.3653410243663953e-05,
465
- "loss": 0.1423,
466
- "step": 1550
467
- },
468
- {
469
- "epoch": 3.1,
470
- "eval_loss": 0.8620080947875977,
471
- "eval_runtime": 2.2569,
472
- "eval_samples_per_second": 69.12,
473
- "eval_steps_per_second": 3.545,
474
- "step": 1550
475
- },
476
- {
477
- "epoch": 3.2,
478
- "grad_norm": 1.1803256273269653,
479
- "learning_rate": 1.3184866502516846e-05,
480
- "loss": 0.1409,
481
- "step": 1600
482
- },
483
- {
484
- "epoch": 3.2,
485
- "eval_loss": 0.8848384022712708,
486
- "eval_runtime": 2.2588,
487
- "eval_samples_per_second": 69.062,
488
- "eval_steps_per_second": 3.542,
489
- "step": 1600
490
- },
491
- {
492
- "epoch": 3.3,
493
- "grad_norm": 1.1109460592269897,
494
- "learning_rate": 1.2708404681430054e-05,
495
- "loss": 0.1496,
496
- "step": 1650
497
- },
498
- {
499
- "epoch": 3.3,
500
- "eval_loss": 0.8655369281768799,
501
- "eval_runtime": 4.4935,
502
- "eval_samples_per_second": 34.717,
503
- "eval_steps_per_second": 1.78,
504
- "step": 1650
505
- },
506
- {
507
- "epoch": 3.4,
508
- "grad_norm": 1.0697747468948364,
509
- "learning_rate": 1.2225209339563144e-05,
510
- "loss": 0.1491,
511
- "step": 1700
512
- },
513
- {
514
- "epoch": 3.4,
515
- "eval_loss": 0.8790720701217651,
516
- "eval_runtime": 2.9231,
517
- "eval_samples_per_second": 53.369,
518
- "eval_steps_per_second": 2.737,
519
- "step": 1700
520
- },
521
- {
522
- "epoch": 3.5,
523
- "grad_norm": 1.5176475048065186,
524
- "learning_rate": 1.1736481776669307e-05,
525
- "loss": 0.1475,
526
- "step": 1750
527
- },
528
- {
529
- "epoch": 3.5,
530
- "eval_loss": 0.8695181608200073,
531
- "eval_runtime": 2.8158,
532
- "eval_samples_per_second": 55.401,
533
- "eval_steps_per_second": 2.841,
534
- "step": 1750
535
- },
536
- {
537
- "epoch": 3.6,
538
- "grad_norm": 2.0232934951782227,
539
- "learning_rate": 1.1243437046474854e-05,
540
- "loss": 0.1481,
541
- "step": 1800
542
- },
543
- {
544
- "epoch": 3.6,
545
- "eval_loss": 0.8817379474639893,
546
- "eval_runtime": 3.4544,
547
- "eval_samples_per_second": 45.159,
548
- "eval_steps_per_second": 2.316,
549
- "step": 1800
550
- },
551
- {
552
- "epoch": 3.7,
553
- "grad_norm": 2.0155882835388184,
554
- "learning_rate": 1.0747300935864245e-05,
555
- "loss": 0.1514,
556
- "step": 1850
557
- },
558
- {
559
- "epoch": 3.7,
560
- "eval_loss": 0.8860240578651428,
561
- "eval_runtime": 2.2629,
562
- "eval_samples_per_second": 68.939,
563
  "eval_steps_per_second": 3.535,
564
- "step": 1850
565
- },
566
- {
567
- "epoch": 3.8,
568
- "grad_norm": 1.2712018489837646,
569
- "learning_rate": 1.0249306917380731e-05,
570
- "loss": 0.1421,
571
- "step": 1900
572
- },
573
- {
574
- "epoch": 3.8,
575
- "eval_loss": 0.8792973756790161,
576
- "eval_runtime": 2.2679,
577
- "eval_samples_per_second": 68.786,
578
- "eval_steps_per_second": 3.528,
579
- "step": 1900
580
- },
581
- {
582
- "epoch": 3.9,
583
- "grad_norm": 1.3542224168777466,
584
- "learning_rate": 9.750693082619274e-06,
585
- "loss": 0.1446,
586
- "step": 1950
587
- },
588
- {
589
- "epoch": 3.9,
590
- "eval_loss": 0.8789901733398438,
591
- "eval_runtime": 2.2568,
592
- "eval_samples_per_second": 69.126,
593
- "eval_steps_per_second": 3.545,
594
- "step": 1950
595
- },
596
- {
597
- "epoch": 4.0,
598
- "grad_norm": 1.1753352880477905,
599
- "learning_rate": 9.252699064135759e-06,
600
- "loss": 0.1458,
601
- "step": 2000
602
- },
603
- {
604
- "epoch": 4.0,
605
- "eval_loss": 0.8896375894546509,
606
- "eval_runtime": 2.2586,
607
- "eval_samples_per_second": 69.069,
608
- "eval_steps_per_second": 3.542,
609
- "step": 2000
610
- },
611
- {
612
- "epoch": 4.1,
613
- "grad_norm": 1.1664059162139893,
614
- "learning_rate": 8.756562953525151e-06,
615
- "loss": 0.0943,
616
- "step": 2050
617
- },
618
- {
619
- "epoch": 4.1,
620
- "eval_loss": 0.9453464150428772,
621
- "eval_runtime": 2.7602,
622
- "eval_samples_per_second": 56.517,
623
- "eval_steps_per_second": 2.898,
624
- "step": 2050
625
- },
626
- {
627
- "epoch": 4.2,
628
- "grad_norm": 0.8652101755142212,
629
- "learning_rate": 8.263518223330698e-06,
630
- "loss": 0.0936,
631
- "step": 2100
632
- },
633
- {
634
- "epoch": 4.2,
635
- "eval_loss": 0.9263865947723389,
636
- "eval_runtime": 2.7019,
637
- "eval_samples_per_second": 57.738,
638
- "eval_steps_per_second": 2.961,
639
- "step": 2100
640
- },
641
- {
642
- "epoch": 4.3,
643
- "grad_norm": 1.1194976568222046,
644
- "learning_rate": 7.774790660436857e-06,
645
- "loss": 0.0944,
646
- "step": 2150
647
- },
648
- {
649
- "epoch": 4.3,
650
- "eval_loss": 0.9188054203987122,
651
- "eval_runtime": 2.4797,
652
- "eval_samples_per_second": 62.91,
653
- "eval_steps_per_second": 3.226,
654
- "step": 2150
655
- },
656
- {
657
- "epoch": 4.4,
658
- "grad_norm": 0.8499676585197449,
659
- "learning_rate": 7.291595318569951e-06,
660
- "loss": 0.0946,
661
- "step": 2200
662
- },
663
- {
664
- "epoch": 4.4,
665
- "eval_loss": 0.9282132387161255,
666
- "eval_runtime": 4.1021,
667
- "eval_samples_per_second": 38.03,
668
- "eval_steps_per_second": 1.95,
669
- "step": 2200
670
- },
671
- {
672
- "epoch": 4.5,
673
- "grad_norm": 0.9869304299354553,
674
- "learning_rate": 6.815133497483157e-06,
675
- "loss": 0.0914,
676
- "step": 2250
677
- },
678
- {
679
- "epoch": 4.5,
680
- "eval_loss": 0.9421446323394775,
681
- "eval_runtime": 2.2639,
682
- "eval_samples_per_second": 68.908,
683
- "eval_steps_per_second": 3.534,
684
- "step": 2250
685
- },
686
- {
687
- "epoch": 4.6,
688
- "grad_norm": 0.7122153043746948,
689
- "learning_rate": 6.34658975633605e-06,
690
- "loss": 0.0927,
691
- "step": 2300
692
- },
693
- {
694
- "epoch": 4.6,
695
- "eval_loss": 0.9317699074745178,
696
- "eval_runtime": 2.2762,
697
- "eval_samples_per_second": 68.535,
698
- "eval_steps_per_second": 3.515,
699
- "step": 2300
700
- },
701
- {
702
- "epoch": 4.7,
703
- "grad_norm": 0.7879806756973267,
704
- "learning_rate": 5.887128968693887e-06,
705
- "loss": 0.0926,
706
- "step": 2350
707
- },
708
- {
709
- "epoch": 4.7,
710
- "eval_loss": 0.9379280209541321,
711
- "eval_runtime": 2.2587,
712
- "eval_samples_per_second": 69.065,
713
- "eval_steps_per_second": 3.542,
714
- "step": 2350
715
- },
716
- {
717
- "epoch": 4.8,
718
- "grad_norm": 0.7589277625083923,
719
- "learning_rate": 5.43789342646837e-06,
720
- "loss": 0.096,
721
- "step": 2400
722
- },
723
- {
724
- "epoch": 4.8,
725
- "eval_loss": 0.9322577118873596,
726
- "eval_runtime": 2.2566,
727
- "eval_samples_per_second": 69.13,
728
- "eval_steps_per_second": 3.545,
729
- "step": 2400
730
- },
731
- {
732
- "epoch": 4.9,
733
- "grad_norm": 1.010057806968689,
734
- "learning_rate": 5.000000000000003e-06,
735
- "loss": 0.093,
736
- "step": 2450
737
- },
738
- {
739
- "epoch": 4.9,
740
- "eval_loss": 0.9507681131362915,
741
- "eval_runtime": 2.3695,
742
- "eval_samples_per_second": 65.835,
743
- "eval_steps_per_second": 3.376,
744
- "step": 2450
745
- },
746
- {
747
- "epoch": 5.0,
748
- "grad_norm": 0.8985171914100647,
749
- "learning_rate": 4.5745373613424075e-06,
750
- "loss": 0.0923,
751
- "step": 2500
752
- },
753
- {
754
- "epoch": 5.0,
755
- "eval_loss": 0.9445481896400452,
756
- "eval_runtime": 2.455,
757
- "eval_samples_per_second": 63.545,
758
- "eval_steps_per_second": 3.259,
759
- "step": 2500
760
- },
761
- {
762
- "epoch": 5.1,
763
- "grad_norm": 0.7004356384277344,
764
- "learning_rate": 4.162563277652104e-06,
765
- "loss": 0.0534,
766
- "step": 2550
767
- },
768
- {
769
- "epoch": 5.1,
770
- "eval_loss": 1.019740343093872,
771
- "eval_runtime": 3.8097,
772
- "eval_samples_per_second": 40.949,
773
- "eval_steps_per_second": 2.1,
774
- "step": 2550
775
- },
776
- {
777
- "epoch": 5.2,
778
- "grad_norm": 1.2868136167526245,
779
- "learning_rate": 3.7651019814126656e-06,
780
- "loss": 0.0552,
781
- "step": 2600
782
- },
783
- {
784
- "epoch": 5.2,
785
- "eval_loss": 1.0226831436157227,
786
- "eval_runtime": 3.5746,
787
- "eval_samples_per_second": 43.641,
788
- "eval_steps_per_second": 2.238,
789
- "step": 2600
790
- },
791
- {
792
- "epoch": 5.3,
793
- "grad_norm": 1.0688791275024414,
794
- "learning_rate": 3.3831416240314085e-06,
795
- "loss": 0.0523,
796
- "step": 2650
797
- },
798
- {
799
- "epoch": 5.3,
800
- "eval_loss": 1.0200960636138916,
801
- "eval_runtime": 2.26,
802
- "eval_samples_per_second": 69.026,
803
- "eval_steps_per_second": 3.54,
804
- "step": 2650
805
- },
806
- {
807
- "epoch": 5.4,
808
- "grad_norm": 0.46982139348983765,
809
- "learning_rate": 3.017631819139273e-06,
810
- "loss": 0.0534,
811
- "step": 2700
812
- },
813
- {
814
- "epoch": 5.4,
815
- "eval_loss": 1.0137168169021606,
816
- "eval_runtime": 2.2646,
817
- "eval_samples_per_second": 68.885,
818
- "eval_steps_per_second": 3.533,
819
- "step": 2700
820
- },
821
- {
822
- "epoch": 5.5,
823
- "grad_norm": 0.4713106155395508,
824
- "learning_rate": 2.669481281701739e-06,
825
- "loss": 0.0541,
826
- "step": 2750
827
- },
828
- {
829
- "epoch": 5.5,
830
- "eval_loss": 1.0238293409347534,
831
- "eval_runtime": 2.2586,
832
- "eval_samples_per_second": 69.068,
833
- "eval_steps_per_second": 3.542,
834
- "step": 2750
835
- },
836
- {
837
- "epoch": 5.6,
838
- "grad_norm": 0.5023716688156128,
839
- "learning_rate": 2.339555568810221e-06,
840
- "loss": 0.0518,
841
- "step": 2800
842
- },
843
- {
844
- "epoch": 5.6,
845
- "eval_loss": 1.0314223766326904,
846
- "eval_runtime": 2.2586,
847
- "eval_samples_per_second": 69.069,
848
- "eval_steps_per_second": 3.542,
849
- "step": 2800
850
- },
851
- {
852
- "epoch": 5.7,
853
- "grad_norm": 0.9530413150787354,
854
- "learning_rate": 2.0286749277707783e-06,
855
- "loss": 0.053,
856
- "step": 2850
857
- },
858
- {
859
- "epoch": 5.7,
860
- "eval_loss": 1.0339645147323608,
861
- "eval_runtime": 3.1056,
862
- "eval_samples_per_second": 50.232,
863
- "eval_steps_per_second": 2.576,
864
- "step": 2850
865
- },
866
- {
867
- "epoch": 5.8,
868
- "grad_norm": 0.38149017095565796,
869
- "learning_rate": 1.7376122568400533e-06,
870
- "loss": 0.053,
871
- "step": 2900
872
- },
873
- {
874
- "epoch": 5.8,
875
- "eval_loss": 1.0364776849746704,
876
- "eval_runtime": 3.5002,
877
- "eval_samples_per_second": 44.569,
878
- "eval_steps_per_second": 2.286,
879
- "step": 2900
880
- },
881
- {
882
- "epoch": 5.9,
883
- "grad_norm": 0.5894300937652588,
884
- "learning_rate": 1.467091183678444e-06,
885
- "loss": 0.054,
886
- "step": 2950
887
- },
888
- {
889
- "epoch": 5.9,
890
- "eval_loss": 1.0346895456314087,
891
- "eval_runtime": 2.446,
892
- "eval_samples_per_second": 63.778,
893
- "eval_steps_per_second": 3.271,
894
- "step": 2950
895
- },
896
- {
897
- "epoch": 6.0,
898
- "grad_norm": 0.565856397151947,
899
- "learning_rate": 1.2177842662977136e-06,
900
- "loss": 0.0527,
901
- "step": 3000
902
- },
903
- {
904
- "epoch": 6.0,
905
- "eval_loss": 1.0345444679260254,
906
- "eval_runtime": 3.2958,
907
- "eval_samples_per_second": 47.333,
908
- "eval_steps_per_second": 2.427,
909
- "step": 3000
910
- },
911
- {
912
- "epoch": 6.1,
913
- "grad_norm": 0.31733566522598267,
914
- "learning_rate": 9.903113209758098e-07,
915
- "loss": 0.0437,
916
- "step": 3050
917
- },
918
- {
919
- "epoch": 6.1,
920
- "eval_loss": 1.053617238998413,
921
- "eval_runtime": 2.2676,
922
- "eval_samples_per_second": 68.795,
923
- "eval_steps_per_second": 3.528,
924
- "step": 3050
925
- },
926
- {
927
- "epoch": 6.2,
928
- "grad_norm": 0.4090058207511902,
929
- "learning_rate": 7.852378812959227e-07,
930
- "loss": 0.0435,
931
- "step": 3100
932
- },
933
- {
934
- "epoch": 6.2,
935
- "eval_loss": 1.0663079023361206,
936
- "eval_runtime": 2.2592,
937
- "eval_samples_per_second": 69.051,
938
- "eval_steps_per_second": 3.541,
939
- "step": 3100
940
- },
941
- {
942
- "epoch": 6.3,
943
- "grad_norm": 0.33440783619880676,
944
- "learning_rate": 6.030737921409169e-07,
945
- "loss": 0.0436,
946
- "step": 3150
947
- },
948
- {
949
- "epoch": 6.3,
950
- "eval_loss": 1.075048565864563,
951
- "eval_runtime": 2.2569,
952
- "eval_samples_per_second": 69.123,
953
- "eval_steps_per_second": 3.545,
954
- "step": 3150
955
- },
956
- {
957
- "epoch": 6.4,
958
- "grad_norm": 0.398806631565094,
959
- "learning_rate": 4.4427194213859216e-07,
960
- "loss": 0.0442,
961
- "step": 3200
962
- },
963
- {
964
- "epoch": 6.4,
965
- "eval_loss": 1.0798892974853516,
966
- "eval_runtime": 2.2538,
967
- "eval_samples_per_second": 69.216,
968
- "eval_steps_per_second": 3.55,
969
- "step": 3200
970
  }
971
  ],
972
  "logging_steps": 50,
973
- "max_steps": 3500,
974
  "num_input_tokens_seen": 0,
975
- "num_train_epochs": 7,
976
  "save_steps": 400,
977
- "total_flos": 1.5453085957829427e+17,
978
  "train_batch_size": 4,
979
  "trial_name": null,
980
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8,
5
  "eval_steps": 50,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.1,
13
+ "grad_norm": 10.53576374053955,
14
+ "learning_rate": 2.0000000000000003e-06,
15
+ "loss": 1.6618,
16
  "step": 50
17
  },
18
  {
19
  "epoch": 0.1,
20
+ "eval_loss": 0.7731789350509644,
21
+ "eval_runtime": 2.2494,
22
+ "eval_samples_per_second": 69.353,
23
+ "eval_steps_per_second": 3.557,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.2,
28
+ "grad_norm": 5.800010681152344,
29
+ "learning_rate": 4.000000000000001e-06,
30
+ "loss": 0.7629,
31
  "step": 100
32
  },
33
  {
34
  "epoch": 0.2,
35
+ "eval_loss": 0.6901325583457947,
36
+ "eval_runtime": 2.2539,
37
+ "eval_samples_per_second": 69.213,
38
+ "eval_steps_per_second": 3.549,
39
  "step": 100
40
  },
41
  {
42
  "epoch": 0.3,
43
+ "grad_norm": 4.960265636444092,
44
+ "learning_rate": 6e-06,
45
+ "loss": 0.7256,
46
  "step": 150
47
  },
48
  {
49
  "epoch": 0.3,
50
+ "eval_loss": 0.6716309785842896,
51
+ "eval_runtime": 2.2526,
52
+ "eval_samples_per_second": 69.254,
53
+ "eval_steps_per_second": 3.551,
54
  "step": 150
55
  },
56
  {
57
  "epoch": 0.4,
58
+ "grad_norm": 5.574848651885986,
59
+ "learning_rate": 8.000000000000001e-06,
60
+ "loss": 0.7243,
61
  "step": 200
62
  },
63
  {
64
  "epoch": 0.4,
65
+ "eval_loss": 0.6644517779350281,
66
+ "eval_runtime": 2.2546,
67
+ "eval_samples_per_second": 69.193,
68
+ "eval_steps_per_second": 3.548,
69
  "step": 200
70
  },
71
  {
72
  "epoch": 0.5,
73
+ "grad_norm": 3.0581891536712646,
74
+ "learning_rate": 1e-05,
75
+ "loss": 0.6918,
76
  "step": 250
77
  },
78
  {
79
  "epoch": 0.5,
80
+ "eval_loss": 0.6718080043792725,
81
+ "eval_runtime": 2.255,
82
+ "eval_samples_per_second": 69.18,
83
+ "eval_steps_per_second": 3.548,
84
  "step": 250
85
  },
86
  {
87
  "epoch": 0.6,
88
+ "grad_norm": 3.797400712966919,
89
+ "learning_rate": 1.2e-05,
90
+ "loss": 0.7433,
91
  "step": 300
92
  },
93
  {
94
  "epoch": 0.6,
95
+ "eval_loss": 0.67710280418396,
96
+ "eval_runtime": 2.2558,
97
+ "eval_samples_per_second": 69.155,
98
+ "eval_steps_per_second": 3.546,
99
  "step": 300
100
  },
101
  {
102
  "epoch": 0.7,
103
+ "grad_norm": 8.121636390686035,
104
+ "learning_rate": 1.4e-05,
105
+ "loss": 0.7523,
106
  "step": 350
107
  },
108
  {
109
  "epoch": 0.7,
110
+ "eval_loss": 0.680716335773468,
111
+ "eval_runtime": 2.2562,
112
+ "eval_samples_per_second": 69.144,
113
+ "eval_steps_per_second": 3.546,
114
  "step": 350
115
  },
116
  {
117
  "epoch": 0.8,
118
+ "grad_norm": 2.615454912185669,
119
+ "learning_rate": 1.6000000000000003e-05,
120
+ "loss": 0.7322,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.8,
125
+ "eval_loss": 0.6906686425209045,
126
+ "eval_runtime": 2.2633,
127
+ "eval_samples_per_second": 68.926,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  "eval_steps_per_second": 3.535,
129
+ "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  }
131
  ],
132
  "logging_steps": 50,
133
+ "max_steps": 5000,
134
  "num_input_tokens_seen": 0,
135
+ "num_train_epochs": 10,
136
  "save_steps": 400,
137
+ "total_flos": 1.9232917507014656e+16,
138
  "train_batch_size": 4,
139
  "trial_name": null,
140
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fba03d3f9d351b111880aa45bf13251dc9e458d52d5fa6d533828a6b6b0473ec
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:676922c8c88a048a8c76fcb1420a476fbd64daf190286d2f67fca7597684e8c6
3
  size 5176