yashcode00 commited on
Commit
e0cae81
·
1 Parent(s): 84b0181

yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor

Browse files
README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor](https://huggingface.co/yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.6214
21
- - Accuracy: 0.8911
22
 
23
  ## Model description
24
 
@@ -45,15 +45,23 @@ The following hyperparameters were used during training:
45
  - total_train_batch_size: 64
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: linear
48
- - num_epochs: 100
49
 
50
  ### Training results
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:-----:|:-----:|:---------------:|:--------:|
54
- | 0.065 | 26.4 | 5000 | 0.6983 | 0.8568 |
55
- | 0.0412 | 52.81 | 10000 | 0.5958 | 0.8762 |
56
- | 0.0173 | 79.21 | 15000 | 0.5708 | 0.8969 |
 
 
 
 
 
 
 
 
57
 
58
 
59
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor](https://huggingface.co/yashcode00/wav2vec2-large-xlsr-indian-language-classification-featureExtractor) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.3117
21
+ - Accuracy: 0.9323
22
 
23
  ## Model description
24
 
 
45
  - total_train_batch_size: 64
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: linear
48
+ - num_epochs: 60
49
 
50
  ### Training results
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:-----:|:-----:|:---------------:|:--------:|
54
+ | 0.0559 | 5.28 | 1000 | 0.3097 | 0.9191 |
55
+ | 0.047 | 10.56 | 2000 | 0.3482 | 0.9191 |
56
+ | 0.0402 | 15.84 | 3000 | 0.3890 | 0.9080 |
57
+ | 0.0328 | 21.12 | 4000 | 0.3746 | 0.9150 |
58
+ | 0.0189 | 26.4 | 5000 | 0.4274 | 0.9113 |
59
+ | 0.0187 | 31.68 | 6000 | 0.4131 | 0.9101 |
60
+ | 0.0203 | 36.96 | 7000 | 0.3643 | 0.9237 |
61
+ | 0.0147 | 42.24 | 8000 | 0.3574 | 0.9295 |
62
+ | 0.0148 | 47.52 | 9000 | 0.3653 | 0.9220 |
63
+ | 0.0137 | 52.81 | 10000 | 0.3257 | 0.9352 |
64
+ | 0.0174 | 58.09 | 11000 | 0.3097 | 0.9340 |
65
 
66
 
67
  ### Framework versions
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 99.8,
3
- "eval_accuracy": 0.8910890817642212,
4
- "eval_loss": 0.6214143633842468,
5
- "eval_runtime": 50.0096,
6
  "eval_samples": 2424,
7
- "eval_samples_per_second": 48.471,
8
- "eval_steps_per_second": 6.059,
9
- "total_flos": 3.6752439370752e+19,
10
- "train_loss": 0.05161126141825681,
11
- "train_runtime": 41136.7074,
12
  "train_samples": 12120,
13
- "train_samples_per_second": 29.463,
14
- "train_steps_per_second": 0.459
15
  }
 
1
  {
2
+ "epoch": 59.88,
3
+ "eval_accuracy": 0.9323432445526123,
4
+ "eval_loss": 0.3117374777793884,
5
+ "eval_runtime": 50.8983,
6
  "eval_samples": 2424,
7
+ "eval_samples_per_second": 47.624,
8
+ "eval_steps_per_second": 5.953,
9
+ "total_flos": 2.20514636224512e+19,
10
+ "train_loss": 0.030625741817122836,
11
+ "train_runtime": 25244.9182,
12
  "train_samples": 12120,
13
+ "train_samples_per_second": 28.806,
14
+ "train_steps_per_second": 0.449
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 99.8,
3
- "eval_accuracy": 0.8910890817642212,
4
- "eval_loss": 0.6214143633842468,
5
- "eval_runtime": 50.0096,
6
  "eval_samples": 2424,
7
- "eval_samples_per_second": 48.471,
8
- "eval_steps_per_second": 6.059
9
  }
 
1
  {
2
+ "epoch": 59.88,
3
+ "eval_accuracy": 0.9323432445526123,
4
+ "eval_loss": 0.3117374777793884,
5
+ "eval_runtime": 50.8983,
6
  "eval_samples": 2424,
7
+ "eval_samples_per_second": 47.624,
8
+ "eval_steps_per_second": 5.953
9
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9e822c5fb55993c9b4c6df450112755bab2c5d2e28b029c06fc99c29c716826
3
  size 1266146037
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41ca0eeca582f52ad187db0b010f7bf152237a85fc348fd2af50256d74874d7a
3
  size 1266146037
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 99.8,
3
- "total_flos": 3.6752439370752e+19,
4
- "train_loss": 0.05161126141825681,
5
- "train_runtime": 41136.7074,
6
  "train_samples": 12120,
7
- "train_samples_per_second": 29.463,
8
- "train_steps_per_second": 0.459
9
  }
 
1
  {
2
+ "epoch": 59.88,
3
+ "total_flos": 2.20514636224512e+19,
4
+ "train_loss": 0.030625741817122836,
5
+ "train_runtime": 25244.9182,
6
  "train_samples": 12120,
7
+ "train_samples_per_second": 28.806,
8
+ "train_steps_per_second": 0.449
9
  }
trainer_state.json CHANGED
@@ -1,1198 +1,814 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 99.8019801980198,
5
- "eval_steps": 5000,
6
- "global_step": 18900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.53,
13
- "learning_rate": 4.974074074074074e-05,
14
- "loss": 0.2238,
15
  "step": 100
16
  },
17
  {
18
  "epoch": 1.06,
19
- "learning_rate": 4.947883597883598e-05,
20
- "loss": 0.1891,
21
  "step": 200
22
  },
23
  {
24
  "epoch": 1.58,
25
- "learning_rate": 4.921428571428572e-05,
26
- "loss": 0.1687,
27
  "step": 300
28
  },
29
  {
30
  "epoch": 2.11,
31
- "learning_rate": 4.894973544973545e-05,
32
- "loss": 0.1855,
33
  "step": 400
34
  },
35
  {
36
  "epoch": 2.64,
37
- "learning_rate": 4.868518518518519e-05,
38
- "loss": 0.1481,
39
  "step": 500
40
  },
41
  {
42
  "epoch": 3.17,
43
- "learning_rate": 4.842063492063492e-05,
44
- "loss": 0.1691,
45
  "step": 600
46
  },
47
  {
48
  "epoch": 3.7,
49
- "learning_rate": 4.815608465608466e-05,
50
- "loss": 0.1415,
51
  "step": 700
52
  },
53
  {
54
  "epoch": 4.22,
55
- "learning_rate": 4.7891534391534393e-05,
56
- "loss": 0.1427,
57
  "step": 800
58
  },
59
  {
60
  "epoch": 4.75,
61
- "learning_rate": 4.762698412698413e-05,
62
- "loss": 0.1249,
63
  "step": 900
64
  },
65
  {
66
  "epoch": 5.28,
67
- "learning_rate": 4.7362433862433866e-05,
68
- "loss": 0.1405,
 
 
 
 
 
 
 
 
 
69
  "step": 1000
70
  },
71
  {
72
  "epoch": 5.81,
73
- "learning_rate": 4.70978835978836e-05,
74
- "loss": 0.138,
75
  "step": 1100
76
  },
77
  {
78
  "epoch": 6.34,
79
- "learning_rate": 4.683333333333334e-05,
80
- "loss": 0.1295,
81
  "step": 1200
82
  },
83
  {
84
  "epoch": 6.86,
85
- "learning_rate": 4.6568783068783074e-05,
86
- "loss": 0.1314,
87
  "step": 1300
88
  },
89
  {
90
  "epoch": 7.39,
91
- "learning_rate": 4.63042328042328e-05,
92
- "loss": 0.1092,
93
  "step": 1400
94
  },
95
  {
96
  "epoch": 7.92,
97
- "learning_rate": 4.6039682539682546e-05,
98
- "loss": 0.1169,
99
  "step": 1500
100
  },
101
  {
102
  "epoch": 8.45,
103
- "learning_rate": 4.5775132275132275e-05,
104
- "loss": 0.1195,
105
  "step": 1600
106
  },
107
  {
108
  "epoch": 8.98,
109
- "learning_rate": 4.551058201058201e-05,
110
- "loss": 0.1212,
111
  "step": 1700
112
  },
113
  {
114
  "epoch": 9.5,
115
- "learning_rate": 4.524603174603175e-05,
116
- "loss": 0.1057,
117
  "step": 1800
118
  },
119
  {
120
  "epoch": 10.03,
121
- "learning_rate": 4.4981481481481484e-05,
122
- "loss": 0.1199,
123
  "step": 1900
124
  },
125
  {
126
  "epoch": 10.56,
127
- "learning_rate": 4.471693121693122e-05,
128
- "loss": 0.1116,
 
 
 
 
 
 
 
 
 
129
  "step": 2000
130
  },
131
  {
132
  "epoch": 11.09,
133
- "learning_rate": 4.4452380952380956e-05,
134
- "loss": 0.0921,
135
  "step": 2100
136
  },
137
  {
138
  "epoch": 11.62,
139
- "learning_rate": 4.418783068783069e-05,
140
- "loss": 0.1057,
141
  "step": 2200
142
  },
143
  {
144
  "epoch": 12.15,
145
- "learning_rate": 4.392328042328043e-05,
146
- "loss": 0.1082,
147
  "step": 2300
148
  },
149
  {
150
  "epoch": 12.67,
151
- "learning_rate": 4.365873015873016e-05,
152
- "loss": 0.1045,
153
  "step": 2400
154
  },
155
  {
156
  "epoch": 13.2,
157
- "learning_rate": 4.33941798941799e-05,
158
- "loss": 0.0976,
159
  "step": 2500
160
  },
161
  {
162
  "epoch": 13.73,
163
- "learning_rate": 4.312962962962963e-05,
164
- "loss": 0.0971,
165
  "step": 2600
166
  },
167
  {
168
  "epoch": 14.26,
169
- "learning_rate": 4.286507936507937e-05,
170
- "loss": 0.0862,
171
  "step": 2700
172
  },
173
  {
174
  "epoch": 14.79,
175
- "learning_rate": 4.26005291005291e-05,
176
- "loss": 0.0883,
177
  "step": 2800
178
  },
179
  {
180
  "epoch": 15.31,
181
- "learning_rate": 4.233597883597884e-05,
182
- "loss": 0.1122,
183
  "step": 2900
184
  },
185
  {
186
  "epoch": 15.84,
187
- "learning_rate": 4.2071428571428574e-05,
188
- "loss": 0.0854,
 
 
 
 
 
 
 
 
 
189
  "step": 3000
190
  },
191
  {
192
  "epoch": 16.37,
193
- "learning_rate": 4.180687830687831e-05,
194
- "loss": 0.0942,
195
  "step": 3100
196
  },
197
  {
198
  "epoch": 16.9,
199
- "learning_rate": 4.1542328042328046e-05,
200
- "loss": 0.0888,
201
  "step": 3200
202
  },
203
  {
204
  "epoch": 17.43,
205
- "learning_rate": 4.127777777777778e-05,
206
- "loss": 0.0917,
207
  "step": 3300
208
  },
209
  {
210
  "epoch": 17.95,
211
- "learning_rate": 4.101322751322751e-05,
212
- "loss": 0.0928,
213
  "step": 3400
214
  },
215
  {
216
  "epoch": 18.48,
217
- "learning_rate": 4.0748677248677254e-05,
218
- "loss": 0.0851,
219
  "step": 3500
220
  },
221
  {
222
  "epoch": 19.01,
223
- "learning_rate": 4.0484126984126983e-05,
224
- "loss": 0.0849,
225
  "step": 3600
226
  },
227
  {
228
  "epoch": 19.54,
229
- "learning_rate": 4.0219576719576726e-05,
230
- "loss": 0.0829,
231
  "step": 3700
232
  },
233
  {
234
  "epoch": 20.07,
235
- "learning_rate": 3.9955026455026456e-05,
236
- "loss": 0.0709,
237
  "step": 3800
238
  },
239
  {
240
  "epoch": 20.59,
241
- "learning_rate": 3.969047619047619e-05,
242
- "loss": 0.069,
243
  "step": 3900
244
  },
245
  {
246
  "epoch": 21.12,
247
- "learning_rate": 3.942592592592593e-05,
248
- "loss": 0.0764,
 
 
 
 
 
 
 
 
 
249
  "step": 4000
250
  },
251
  {
252
  "epoch": 21.65,
253
- "learning_rate": 3.9161375661375664e-05,
254
- "loss": 0.0676,
255
  "step": 4100
256
  },
257
  {
258
  "epoch": 22.18,
259
- "learning_rate": 3.88968253968254e-05,
260
- "loss": 0.0802,
261
  "step": 4200
262
  },
263
  {
264
  "epoch": 22.71,
265
- "learning_rate": 3.8632275132275136e-05,
266
- "loss": 0.0651,
267
  "step": 4300
268
  },
269
  {
270
  "epoch": 23.23,
271
- "learning_rate": 3.837037037037037e-05,
272
- "loss": 0.0732,
273
  "step": 4400
274
  },
275
  {
276
  "epoch": 23.76,
277
- "learning_rate": 3.810582010582011e-05,
278
- "loss": 0.0617,
279
  "step": 4500
280
  },
281
  {
282
  "epoch": 24.29,
283
- "learning_rate": 3.7841269841269845e-05,
284
- "loss": 0.0696,
285
  "step": 4600
286
  },
287
  {
288
  "epoch": 24.82,
289
- "learning_rate": 3.7576719576719574e-05,
290
- "loss": 0.0739,
291
  "step": 4700
292
  },
293
  {
294
  "epoch": 25.35,
295
- "learning_rate": 3.731216931216932e-05,
296
- "loss": 0.0569,
297
  "step": 4800
298
  },
299
  {
300
  "epoch": 25.87,
301
- "learning_rate": 3.7047619047619047e-05,
302
- "loss": 0.0696,
303
  "step": 4900
304
  },
305
  {
306
  "epoch": 26.4,
307
- "learning_rate": 3.678306878306878e-05,
308
- "loss": 0.065,
309
  "step": 5000
310
  },
311
  {
312
  "epoch": 26.4,
313
- "eval_accuracy": 0.8568481802940369,
314
- "eval_loss": 0.6983007192611694,
315
- "eval_runtime": 50.8227,
316
- "eval_samples_per_second": 47.695,
317
- "eval_steps_per_second": 5.962,
318
  "step": 5000
319
  },
320
  {
321
  "epoch": 26.93,
322
- "learning_rate": 3.651851851851852e-05,
323
- "loss": 0.0652,
324
  "step": 5100
325
  },
326
  {
327
  "epoch": 27.46,
328
- "learning_rate": 3.6253968253968255e-05,
329
- "loss": 0.0586,
330
  "step": 5200
331
  },
332
  {
333
  "epoch": 27.99,
334
- "learning_rate": 3.598941798941799e-05,
335
- "loss": 0.0673,
336
  "step": 5300
337
  },
338
  {
339
  "epoch": 28.51,
340
- "learning_rate": 3.5727513227513235e-05,
341
- "loss": 0.0622,
342
  "step": 5400
343
  },
344
  {
345
  "epoch": 29.04,
346
- "learning_rate": 3.5462962962962964e-05,
347
- "loss": 0.0618,
348
  "step": 5500
349
  },
350
  {
351
  "epoch": 29.57,
352
- "learning_rate": 3.51984126984127e-05,
353
- "loss": 0.0655,
354
  "step": 5600
355
  },
356
  {
357
  "epoch": 30.1,
358
- "learning_rate": 3.4933862433862436e-05,
359
- "loss": 0.0625,
360
  "step": 5700
361
  },
362
  {
363
  "epoch": 30.63,
364
- "learning_rate": 3.466931216931217e-05,
365
- "loss": 0.0584,
366
  "step": 5800
367
  },
368
  {
369
  "epoch": 31.16,
370
- "learning_rate": 3.440476190476191e-05,
371
- "loss": 0.0649,
372
  "step": 5900
373
  },
374
  {
375
  "epoch": 31.68,
376
- "learning_rate": 3.4140211640211644e-05,
377
- "loss": 0.0546,
 
 
 
 
 
 
 
 
 
378
  "step": 6000
379
  },
380
  {
381
  "epoch": 32.21,
382
- "learning_rate": 3.387566137566138e-05,
383
- "loss": 0.0564,
384
  "step": 6100
385
  },
386
  {
387
  "epoch": 32.74,
388
- "learning_rate": 3.3611111111111116e-05,
389
- "loss": 0.0584,
390
  "step": 6200
391
  },
392
  {
393
  "epoch": 33.27,
394
- "learning_rate": 3.3346560846560846e-05,
395
- "loss": 0.0516,
396
  "step": 6300
397
  },
398
  {
399
  "epoch": 33.8,
400
- "learning_rate": 3.308201058201059e-05,
401
- "loss": 0.051,
402
  "step": 6400
403
  },
404
  {
405
  "epoch": 34.32,
406
- "learning_rate": 3.281746031746032e-05,
407
- "loss": 0.0534,
408
  "step": 6500
409
  },
410
  {
411
  "epoch": 34.85,
412
- "learning_rate": 3.2552910052910054e-05,
413
- "loss": 0.0498,
414
  "step": 6600
415
  },
416
  {
417
  "epoch": 35.38,
418
- "learning_rate": 3.228835978835979e-05,
419
- "loss": 0.0499,
420
  "step": 6700
421
  },
422
  {
423
  "epoch": 35.91,
424
- "learning_rate": 3.202380952380952e-05,
425
- "loss": 0.0566,
426
  "step": 6800
427
  },
428
  {
429
  "epoch": 36.44,
430
- "learning_rate": 3.175925925925926e-05,
431
- "loss": 0.0424,
432
  "step": 6900
433
  },
434
  {
435
  "epoch": 36.96,
436
- "learning_rate": 3.149470899470899e-05,
437
- "loss": 0.0531,
 
 
 
 
 
 
 
 
 
438
  "step": 7000
439
  },
440
  {
441
  "epoch": 37.49,
442
- "learning_rate": 3.1230158730158734e-05,
443
- "loss": 0.0563,
444
  "step": 7100
445
  },
446
  {
447
  "epoch": 38.02,
448
- "learning_rate": 3.0965608465608464e-05,
449
- "loss": 0.0475,
450
  "step": 7200
451
  },
452
  {
453
  "epoch": 38.55,
454
- "learning_rate": 3.07010582010582e-05,
455
- "loss": 0.0381,
456
  "step": 7300
457
  },
458
  {
459
  "epoch": 39.08,
460
- "learning_rate": 3.0436507936507936e-05,
461
- "loss": 0.0525,
462
  "step": 7400
463
  },
464
  {
465
  "epoch": 39.6,
466
- "learning_rate": 3.0171957671957672e-05,
467
- "loss": 0.058,
468
  "step": 7500
469
  },
470
  {
471
  "epoch": 40.13,
472
- "learning_rate": 2.9907407407407405e-05,
473
- "loss": 0.0422,
474
  "step": 7600
475
  },
476
  {
477
  "epoch": 40.66,
478
- "learning_rate": 2.9642857142857144e-05,
479
- "loss": 0.0401,
480
  "step": 7700
481
  },
482
  {
483
  "epoch": 41.19,
484
- "learning_rate": 2.9378306878306877e-05,
485
- "loss": 0.0434,
486
  "step": 7800
487
  },
488
  {
489
  "epoch": 41.72,
490
- "learning_rate": 2.9113756613756616e-05,
491
- "loss": 0.0407,
492
  "step": 7900
493
  },
494
  {
495
  "epoch": 42.24,
496
- "learning_rate": 2.884920634920635e-05,
497
- "loss": 0.048,
 
 
 
 
 
 
 
 
 
498
  "step": 8000
499
  },
500
  {
501
  "epoch": 42.77,
502
- "learning_rate": 2.8584656084656085e-05,
503
- "loss": 0.032,
504
  "step": 8100
505
  },
506
  {
507
  "epoch": 43.3,
508
- "learning_rate": 2.8320105820105818e-05,
509
- "loss": 0.0457,
510
  "step": 8200
511
  },
512
  {
513
  "epoch": 43.83,
514
- "learning_rate": 2.8055555555555557e-05,
515
- "loss": 0.0531,
516
  "step": 8300
517
  },
518
  {
519
  "epoch": 44.36,
520
- "learning_rate": 2.779100529100529e-05,
521
- "loss": 0.0443,
522
  "step": 8400
523
  },
524
  {
525
  "epoch": 44.88,
526
- "learning_rate": 2.752645502645503e-05,
527
- "loss": 0.0404,
528
  "step": 8500
529
  },
530
  {
531
  "epoch": 45.41,
532
- "learning_rate": 2.7261904761904762e-05,
533
- "loss": 0.037,
534
  "step": 8600
535
  },
536
  {
537
  "epoch": 45.94,
538
- "learning_rate": 2.6997354497354498e-05,
539
- "loss": 0.0461,
540
  "step": 8700
541
  },
542
  {
543
  "epoch": 46.47,
544
- "learning_rate": 2.673280423280423e-05,
545
- "loss": 0.0362,
546
  "step": 8800
547
  },
548
  {
549
  "epoch": 47.0,
550
- "learning_rate": 2.6470899470899475e-05,
551
- "loss": 0.0417,
552
  "step": 8900
553
  },
554
  {
555
  "epoch": 47.52,
556
- "learning_rate": 2.6206349206349207e-05,
557
- "loss": 0.0347,
 
 
 
 
 
 
 
 
 
558
  "step": 9000
559
  },
560
  {
561
  "epoch": 48.05,
562
- "learning_rate": 2.5941798941798943e-05,
563
- "loss": 0.0448,
564
  "step": 9100
565
  },
566
  {
567
  "epoch": 48.58,
568
- "learning_rate": 2.5677248677248676e-05,
569
- "loss": 0.0368,
570
  "step": 9200
571
  },
572
  {
573
  "epoch": 49.11,
574
- "learning_rate": 2.5412698412698415e-05,
575
- "loss": 0.0379,
576
  "step": 9300
577
  },
578
  {
579
  "epoch": 49.64,
580
- "learning_rate": 2.5148148148148148e-05,
581
- "loss": 0.0367,
582
  "step": 9400
583
  },
584
  {
585
  "epoch": 50.17,
586
- "learning_rate": 2.4883597883597884e-05,
587
- "loss": 0.0331,
588
  "step": 9500
589
  },
590
  {
591
  "epoch": 50.69,
592
- "learning_rate": 2.461904761904762e-05,
593
- "loss": 0.0301,
594
  "step": 9600
595
  },
596
  {
597
  "epoch": 51.22,
598
- "learning_rate": 2.4357142857142857e-05,
599
- "loss": 0.0325,
600
  "step": 9700
601
  },
602
  {
603
  "epoch": 51.75,
604
- "learning_rate": 2.4092592592592593e-05,
605
- "loss": 0.0397,
606
  "step": 9800
607
  },
608
  {
609
  "epoch": 52.28,
610
- "learning_rate": 2.382804232804233e-05,
611
- "loss": 0.0396,
612
  "step": 9900
613
  },
614
  {
615
  "epoch": 52.81,
616
- "learning_rate": 2.3563492063492065e-05,
617
- "loss": 0.0412,
618
  "step": 10000
619
  },
620
  {
621
  "epoch": 52.81,
622
- "eval_accuracy": 0.8762376308441162,
623
- "eval_loss": 0.5958317518234253,
624
- "eval_runtime": 50.4121,
625
- "eval_samples_per_second": 48.084,
626
- "eval_steps_per_second": 6.01,
627
  "step": 10000
628
  },
629
  {
630
  "epoch": 53.33,
631
- "learning_rate": 2.32989417989418e-05,
632
- "loss": 0.0427,
633
  "step": 10100
634
  },
635
  {
636
  "epoch": 53.86,
637
- "learning_rate": 2.3034391534391538e-05,
638
- "loss": 0.0317,
639
  "step": 10200
640
  },
641
  {
642
  "epoch": 54.39,
643
- "learning_rate": 2.276984126984127e-05,
644
- "loss": 0.0314,
645
  "step": 10300
646
  },
647
  {
648
  "epoch": 54.92,
649
- "learning_rate": 2.2505291005291006e-05,
650
- "loss": 0.0384,
651
  "step": 10400
652
  },
653
  {
654
  "epoch": 55.45,
655
- "learning_rate": 2.2240740740740743e-05,
656
- "loss": 0.0311,
657
  "step": 10500
658
  },
659
  {
660
  "epoch": 55.97,
661
- "learning_rate": 2.197619047619048e-05,
662
- "loss": 0.0315,
663
  "step": 10600
664
  },
665
  {
666
  "epoch": 56.5,
667
- "learning_rate": 2.1711640211640215e-05,
668
- "loss": 0.0263,
669
  "step": 10700
670
  },
671
  {
672
  "epoch": 57.03,
673
- "learning_rate": 2.1447089947089947e-05,
674
- "loss": 0.0319,
675
  "step": 10800
676
  },
677
  {
678
  "epoch": 57.56,
679
- "learning_rate": 2.1182539682539683e-05,
680
- "loss": 0.025,
681
  "step": 10900
682
  },
683
  {
684
  "epoch": 58.09,
685
- "learning_rate": 2.091798941798942e-05,
686
- "loss": 0.0323,
 
 
 
 
 
 
 
 
 
687
  "step": 11000
688
  },
689
  {
690
  "epoch": 58.61,
691
- "learning_rate": 2.0653439153439156e-05,
692
- "loss": 0.034,
693
  "step": 11100
694
  },
695
  {
696
  "epoch": 59.14,
697
- "learning_rate": 2.0388888888888892e-05,
698
- "loss": 0.0326,
699
  "step": 11200
700
  },
701
  {
702
  "epoch": 59.67,
703
- "learning_rate": 2.0124338624338628e-05,
704
- "loss": 0.0273,
705
  "step": 11300
706
  },
707
  {
708
- "epoch": 60.2,
709
- "learning_rate": 1.985978835978836e-05,
710
- "loss": 0.0261,
711
- "step": 11400
712
- },
713
- {
714
- "epoch": 60.73,
715
- "learning_rate": 1.9595238095238097e-05,
716
- "loss": 0.0297,
717
- "step": 11500
718
- },
719
- {
720
- "epoch": 61.25,
721
- "learning_rate": 1.9330687830687833e-05,
722
- "loss": 0.0375,
723
- "step": 11600
724
- },
725
- {
726
- "epoch": 61.78,
727
- "learning_rate": 1.906613756613757e-05,
728
- "loss": 0.0262,
729
- "step": 11700
730
- },
731
- {
732
- "epoch": 62.31,
733
- "learning_rate": 1.8801587301587305e-05,
734
- "loss": 0.0333,
735
- "step": 11800
736
- },
737
- {
738
- "epoch": 62.84,
739
- "learning_rate": 1.8537037037037037e-05,
740
- "loss": 0.025,
741
- "step": 11900
742
- },
743
- {
744
- "epoch": 63.37,
745
- "learning_rate": 1.8272486772486774e-05,
746
- "loss": 0.0245,
747
- "step": 12000
748
- },
749
- {
750
- "epoch": 63.89,
751
- "learning_rate": 1.800793650793651e-05,
752
- "loss": 0.0261,
753
- "step": 12100
754
- },
755
- {
756
- "epoch": 64.42,
757
- "learning_rate": 1.7743386243386246e-05,
758
- "loss": 0.0277,
759
- "step": 12200
760
- },
761
- {
762
- "epoch": 64.95,
763
- "learning_rate": 1.7478835978835982e-05,
764
- "loss": 0.0306,
765
- "step": 12300
766
- },
767
- {
768
- "epoch": 65.48,
769
- "learning_rate": 1.7214285714285715e-05,
770
- "loss": 0.0287,
771
- "step": 12400
772
- },
773
- {
774
- "epoch": 66.01,
775
- "learning_rate": 1.694973544973545e-05,
776
- "loss": 0.0222,
777
- "step": 12500
778
- },
779
- {
780
- "epoch": 66.53,
781
- "learning_rate": 1.6685185185185187e-05,
782
- "loss": 0.0302,
783
- "step": 12600
784
- },
785
- {
786
- "epoch": 67.06,
787
- "learning_rate": 1.6420634920634923e-05,
788
- "loss": 0.0252,
789
- "step": 12700
790
- },
791
- {
792
- "epoch": 67.59,
793
- "learning_rate": 1.615608465608466e-05,
794
- "loss": 0.0221,
795
- "step": 12800
796
- },
797
- {
798
- "epoch": 68.12,
799
- "learning_rate": 1.5891534391534395e-05,
800
- "loss": 0.0383,
801
- "step": 12900
802
- },
803
- {
804
- "epoch": 68.65,
805
- "learning_rate": 1.5626984126984128e-05,
806
- "loss": 0.0242,
807
- "step": 13000
808
- },
809
- {
810
- "epoch": 69.17,
811
- "learning_rate": 1.5362433862433864e-05,
812
- "loss": 0.0194,
813
- "step": 13100
814
- },
815
- {
816
- "epoch": 69.7,
817
- "learning_rate": 1.50978835978836e-05,
818
- "loss": 0.0245,
819
- "step": 13200
820
- },
821
- {
822
- "epoch": 70.23,
823
- "learning_rate": 1.4833333333333336e-05,
824
- "loss": 0.0263,
825
- "step": 13300
826
- },
827
- {
828
- "epoch": 70.76,
829
- "learning_rate": 1.456878306878307e-05,
830
- "loss": 0.0276,
831
- "step": 13400
832
- },
833
- {
834
- "epoch": 71.29,
835
- "learning_rate": 1.4304232804232806e-05,
836
- "loss": 0.0213,
837
- "step": 13500
838
- },
839
- {
840
- "epoch": 71.82,
841
- "learning_rate": 1.4042328042328043e-05,
842
- "loss": 0.0296,
843
- "step": 13600
844
- },
845
- {
846
- "epoch": 72.34,
847
- "learning_rate": 1.3777777777777778e-05,
848
- "loss": 0.0231,
849
- "step": 13700
850
- },
851
- {
852
- "epoch": 72.87,
853
- "learning_rate": 1.3513227513227514e-05,
854
- "loss": 0.0241,
855
- "step": 13800
856
- },
857
- {
858
- "epoch": 73.4,
859
- "learning_rate": 1.324867724867725e-05,
860
- "loss": 0.0283,
861
- "step": 13900
862
- },
863
- {
864
- "epoch": 73.93,
865
- "learning_rate": 1.2984126984126984e-05,
866
- "loss": 0.0231,
867
- "step": 14000
868
- },
869
- {
870
- "epoch": 74.46,
871
- "learning_rate": 1.271957671957672e-05,
872
- "loss": 0.0179,
873
- "step": 14100
874
- },
875
- {
876
- "epoch": 74.98,
877
- "learning_rate": 1.2455026455026456e-05,
878
- "loss": 0.0208,
879
- "step": 14200
880
- },
881
- {
882
- "epoch": 75.51,
883
- "learning_rate": 1.219047619047619e-05,
884
- "loss": 0.0277,
885
- "step": 14300
886
- },
887
- {
888
- "epoch": 76.04,
889
- "learning_rate": 1.1925925925925927e-05,
890
- "loss": 0.0195,
891
- "step": 14400
892
- },
893
- {
894
- "epoch": 76.57,
895
- "learning_rate": 1.1661375661375661e-05,
896
- "loss": 0.0188,
897
- "step": 14500
898
- },
899
- {
900
- "epoch": 77.1,
901
- "learning_rate": 1.1396825396825397e-05,
902
- "loss": 0.0184,
903
- "step": 14600
904
- },
905
- {
906
- "epoch": 77.62,
907
- "learning_rate": 1.1132275132275133e-05,
908
- "loss": 0.0165,
909
- "step": 14700
910
- },
911
- {
912
- "epoch": 78.15,
913
- "learning_rate": 1.0867724867724868e-05,
914
- "loss": 0.0245,
915
- "step": 14800
916
- },
917
- {
918
- "epoch": 78.68,
919
- "learning_rate": 1.0603174603174604e-05,
920
- "loss": 0.0331,
921
- "step": 14900
922
- },
923
- {
924
- "epoch": 79.21,
925
- "learning_rate": 1.033862433862434e-05,
926
- "loss": 0.0173,
927
- "step": 15000
928
- },
929
- {
930
- "epoch": 79.21,
931
- "eval_accuracy": 0.8968647122383118,
932
- "eval_loss": 0.5708499550819397,
933
- "eval_runtime": 49.6061,
934
- "eval_samples_per_second": 48.865,
935
- "eval_steps_per_second": 6.108,
936
- "step": 15000
937
- },
938
- {
939
- "epoch": 79.74,
940
- "learning_rate": 1.0074074074074074e-05,
941
- "loss": 0.0243,
942
- "step": 15100
943
- },
944
- {
945
- "epoch": 80.26,
946
- "learning_rate": 9.80952380952381e-06,
947
- "loss": 0.0203,
948
- "step": 15200
949
- },
950
- {
951
- "epoch": 80.79,
952
- "learning_rate": 9.544973544973545e-06,
953
- "loss": 0.018,
954
- "step": 15300
955
- },
956
- {
957
- "epoch": 81.32,
958
- "learning_rate": 9.280423280423281e-06,
959
- "loss": 0.0239,
960
- "step": 15400
961
- },
962
- {
963
- "epoch": 81.85,
964
- "learning_rate": 9.015873015873017e-06,
965
- "loss": 0.0176,
966
- "step": 15500
967
- },
968
- {
969
- "epoch": 82.38,
970
- "learning_rate": 8.751322751322751e-06,
971
- "loss": 0.0231,
972
- "step": 15600
973
- },
974
- {
975
- "epoch": 82.9,
976
- "learning_rate": 8.486772486772487e-06,
977
- "loss": 0.0181,
978
- "step": 15700
979
- },
980
- {
981
- "epoch": 83.43,
982
- "learning_rate": 8.222222222222223e-06,
983
- "loss": 0.0221,
984
- "step": 15800
985
- },
986
- {
987
- "epoch": 83.96,
988
- "learning_rate": 7.957671957671958e-06,
989
- "loss": 0.0132,
990
- "step": 15900
991
- },
992
- {
993
- "epoch": 84.49,
994
- "learning_rate": 7.693121693121694e-06,
995
- "loss": 0.0127,
996
- "step": 16000
997
- },
998
- {
999
- "epoch": 85.02,
1000
- "learning_rate": 7.428571428571429e-06,
1001
- "loss": 0.0178,
1002
- "step": 16100
1003
- },
1004
- {
1005
- "epoch": 85.54,
1006
- "learning_rate": 7.1640211640211644e-06,
1007
- "loss": 0.0176,
1008
- "step": 16200
1009
- },
1010
- {
1011
- "epoch": 86.07,
1012
- "learning_rate": 6.8994708994709e-06,
1013
- "loss": 0.0169,
1014
- "step": 16300
1015
- },
1016
- {
1017
- "epoch": 86.6,
1018
- "learning_rate": 6.634920634920636e-06,
1019
- "loss": 0.0163,
1020
- "step": 16400
1021
- },
1022
- {
1023
- "epoch": 87.13,
1024
- "learning_rate": 6.370370370370371e-06,
1025
- "loss": 0.015,
1026
- "step": 16500
1027
- },
1028
- {
1029
- "epoch": 87.66,
1030
- "learning_rate": 6.105820105820106e-06,
1031
- "loss": 0.022,
1032
- "step": 16600
1033
- },
1034
- {
1035
- "epoch": 88.18,
1036
- "learning_rate": 5.841269841269842e-06,
1037
- "loss": 0.0115,
1038
- "step": 16700
1039
- },
1040
- {
1041
- "epoch": 88.71,
1042
- "learning_rate": 5.576719576719577e-06,
1043
- "loss": 0.0148,
1044
- "step": 16800
1045
- },
1046
- {
1047
- "epoch": 89.24,
1048
- "learning_rate": 5.312169312169312e-06,
1049
- "loss": 0.0153,
1050
- "step": 16900
1051
- },
1052
- {
1053
- "epoch": 89.77,
1054
- "learning_rate": 5.047619047619047e-06,
1055
- "loss": 0.0194,
1056
- "step": 17000
1057
- },
1058
- {
1059
- "epoch": 90.3,
1060
- "learning_rate": 4.783068783068783e-06,
1061
- "loss": 0.0165,
1062
- "step": 17100
1063
- },
1064
- {
1065
- "epoch": 90.83,
1066
- "learning_rate": 4.5185185185185185e-06,
1067
- "loss": 0.013,
1068
- "step": 17200
1069
- },
1070
- {
1071
- "epoch": 91.35,
1072
- "learning_rate": 4.253968253968254e-06,
1073
- "loss": 0.0212,
1074
- "step": 17300
1075
- },
1076
- {
1077
- "epoch": 91.88,
1078
- "learning_rate": 3.989417989417989e-06,
1079
- "loss": 0.0192,
1080
- "step": 17400
1081
- },
1082
- {
1083
- "epoch": 92.41,
1084
- "learning_rate": 3.7248677248677246e-06,
1085
- "loss": 0.0152,
1086
- "step": 17500
1087
- },
1088
- {
1089
- "epoch": 92.94,
1090
- "learning_rate": 3.4603174603174603e-06,
1091
- "loss": 0.02,
1092
- "step": 17600
1093
- },
1094
- {
1095
- "epoch": 93.47,
1096
- "learning_rate": 3.1957671957671955e-06,
1097
- "loss": 0.0089,
1098
- "step": 17700
1099
- },
1100
- {
1101
- "epoch": 93.99,
1102
- "learning_rate": 2.9312169312169316e-06,
1103
- "loss": 0.0124,
1104
- "step": 17800
1105
- },
1106
- {
1107
- "epoch": 94.52,
1108
- "learning_rate": 2.666666666666667e-06,
1109
- "loss": 0.019,
1110
- "step": 17900
1111
- },
1112
- {
1113
- "epoch": 95.05,
1114
- "learning_rate": 2.402116402116402e-06,
1115
- "loss": 0.0151,
1116
- "step": 18000
1117
- },
1118
- {
1119
- "epoch": 95.58,
1120
- "learning_rate": 2.1375661375661377e-06,
1121
- "loss": 0.0184,
1122
- "step": 18100
1123
- },
1124
- {
1125
- "epoch": 96.11,
1126
- "learning_rate": 1.873015873015873e-06,
1127
- "loss": 0.0146,
1128
- "step": 18200
1129
- },
1130
- {
1131
- "epoch": 96.63,
1132
- "learning_rate": 1.6084656084656084e-06,
1133
- "loss": 0.0227,
1134
- "step": 18300
1135
- },
1136
- {
1137
- "epoch": 97.16,
1138
- "learning_rate": 1.343915343915344e-06,
1139
- "loss": 0.0149,
1140
- "step": 18400
1141
- },
1142
- {
1143
- "epoch": 97.69,
1144
- "learning_rate": 1.0793650793650795e-06,
1145
- "loss": 0.015,
1146
- "step": 18500
1147
- },
1148
- {
1149
- "epoch": 98.22,
1150
- "learning_rate": 8.148148148148147e-07,
1151
- "loss": 0.0166,
1152
- "step": 18600
1153
- },
1154
- {
1155
- "epoch": 98.75,
1156
- "learning_rate": 5.502645502645503e-07,
1157
- "loss": 0.0167,
1158
- "step": 18700
1159
- },
1160
- {
1161
- "epoch": 99.27,
1162
- "learning_rate": 2.8571428571428575e-07,
1163
- "loss": 0.0152,
1164
- "step": 18800
1165
- },
1166
- {
1167
- "epoch": 99.8,
1168
- "learning_rate": 2.1164021164021167e-08,
1169
- "loss": 0.0164,
1170
- "step": 18900
1171
- },
1172
- {
1173
- "epoch": 99.8,
1174
- "step": 18900,
1175
- "total_flos": 3.6752439370752e+19,
1176
- "train_loss": 0.05161126141825681,
1177
- "train_runtime": 41136.7074,
1178
- "train_samples_per_second": 29.463,
1179
- "train_steps_per_second": 0.459
1180
  },
1181
  {
1182
- "epoch": 99.8,
1183
- "eval_accuracy": 0.8910890817642212,
1184
- "eval_loss": 0.6214143633842468,
1185
- "eval_runtime": 50.0096,
1186
- "eval_samples_per_second": 48.471,
1187
- "eval_steps_per_second": 6.059,
1188
- "step": 18900
1189
  }
1190
  ],
1191
  "logging_steps": 100,
1192
- "max_steps": 18900,
1193
- "num_train_epochs": 100,
1194
  "save_steps": 2000,
1195
- "total_flos": 3.6752439370752e+19,
1196
  "trial_name": null,
1197
  "trial_params": null
1198
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 59.881188118811885,
5
+ "eval_steps": 1000,
6
+ "global_step": 11340,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.53,
13
+ "learning_rate": 4.956349206349207e-05,
14
+ "loss": 0.0798,
15
  "step": 100
16
  },
17
  {
18
  "epoch": 1.06,
19
+ "learning_rate": 4.912257495590829e-05,
20
+ "loss": 0.0816,
21
  "step": 200
22
  },
23
  {
24
  "epoch": 1.58,
25
+ "learning_rate": 4.868165784832452e-05,
26
+ "loss": 0.0761,
27
  "step": 300
28
  },
29
  {
30
  "epoch": 2.11,
31
+ "learning_rate": 4.824514991181658e-05,
32
+ "loss": 0.0723,
33
  "step": 400
34
  },
35
  {
36
  "epoch": 2.64,
37
+ "learning_rate": 4.7804232804232806e-05,
38
+ "loss": 0.0601,
39
  "step": 500
40
  },
41
  {
42
  "epoch": 3.17,
43
+ "learning_rate": 4.736331569664903e-05,
44
+ "loss": 0.0593,
45
  "step": 600
46
  },
47
  {
48
  "epoch": 3.7,
49
+ "learning_rate": 4.692239858906526e-05,
50
+ "loss": 0.0692,
51
  "step": 700
52
  },
53
  {
54
  "epoch": 4.22,
55
+ "learning_rate": 4.648148148148148e-05,
56
+ "loss": 0.0587,
57
  "step": 800
58
  },
59
  {
60
  "epoch": 4.75,
61
+ "learning_rate": 4.604056437389771e-05,
62
+ "loss": 0.0598,
63
  "step": 900
64
  },
65
  {
66
  "epoch": 5.28,
67
+ "learning_rate": 4.559964726631393e-05,
68
+ "loss": 0.0559,
69
+ "step": 1000
70
+ },
71
+ {
72
+ "epoch": 5.28,
73
+ "eval_accuracy": 0.9191418886184692,
74
+ "eval_loss": 0.3096904754638672,
75
+ "eval_runtime": 51.3243,
76
+ "eval_samples_per_second": 47.229,
77
+ "eval_steps_per_second": 5.904,
78
  "step": 1000
79
  },
80
  {
81
  "epoch": 5.81,
82
+ "learning_rate": 4.515873015873016e-05,
83
+ "loss": 0.0552,
84
  "step": 1100
85
  },
86
  {
87
  "epoch": 6.34,
88
+ "learning_rate": 4.471781305114639e-05,
89
+ "loss": 0.0519,
90
  "step": 1200
91
  },
92
  {
93
  "epoch": 6.86,
94
+ "learning_rate": 4.428130511463845e-05,
95
+ "loss": 0.0536,
96
  "step": 1300
97
  },
98
  {
99
  "epoch": 7.39,
100
+ "learning_rate": 4.3840388007054675e-05,
101
+ "loss": 0.0438,
102
  "step": 1400
103
  },
104
  {
105
  "epoch": 7.92,
106
+ "learning_rate": 4.33994708994709e-05,
107
+ "loss": 0.0436,
108
  "step": 1500
109
  },
110
  {
111
  "epoch": 8.45,
112
+ "learning_rate": 4.295855379188713e-05,
113
+ "loss": 0.0447,
114
  "step": 1600
115
  },
116
  {
117
  "epoch": 8.98,
118
+ "learning_rate": 4.2517636684303355e-05,
119
+ "loss": 0.0535,
120
  "step": 1700
121
  },
122
  {
123
  "epoch": 9.5,
124
+ "learning_rate": 4.207671957671958e-05,
125
+ "loss": 0.0467,
126
  "step": 1800
127
  },
128
  {
129
  "epoch": 10.03,
130
+ "learning_rate": 4.16358024691358e-05,
131
+ "loss": 0.0557,
132
  "step": 1900
133
  },
134
  {
135
  "epoch": 10.56,
136
+ "learning_rate": 4.1194885361552036e-05,
137
+ "loss": 0.047,
138
+ "step": 2000
139
+ },
140
+ {
141
+ "epoch": 10.56,
142
+ "eval_accuracy": 0.9191418886184692,
143
+ "eval_loss": 0.34823155403137207,
144
+ "eval_runtime": 51.4649,
145
+ "eval_samples_per_second": 47.1,
146
+ "eval_steps_per_second": 5.888,
147
  "step": 2000
148
  },
149
  {
150
  "epoch": 11.09,
151
+ "learning_rate": 4.0753968253968256e-05,
152
+ "loss": 0.0397,
153
  "step": 2100
154
  },
155
  {
156
  "epoch": 11.62,
157
+ "learning_rate": 4.031305114638448e-05,
158
+ "loss": 0.0475,
159
  "step": 2200
160
  },
161
  {
162
  "epoch": 12.15,
163
+ "learning_rate": 3.987213403880071e-05,
164
+ "loss": 0.0478,
165
  "step": 2300
166
  },
167
  {
168
  "epoch": 12.67,
169
+ "learning_rate": 3.9431216931216936e-05,
170
+ "loss": 0.0421,
171
  "step": 2400
172
  },
173
  {
174
  "epoch": 13.2,
175
+ "learning_rate": 3.8994708994709e-05,
176
+ "loss": 0.0408,
177
  "step": 2500
178
  },
179
  {
180
  "epoch": 13.73,
181
+ "learning_rate": 3.8553791887125224e-05,
182
+ "loss": 0.0368,
183
  "step": 2600
184
  },
185
  {
186
  "epoch": 14.26,
187
+ "learning_rate": 3.8112874779541445e-05,
188
+ "loss": 0.0379,
189
  "step": 2700
190
  },
191
  {
192
  "epoch": 14.79,
193
+ "learning_rate": 3.767195767195768e-05,
194
+ "loss": 0.052,
195
  "step": 2800
196
  },
197
  {
198
  "epoch": 15.31,
199
+ "learning_rate": 3.72310405643739e-05,
200
+ "loss": 0.0424,
201
  "step": 2900
202
  },
203
  {
204
  "epoch": 15.84,
205
+ "learning_rate": 3.6790123456790125e-05,
206
+ "loss": 0.0402,
207
+ "step": 3000
208
+ },
209
+ {
210
+ "epoch": 15.84,
211
+ "eval_accuracy": 0.9080032706260681,
212
+ "eval_loss": 0.3889801502227783,
213
+ "eval_runtime": 50.7451,
214
+ "eval_samples_per_second": 47.768,
215
+ "eval_steps_per_second": 5.971,
216
  "step": 3000
217
  },
218
  {
219
  "epoch": 16.37,
220
+ "learning_rate": 3.634920634920635e-05,
221
+ "loss": 0.0417,
222
  "step": 3100
223
  },
224
  {
225
  "epoch": 16.9,
226
+ "learning_rate": 3.590828924162258e-05,
227
+ "loss": 0.0419,
228
  "step": 3200
229
  },
230
  {
231
  "epoch": 17.43,
232
+ "learning_rate": 3.54673721340388e-05,
233
+ "loss": 0.0439,
234
  "step": 3300
235
  },
236
  {
237
  "epoch": 17.95,
238
+ "learning_rate": 3.502645502645503e-05,
239
+ "loss": 0.0446,
240
  "step": 3400
241
  },
242
  {
243
  "epoch": 18.48,
244
+ "learning_rate": 3.458553791887125e-05,
245
+ "loss": 0.0339,
246
  "step": 3500
247
  },
248
  {
249
  "epoch": 19.01,
250
+ "learning_rate": 3.414462081128748e-05,
251
+ "loss": 0.0301,
252
  "step": 3600
253
  },
254
  {
255
  "epoch": 19.54,
256
+ "learning_rate": 3.3703703703703706e-05,
257
+ "loss": 0.0277,
258
  "step": 3700
259
  },
260
  {
261
  "epoch": 20.07,
262
+ "learning_rate": 3.326278659611993e-05,
263
+ "loss": 0.0333,
264
  "step": 3800
265
  },
266
  {
267
  "epoch": 20.59,
268
+ "learning_rate": 3.282186948853615e-05,
269
+ "loss": 0.0298,
270
  "step": 3900
271
  },
272
  {
273
  "epoch": 21.12,
274
+ "learning_rate": 3.2380952380952386e-05,
275
+ "loss": 0.0328,
276
+ "step": 4000
277
+ },
278
+ {
279
+ "epoch": 21.12,
280
+ "eval_accuracy": 0.9150164723396301,
281
+ "eval_loss": 0.37457939982414246,
282
+ "eval_runtime": 50.3165,
283
+ "eval_samples_per_second": 48.175,
284
+ "eval_steps_per_second": 6.022,
285
  "step": 4000
286
  },
287
  {
288
  "epoch": 21.65,
289
+ "learning_rate": 3.1940035273368606e-05,
290
+ "loss": 0.0298,
291
  "step": 4100
292
  },
293
  {
294
  "epoch": 22.18,
295
+ "learning_rate": 3.149911816578483e-05,
296
+ "loss": 0.0314,
297
  "step": 4200
298
  },
299
  {
300
  "epoch": 22.71,
301
+ "learning_rate": 3.105820105820106e-05,
302
+ "loss": 0.0253,
303
  "step": 4300
304
  },
305
  {
306
  "epoch": 23.23,
307
+ "learning_rate": 3.061728395061729e-05,
308
+ "loss": 0.0339,
309
  "step": 4400
310
  },
311
  {
312
  "epoch": 23.76,
313
+ "learning_rate": 3.017636684303351e-05,
314
+ "loss": 0.0266,
315
  "step": 4500
316
  },
317
  {
318
  "epoch": 24.29,
319
+ "learning_rate": 2.973544973544974e-05,
320
+ "loss": 0.0361,
321
  "step": 4600
322
  },
323
  {
324
  "epoch": 24.82,
325
+ "learning_rate": 2.929453262786596e-05,
326
+ "loss": 0.0305,
327
  "step": 4700
328
  },
329
  {
330
  "epoch": 25.35,
331
+ "learning_rate": 2.885361552028219e-05,
332
+ "loss": 0.0294,
333
  "step": 4800
334
  },
335
  {
336
  "epoch": 25.87,
337
+ "learning_rate": 2.8412698412698414e-05,
338
+ "loss": 0.0339,
339
  "step": 4900
340
  },
341
  {
342
  "epoch": 26.4,
343
+ "learning_rate": 2.797178130511464e-05,
344
+ "loss": 0.0189,
345
  "step": 5000
346
  },
347
  {
348
  "epoch": 26.4,
349
+ "eval_accuracy": 0.9113036394119263,
350
+ "eval_loss": 0.42735978960990906,
351
+ "eval_runtime": 49.4145,
352
+ "eval_samples_per_second": 49.054,
353
+ "eval_steps_per_second": 6.132,
354
  "step": 5000
355
  },
356
  {
357
  "epoch": 26.93,
358
+ "learning_rate": 2.7530864197530864e-05,
359
+ "loss": 0.0285,
360
  "step": 5100
361
  },
362
  {
363
  "epoch": 27.46,
364
+ "learning_rate": 2.7089947089947094e-05,
365
+ "loss": 0.0307,
366
  "step": 5200
367
  },
368
  {
369
  "epoch": 27.99,
370
+ "learning_rate": 2.6649029982363318e-05,
371
+ "loss": 0.0269,
372
  "step": 5300
373
  },
374
  {
375
  "epoch": 28.51,
376
+ "learning_rate": 2.6208112874779544e-05,
377
+ "loss": 0.0292,
378
  "step": 5400
379
  },
380
  {
381
  "epoch": 29.04,
382
+ "learning_rate": 2.5767195767195768e-05,
383
+ "loss": 0.032,
384
  "step": 5500
385
  },
386
  {
387
  "epoch": 29.57,
388
+ "learning_rate": 2.5326278659611995e-05,
389
+ "loss": 0.0297,
390
  "step": 5600
391
  },
392
  {
393
  "epoch": 30.1,
394
+ "learning_rate": 2.4885361552028218e-05,
395
+ "loss": 0.027,
396
  "step": 5700
397
  },
398
  {
399
  "epoch": 30.63,
400
+ "learning_rate": 2.4444444444444445e-05,
401
+ "loss": 0.0258,
402
  "step": 5800
403
  },
404
  {
405
  "epoch": 31.16,
406
+ "learning_rate": 2.4003527336860672e-05,
407
+ "loss": 0.0237,
408
  "step": 5900
409
  },
410
  {
411
  "epoch": 31.68,
412
+ "learning_rate": 2.3562610229276895e-05,
413
+ "loss": 0.0187,
414
+ "step": 6000
415
+ },
416
+ {
417
+ "epoch": 31.68,
418
+ "eval_accuracy": 0.9100660085678101,
419
+ "eval_loss": 0.4131234884262085,
420
+ "eval_runtime": 49.5018,
421
+ "eval_samples_per_second": 48.968,
422
+ "eval_steps_per_second": 6.121,
423
  "step": 6000
424
  },
425
  {
426
  "epoch": 32.21,
427
+ "learning_rate": 2.3121693121693122e-05,
428
+ "loss": 0.0255,
429
  "step": 6100
430
  },
431
  {
432
  "epoch": 32.74,
433
+ "learning_rate": 2.268077601410935e-05,
434
+ "loss": 0.0244,
435
  "step": 6200
436
  },
437
  {
438
  "epoch": 33.27,
439
+ "learning_rate": 2.2239858906525572e-05,
440
+ "loss": 0.0273,
441
  "step": 6300
442
  },
443
  {
444
  "epoch": 33.8,
445
+ "learning_rate": 2.17989417989418e-05,
446
+ "loss": 0.0214,
447
  "step": 6400
448
  },
449
  {
450
  "epoch": 34.32,
451
+ "learning_rate": 2.1358024691358026e-05,
452
+ "loss": 0.0265,
453
  "step": 6500
454
  },
455
  {
456
  "epoch": 34.85,
457
+ "learning_rate": 2.091710758377425e-05,
458
+ "loss": 0.0276,
459
  "step": 6600
460
  },
461
  {
462
  "epoch": 35.38,
463
+ "learning_rate": 2.0476190476190476e-05,
464
+ "loss": 0.0153,
465
  "step": 6700
466
  },
467
  {
468
  "epoch": 35.91,
469
+ "learning_rate": 2.0035273368606703e-05,
470
+ "loss": 0.0246,
471
  "step": 6800
472
  },
473
  {
474
  "epoch": 36.44,
475
+ "learning_rate": 1.959435626102293e-05,
476
+ "loss": 0.0266,
477
  "step": 6900
478
  },
479
  {
480
  "epoch": 36.96,
481
+ "learning_rate": 1.9153439153439153e-05,
482
+ "loss": 0.0203,
483
+ "step": 7000
484
+ },
485
+ {
486
+ "epoch": 36.96,
487
+ "eval_accuracy": 0.9236798882484436,
488
+ "eval_loss": 0.3643423020839691,
489
+ "eval_runtime": 49.3507,
490
+ "eval_samples_per_second": 49.118,
491
+ "eval_steps_per_second": 6.14,
492
  "step": 7000
493
  },
494
  {
495
  "epoch": 37.49,
496
+ "learning_rate": 1.871252204585538e-05,
497
+ "loss": 0.0225,
498
  "step": 7100
499
  },
500
  {
501
  "epoch": 38.02,
502
+ "learning_rate": 1.8271604938271607e-05,
503
+ "loss": 0.0296,
504
  "step": 7200
505
  },
506
  {
507
  "epoch": 38.55,
508
+ "learning_rate": 1.783068783068783e-05,
509
+ "loss": 0.0181,
510
  "step": 7300
511
  },
512
  {
513
  "epoch": 39.08,
514
+ "learning_rate": 1.7389770723104057e-05,
515
+ "loss": 0.0184,
516
  "step": 7400
517
  },
518
  {
519
  "epoch": 39.6,
520
+ "learning_rate": 1.6948853615520284e-05,
521
+ "loss": 0.0191,
522
  "step": 7500
523
  },
524
  {
525
  "epoch": 40.13,
526
+ "learning_rate": 1.6507936507936507e-05,
527
+ "loss": 0.0224,
528
  "step": 7600
529
  },
530
  {
531
  "epoch": 40.66,
532
+ "learning_rate": 1.6067019400352734e-05,
533
+ "loss": 0.0161,
534
  "step": 7700
535
  },
536
  {
537
  "epoch": 41.19,
538
+ "learning_rate": 1.562610229276896e-05,
539
+ "loss": 0.0211,
540
  "step": 7800
541
  },
542
  {
543
  "epoch": 41.72,
544
+ "learning_rate": 1.5185185185185186e-05,
545
+ "loss": 0.0165,
546
  "step": 7900
547
  },
548
  {
549
  "epoch": 42.24,
550
+ "learning_rate": 1.4744268077601411e-05,
551
+ "loss": 0.0147,
552
+ "step": 8000
553
+ },
554
+ {
555
+ "epoch": 42.24,
556
+ "eval_accuracy": 0.9294554591178894,
557
+ "eval_loss": 0.3574332892894745,
558
+ "eval_runtime": 49.7962,
559
+ "eval_samples_per_second": 48.678,
560
+ "eval_steps_per_second": 6.085,
561
  "step": 8000
562
  },
563
  {
564
  "epoch": 42.77,
565
+ "learning_rate": 1.4303350970017638e-05,
566
+ "loss": 0.0235,
567
  "step": 8100
568
  },
569
  {
570
  "epoch": 43.3,
571
+ "learning_rate": 1.3862433862433863e-05,
572
+ "loss": 0.0207,
573
  "step": 8200
574
  },
575
  {
576
  "epoch": 43.83,
577
+ "learning_rate": 1.3421516754850088e-05,
578
+ "loss": 0.0139,
579
  "step": 8300
580
  },
581
  {
582
  "epoch": 44.36,
583
+ "learning_rate": 1.2980599647266315e-05,
584
+ "loss": 0.0168,
585
  "step": 8400
586
  },
587
  {
588
  "epoch": 44.88,
589
+ "learning_rate": 1.253968253968254e-05,
590
+ "loss": 0.0146,
591
  "step": 8500
592
  },
593
  {
594
  "epoch": 45.41,
595
+ "learning_rate": 1.2098765432098767e-05,
596
+ "loss": 0.0149,
597
  "step": 8600
598
  },
599
  {
600
  "epoch": 45.94,
601
+ "learning_rate": 1.1657848324514992e-05,
602
+ "loss": 0.0155,
603
  "step": 8700
604
  },
605
  {
606
  "epoch": 46.47,
607
+ "learning_rate": 1.1221340388007055e-05,
608
+ "loss": 0.0196,
609
  "step": 8800
610
  },
611
  {
612
  "epoch": 47.0,
613
+ "learning_rate": 1.0780423280423282e-05,
614
+ "loss": 0.0229,
615
  "step": 8900
616
  },
617
  {
618
  "epoch": 47.52,
619
+ "learning_rate": 1.0339506172839507e-05,
620
+ "loss": 0.0148,
621
+ "step": 9000
622
+ },
623
+ {
624
+ "epoch": 47.52,
625
+ "eval_accuracy": 0.9220296740531921,
626
+ "eval_loss": 0.36532989144325256,
627
+ "eval_runtime": 50.0277,
628
+ "eval_samples_per_second": 48.453,
629
+ "eval_steps_per_second": 6.057,
630
  "step": 9000
631
  },
632
  {
633
  "epoch": 48.05,
634
+ "learning_rate": 9.898589065255732e-06,
635
+ "loss": 0.0133,
636
  "step": 9100
637
  },
638
  {
639
  "epoch": 48.58,
640
+ "learning_rate": 9.457671957671959e-06,
641
+ "loss": 0.0131,
642
  "step": 9200
643
  },
644
  {
645
  "epoch": 49.11,
646
+ "learning_rate": 9.016754850088184e-06,
647
+ "loss": 0.0121,
648
  "step": 9300
649
  },
650
  {
651
  "epoch": 49.64,
652
+ "learning_rate": 8.575837742504409e-06,
653
+ "loss": 0.0168,
654
  "step": 9400
655
  },
656
  {
657
  "epoch": 50.17,
658
+ "learning_rate": 8.134920634920636e-06,
659
+ "loss": 0.0148,
660
  "step": 9500
661
  },
662
  {
663
  "epoch": 50.69,
664
+ "learning_rate": 7.694003527336861e-06,
665
+ "loss": 0.0129,
666
  "step": 9600
667
  },
668
  {
669
  "epoch": 51.22,
670
+ "learning_rate": 7.253086419753087e-06,
671
+ "loss": 0.012,
672
  "step": 9700
673
  },
674
  {
675
  "epoch": 51.75,
676
+ "learning_rate": 6.812169312169313e-06,
677
+ "loss": 0.0112,
678
  "step": 9800
679
  },
680
  {
681
  "epoch": 52.28,
682
+ "learning_rate": 6.371252204585539e-06,
683
+ "loss": 0.0193,
684
  "step": 9900
685
  },
686
  {
687
  "epoch": 52.81,
688
+ "learning_rate": 5.930335097001764e-06,
689
+ "loss": 0.0137,
690
  "step": 10000
691
  },
692
  {
693
  "epoch": 52.81,
694
+ "eval_accuracy": 0.9352310299873352,
695
+ "eval_loss": 0.3257134258747101,
696
+ "eval_runtime": 50.886,
697
+ "eval_samples_per_second": 47.636,
698
+ "eval_steps_per_second": 5.954,
699
  "step": 10000
700
  },
701
  {
702
  "epoch": 53.33,
703
+ "learning_rate": 5.489417989417989e-06,
704
+ "loss": 0.0171,
705
  "step": 10100
706
  },
707
  {
708
  "epoch": 53.86,
709
+ "learning_rate": 5.048500881834215e-06,
710
+ "loss": 0.0169,
711
  "step": 10200
712
  },
713
  {
714
  "epoch": 54.39,
715
+ "learning_rate": 4.611992945326279e-06,
716
+ "loss": 0.0136,
717
  "step": 10300
718
  },
719
  {
720
  "epoch": 54.92,
721
+ "learning_rate": 4.171075837742505e-06,
722
+ "loss": 0.0152,
723
  "step": 10400
724
  },
725
  {
726
  "epoch": 55.45,
727
+ "learning_rate": 3.7301587301587305e-06,
728
+ "loss": 0.015,
729
  "step": 10500
730
  },
731
  {
732
  "epoch": 55.97,
733
+ "learning_rate": 3.289241622574956e-06,
734
+ "loss": 0.0136,
735
  "step": 10600
736
  },
737
  {
738
  "epoch": 56.5,
739
+ "learning_rate": 2.848324514991182e-06,
740
+ "loss": 0.0156,
741
  "step": 10700
742
  },
743
  {
744
  "epoch": 57.03,
745
+ "learning_rate": 2.4074074074074075e-06,
746
+ "loss": 0.0137,
747
  "step": 10800
748
  },
749
  {
750
  "epoch": 57.56,
751
+ "learning_rate": 1.9664902998236335e-06,
752
+ "loss": 0.0147,
753
  "step": 10900
754
  },
755
  {
756
  "epoch": 58.09,
757
+ "learning_rate": 1.525573192239859e-06,
758
+ "loss": 0.0174,
759
+ "step": 11000
760
+ },
761
+ {
762
+ "epoch": 58.09,
763
+ "eval_accuracy": 0.933993399143219,
764
+ "eval_loss": 0.30968689918518066,
765
+ "eval_runtime": 51.0931,
766
+ "eval_samples_per_second": 47.443,
767
+ "eval_steps_per_second": 5.93,
768
  "step": 11000
769
  },
770
  {
771
  "epoch": 58.61,
772
+ "learning_rate": 1.0846560846560847e-06,
773
+ "loss": 0.0163,
774
  "step": 11100
775
  },
776
  {
777
  "epoch": 59.14,
778
+ "learning_rate": 6.437389770723105e-07,
779
+ "loss": 0.0096,
780
  "step": 11200
781
  },
782
  {
783
  "epoch": 59.67,
784
+ "learning_rate": 2.0282186948853617e-07,
785
+ "loss": 0.0121,
786
  "step": 11300
787
  },
788
  {
789
+ "epoch": 59.88,
790
+ "step": 11340,
791
+ "total_flos": 2.20514636224512e+19,
792
+ "train_loss": 0.030625741817122836,
793
+ "train_runtime": 25244.9182,
794
+ "train_samples_per_second": 28.806,
795
+ "train_steps_per_second": 0.449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
796
  },
797
  {
798
+ "epoch": 59.88,
799
+ "eval_accuracy": 0.9323432445526123,
800
+ "eval_loss": 0.3117374777793884,
801
+ "eval_runtime": 50.8983,
802
+ "eval_samples_per_second": 47.624,
803
+ "eval_steps_per_second": 5.953,
804
+ "step": 11340
805
  }
806
  ],
807
  "logging_steps": 100,
808
+ "max_steps": 11340,
809
+ "num_train_epochs": 60,
810
  "save_steps": 2000,
811
+ "total_flos": 2.20514636224512e+19,
812
  "trial_name": null,
813
  "trial_params": null
814
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe211c843b5d503caa749faf86af4e798d09fba3908277cc22163c26ef6460de
3
  size 4155
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34a256b8a5d2d883823dcb395af53cbfffa6c0546324fd46a6cbd61fd66d4518
3
  size 4155