Manuappu5670 committed on
Commit
c17f7cc
·
verified ·
1 Parent(s): 2956390

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. all_results.json +11 -11
  3. eval_results.json +6 -6
  4. train_results.json +6 -6
  5. trainer_state.json +443 -269
README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [apple/mobilevit-xx-small](https://huggingface.co/apple/mobilevit-xx-small) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 1.0688
21
- - Accuracy: 0.64
22
 
23
  ## Model description
24
 
 
17
 
18
  This model is a fine-tuned version of [apple/mobilevit-xx-small](https://huggingface.co/apple/mobilevit-xx-small) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 1.0321
21
+ - Accuracy: 0.66
22
 
23
  ## Model description
24
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 30.0,
3
- "eval_accuracy": 0.8922222222222222,
4
- "eval_loss": 0.5839348435401917,
5
- "eval_runtime": 641.5157,
6
- "eval_samples_per_second": 4.209,
7
- "eval_steps_per_second": 0.132,
8
- "total_flos": 2.136995592192e+16,
9
- "train_loss": 1.1791712323824564,
10
- "train_runtime": 995.7979,
11
- "train_samples_per_second": 15.063,
12
- "train_steps_per_second": 0.121
13
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "eval_accuracy": 0.66,
4
+ "eval_loss": 1.0320532321929932,
5
+ "eval_runtime": 0.4192,
6
+ "eval_samples_per_second": 238.552,
7
+ "eval_steps_per_second": 9.542,
8
+ "total_flos": 2.630499107615539e+16,
9
+ "train_loss": 1.076003630956014,
10
+ "train_runtime": 895.7496,
11
+ "train_samples_per_second": 22.328,
12
+ "train_steps_per_second": 0.167
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_accuracy": 0.8922222222222222,
4
- "eval_loss": 0.5839348435401917,
5
- "eval_runtime": 641.5157,
6
- "eval_samples_per_second": 4.209,
7
- "eval_steps_per_second": 0.132
8
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "eval_accuracy": 0.66,
4
+ "eval_loss": 1.0320532321929932,
5
+ "eval_runtime": 0.4192,
6
+ "eval_samples_per_second": 238.552,
7
+ "eval_steps_per_second": 9.542
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 30.0,
3
- "total_flos": 2.136995592192e+16,
4
- "train_loss": 1.1791712323824564,
5
- "train_runtime": 995.7979,
6
- "train_samples_per_second": 15.063,
7
- "train_steps_per_second": 0.121
8
  }
 
1
  {
2
+ "epoch": 46.15384615384615,
3
+ "total_flos": 2.630499107615539e+16,
4
+ "train_loss": 1.076003630956014,
5
+ "train_runtime": 895.7496,
6
+ "train_samples_per_second": 22.328,
7
+ "train_steps_per_second": 0.167
8
  }
trainer_state.json CHANGED
@@ -1,381 +1,555 @@
1
  {
2
- "best_metric": 0.636,
3
- "best_model_checkpoint": "mobilevit-xx-small-finetuned-eurosat/checkpoint-120",
4
- "epoch": 30.0,
5
  "eval_steps": 500,
6
- "global_step": 120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_accuracy": 0.212,
14
- "eval_loss": 2.2717537879943848,
15
- "eval_runtime": 11.4285,
16
- "eval_samples_per_second": 43.75,
17
- "eval_steps_per_second": 1.4,
18
- "step": 4
19
  },
20
  {
21
- "epoch": 2.0,
22
- "eval_accuracy": 0.29,
23
- "eval_loss": 2.1031904220581055,
24
- "eval_runtime": 20.4802,
25
- "eval_samples_per_second": 24.414,
26
- "eval_steps_per_second": 0.781,
27
- "step": 8
28
  },
29
  {
30
- "epoch": 2.5,
31
- "grad_norm": 4.617868423461914,
32
- "learning_rate": 0.0025,
33
- "loss": 2.1883,
34
- "step": 10
 
 
35
  },
36
  {
37
- "epoch": 3.0,
38
- "eval_accuracy": 0.336,
39
- "eval_loss": 1.9200304746627808,
40
- "eval_runtime": 2.4299,
41
- "eval_samples_per_second": 205.771,
42
- "eval_steps_per_second": 6.585,
43
- "step": 12
44
  },
45
  {
46
  "epoch": 4.0,
47
- "eval_accuracy": 0.26,
48
- "eval_loss": 2.733245849609375,
49
- "eval_runtime": 1.9139,
50
- "eval_samples_per_second": 261.247,
51
- "eval_steps_per_second": 8.36,
 
 
 
 
 
 
 
 
 
52
  "step": 16
53
  },
54
  {
55
- "epoch": 5.0,
56
- "grad_norm": 7.1269941329956055,
57
- "learning_rate": 0.002777777777777778,
58
- "loss": 1.7537,
59
- "step": 20
 
 
60
  },
61
  {
62
- "epoch": 5.0,
63
- "eval_accuracy": 0.354,
64
- "eval_loss": 2.0565059185028076,
65
- "eval_runtime": 22.3861,
66
- "eval_samples_per_second": 22.335,
67
- "eval_steps_per_second": 0.715,
68
  "step": 20
69
  },
70
  {
71
- "epoch": 6.0,
72
- "eval_accuracy": 0.352,
73
- "eval_loss": 1.9995957612991333,
74
- "eval_runtime": 1.94,
75
- "eval_samples_per_second": 257.729,
76
- "eval_steps_per_second": 8.247,
77
- "step": 24
78
- },
79
- {
80
- "epoch": 7.0,
81
- "eval_accuracy": 0.388,
82
- "eval_loss": 1.7918899059295654,
83
- "eval_runtime": 21.978,
84
- "eval_samples_per_second": 22.75,
85
- "eval_steps_per_second": 0.728,
86
- "step": 28
87
  },
88
  {
89
- "epoch": 7.5,
90
- "grad_norm": 3.362823247909546,
91
- "learning_rate": 0.0025,
92
- "loss": 1.525,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  "step": 30
94
  },
95
  {
96
- "epoch": 8.0,
97
- "eval_accuracy": 0.366,
98
- "eval_loss": 1.9682480096817017,
99
- "eval_runtime": 22.1475,
100
- "eval_samples_per_second": 22.576,
101
- "eval_steps_per_second": 0.722,
102
  "step": 32
103
  },
104
  {
105
- "epoch": 9.0,
106
- "eval_accuracy": 0.44,
107
- "eval_loss": 1.550385594367981,
108
- "eval_runtime": 4.0809,
109
- "eval_samples_per_second": 122.521,
110
- "eval_steps_per_second": 3.921,
111
- "step": 36
112
  },
113
  {
114
- "epoch": 10.0,
115
- "grad_norm": 6.439978122711182,
116
- "learning_rate": 0.0022222222222222222,
117
- "loss": 1.41,
 
 
 
 
 
 
 
 
 
118
  "step": 40
119
  },
120
  {
121
- "epoch": 10.0,
122
- "eval_accuracy": 0.406,
123
- "eval_loss": 1.6390491724014282,
124
- "eval_runtime": 15.0604,
125
- "eval_samples_per_second": 33.2,
126
- "eval_steps_per_second": 1.062,
127
- "step": 40
128
  },
129
  {
130
- "epoch": 11.0,
131
- "eval_accuracy": 0.482,
132
- "eval_loss": 1.8054633140563965,
133
- "eval_runtime": 22.4978,
134
- "eval_samples_per_second": 22.224,
135
- "eval_steps_per_second": 0.711,
136
- "step": 44
137
  },
138
  {
139
- "epoch": 12.0,
140
- "eval_accuracy": 0.372,
141
- "eval_loss": 1.8536120653152466,
142
- "eval_runtime": 1.8645,
143
- "eval_samples_per_second": 268.166,
144
- "eval_steps_per_second": 8.581,
145
  "step": 48
146
  },
147
  {
148
- "epoch": 12.5,
149
- "grad_norm": 4.127369403839111,
150
- "learning_rate": 0.0019444444444444444,
151
- "loss": 1.2638,
152
  "step": 50
153
  },
154
  {
155
- "epoch": 13.0,
156
- "eval_accuracy": 0.488,
157
- "eval_loss": 1.6079543828964233,
158
- "eval_runtime": 1.8468,
159
- "eval_samples_per_second": 270.739,
160
- "eval_steps_per_second": 8.664,
161
  "step": 52
162
  },
163
  {
164
- "epoch": 14.0,
165
- "eval_accuracy": 0.418,
166
- "eval_loss": 1.810085415840149,
167
- "eval_runtime": 21.9085,
168
- "eval_samples_per_second": 22.822,
169
- "eval_steps_per_second": 0.73,
170
- "step": 56
171
  },
172
  {
173
- "epoch": 15.0,
174
- "grad_norm": 5.039086818695068,
175
- "learning_rate": 0.0016666666666666668,
176
- "loss": 1.1194,
177
- "step": 60
 
 
178
  },
179
  {
180
- "epoch": 15.0,
181
- "eval_accuracy": 0.422,
182
- "eval_loss": 1.742574691772461,
183
- "eval_runtime": 22.139,
184
- "eval_samples_per_second": 22.585,
185
- "eval_steps_per_second": 0.723,
186
  "step": 60
187
  },
188
  {
189
- "epoch": 16.0,
190
- "eval_accuracy": 0.452,
191
- "eval_loss": 1.7981486320495605,
192
- "eval_runtime": 39.1443,
193
- "eval_samples_per_second": 12.773,
194
- "eval_steps_per_second": 0.409,
195
- "step": 64
196
- },
197
- {
198
- "epoch": 17.0,
199
- "eval_accuracy": 0.392,
200
- "eval_loss": 2.088580369949341,
201
- "eval_runtime": 1.8333,
202
- "eval_samples_per_second": 272.732,
203
- "eval_steps_per_second": 8.727,
 
 
 
 
 
 
 
 
 
204
  "step": 68
205
  },
206
  {
207
- "epoch": 17.5,
208
- "grad_norm": 3.298395872116089,
209
- "learning_rate": 0.001388888888888889,
210
- "loss": 1.0082,
211
  "step": 70
212
  },
213
  {
214
- "epoch": 18.0,
215
- "eval_accuracy": 0.538,
216
- "eval_loss": 1.4252102375030518,
217
- "eval_runtime": 2.3763,
218
- "eval_samples_per_second": 210.41,
219
- "eval_steps_per_second": 6.733,
220
- "step": 72
221
  },
222
  {
223
- "epoch": 19.0,
224
- "eval_accuracy": 0.576,
225
- "eval_loss": 1.3337562084197998,
226
- "eval_runtime": 2.0774,
227
- "eval_samples_per_second": 240.688,
228
- "eval_steps_per_second": 7.702,
229
- "step": 76
230
  },
231
  {
232
- "epoch": 20.0,
233
- "grad_norm": 3.3816585540771484,
234
- "learning_rate": 0.0011111111111111111,
235
- "loss": 0.9556,
 
 
 
 
 
 
 
 
 
236
  "step": 80
237
  },
238
  {
239
- "epoch": 20.0,
240
- "eval_accuracy": 0.492,
241
- "eval_loss": 1.717270851135254,
242
- "eval_runtime": 1.9665,
243
- "eval_samples_per_second": 254.261,
244
- "eval_steps_per_second": 8.136,
245
- "step": 80
246
  },
247
  {
248
- "epoch": 21.0,
249
- "eval_accuracy": 0.572,
250
- "eval_loss": 1.4067368507385254,
251
- "eval_runtime": 22.3773,
252
- "eval_samples_per_second": 22.344,
253
- "eval_steps_per_second": 0.715,
254
  "step": 84
255
  },
256
  {
257
- "epoch": 22.0,
258
- "eval_accuracy": 0.614,
259
- "eval_loss": 1.1839243173599243,
260
- "eval_runtime": 1.8497,
261
- "eval_samples_per_second": 270.312,
262
- "eval_steps_per_second": 8.65,
263
- "step": 88
264
  },
265
  {
266
- "epoch": 22.5,
267
- "grad_norm": 4.263976097106934,
268
- "learning_rate": 0.0008333333333333334,
269
- "loss": 0.8404,
270
  "step": 90
271
  },
272
  {
273
- "epoch": 23.0,
274
- "eval_accuracy": 0.612,
275
- "eval_loss": 1.2036840915679932,
276
- "eval_runtime": 1.8642,
277
- "eval_samples_per_second": 268.207,
278
- "eval_steps_per_second": 8.583,
279
- "step": 92
280
  },
281
  {
282
- "epoch": 24.0,
283
- "eval_accuracy": 0.552,
284
- "eval_loss": 1.4832416772842407,
285
- "eval_runtime": 1.9785,
286
- "eval_samples_per_second": 252.716,
287
- "eval_steps_per_second": 8.087,
288
- "step": 96
289
- },
290
- {
291
- "epoch": 25.0,
292
- "grad_norm": 3.777881145477295,
293
- "learning_rate": 0.0005555555555555556,
294
- "loss": 0.7603,
295
- "step": 100
 
 
296
  },
297
  {
298
- "epoch": 25.0,
299
- "eval_accuracy": 0.592,
300
- "eval_loss": 1.3205021619796753,
301
- "eval_runtime": 1.9133,
302
- "eval_samples_per_second": 261.322,
303
- "eval_steps_per_second": 8.362,
304
  "step": 100
305
  },
306
  {
307
- "epoch": 26.0,
308
  "eval_accuracy": 0.61,
309
- "eval_loss": 1.250473976135254,
310
- "eval_runtime": 41.202,
311
- "eval_samples_per_second": 12.135,
312
- "eval_steps_per_second": 0.388,
 
 
 
 
 
 
 
 
 
313
  "step": 104
314
  },
315
  {
316
- "epoch": 27.0,
317
- "eval_accuracy": 0.622,
318
- "eval_loss": 1.2412116527557373,
319
- "eval_runtime": 1.9569,
320
- "eval_samples_per_second": 255.501,
321
- "eval_steps_per_second": 8.176,
322
- "step": 108
323
  },
324
  {
325
- "epoch": 27.5,
326
- "grad_norm": 3.8510005474090576,
327
- "learning_rate": 0.0002777777777777778,
328
- "loss": 0.7096,
329
  "step": 110
330
  },
331
  {
332
- "epoch": 28.0,
333
- "eval_accuracy": 0.626,
334
- "eval_loss": 1.1740450859069824,
335
- "eval_runtime": 6.5344,
336
- "eval_samples_per_second": 76.518,
337
- "eval_steps_per_second": 2.449,
338
- "step": 112
339
  },
340
  {
341
- "epoch": 29.0,
342
- "eval_accuracy": 0.624,
343
- "eval_loss": 1.1464664936065674,
344
- "eval_runtime": 20.8737,
345
- "eval_samples_per_second": 23.954,
346
- "eval_steps_per_second": 0.767,
347
- "step": 116
348
  },
349
  {
350
- "epoch": 30.0,
351
- "grad_norm": 3.827312707901001,
352
- "learning_rate": 0.0,
353
- "loss": 0.6157,
 
 
 
 
 
 
 
 
 
354
  "step": 120
355
  },
356
  {
357
- "epoch": 30.0,
358
- "eval_accuracy": 0.636,
359
- "eval_loss": 1.1490942239761353,
360
- "eval_runtime": 17.8767,
361
- "eval_samples_per_second": 27.969,
362
- "eval_steps_per_second": 0.895,
363
  "step": 120
364
  },
365
  {
366
- "epoch": 30.0,
367
- "step": 120,
368
- "total_flos": 2.136995592192e+16,
369
- "train_loss": 1.1791712323824564,
370
- "train_runtime": 995.7979,
371
- "train_samples_per_second": 15.063,
372
- "train_steps_per_second": 0.121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  }
374
  ],
375
  "logging_steps": 10,
376
- "max_steps": 120,
377
  "num_input_tokens_seen": 0,
378
- "num_train_epochs": 30,
379
  "save_steps": 500,
380
  "stateful_callbacks": {
381
  "TrainerControl": {
@@ -389,7 +563,7 @@
389
  "attributes": {}
390
  }
391
  },
392
- "total_flos": 2.136995592192e+16,
393
  "train_batch_size": 32,
394
  "trial_name": null,
395
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.66,
3
+ "best_model_checkpoint": "mobilevit-xx-small-finetuned-eurosat/checkpoint-94",
4
+ "epoch": 46.15384615384615,
5
  "eval_steps": 500,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.9230769230769231,
13
+ "eval_accuracy": 0.11,
14
+ "eval_loss": 2.2814395427703857,
15
+ "eval_runtime": 0.3447,
16
+ "eval_samples_per_second": 290.129,
17
+ "eval_steps_per_second": 11.605,
18
+ "step": 3
19
  },
20
  {
21
+ "epoch": 1.8461538461538463,
22
+ "eval_accuracy": 0.26,
23
+ "eval_loss": 2.227895736694336,
24
+ "eval_runtime": 0.4326,
25
+ "eval_samples_per_second": 231.174,
26
+ "eval_steps_per_second": 9.247,
27
+ "step": 6
28
  },
29
  {
30
+ "epoch": 2.769230769230769,
31
+ "eval_accuracy": 0.29,
32
+ "eval_loss": 2.105987071990967,
33
+ "eval_runtime": 0.4383,
34
+ "eval_samples_per_second": 228.177,
35
+ "eval_steps_per_second": 9.127,
36
+ "step": 9
37
  },
38
  {
39
+ "epoch": 3.076923076923077,
40
+ "grad_norm": 3.6058764457702637,
41
+ "learning_rate": 0.002,
42
+ "loss": 2.2256,
43
+ "step": 10
 
 
44
  },
45
  {
46
  "epoch": 4.0,
47
+ "eval_accuracy": 0.31,
48
+ "eval_loss": 1.8600536584854126,
49
+ "eval_runtime": 12.3694,
50
+ "eval_samples_per_second": 8.084,
51
+ "eval_steps_per_second": 0.323,
52
+ "step": 13
53
+ },
54
+ {
55
+ "epoch": 4.923076923076923,
56
+ "eval_accuracy": 0.32,
57
+ "eval_loss": 1.8917309045791626,
58
+ "eval_runtime": 13.8701,
59
+ "eval_samples_per_second": 7.21,
60
+ "eval_steps_per_second": 0.288,
61
  "step": 16
62
  },
63
  {
64
+ "epoch": 5.846153846153846,
65
+ "eval_accuracy": 0.33,
66
+ "eval_loss": 1.7589596509933472,
67
+ "eval_runtime": 14.8817,
68
+ "eval_samples_per_second": 6.72,
69
+ "eval_steps_per_second": 0.269,
70
+ "step": 19
71
  },
72
  {
73
+ "epoch": 6.153846153846154,
74
+ "grad_norm": 3.9451377391815186,
75
+ "learning_rate": 0.0028888888888888888,
76
+ "loss": 1.8,
 
 
77
  "step": 20
78
  },
79
  {
80
+ "epoch": 6.769230769230769,
81
+ "eval_accuracy": 0.21,
82
+ "eval_loss": 2.9505045413970947,
83
+ "eval_runtime": 0.8737,
84
+ "eval_samples_per_second": 114.459,
85
+ "eval_steps_per_second": 4.578,
86
+ "step": 22
 
 
 
 
 
 
 
 
 
87
  },
88
  {
89
+ "epoch": 8.0,
90
+ "eval_accuracy": 0.19,
91
+ "eval_loss": 2.440716505050659,
92
+ "eval_runtime": 0.3849,
93
+ "eval_samples_per_second": 259.81,
94
+ "eval_steps_per_second": 10.392,
95
+ "step": 26
96
+ },
97
+ {
98
+ "epoch": 8.923076923076923,
99
+ "eval_accuracy": 0.36,
100
+ "eval_loss": 2.6913070678710938,
101
+ "eval_runtime": 0.4553,
102
+ "eval_samples_per_second": 219.638,
103
+ "eval_steps_per_second": 8.786,
104
+ "step": 29
105
+ },
106
+ {
107
+ "epoch": 9.23076923076923,
108
+ "grad_norm": 5.145291805267334,
109
+ "learning_rate": 0.0026666666666666666,
110
+ "loss": 1.6404,
111
  "step": 30
112
  },
113
  {
114
+ "epoch": 9.846153846153847,
115
+ "eval_accuracy": 0.48,
116
+ "eval_loss": 1.409439206123352,
117
+ "eval_runtime": 16.0101,
118
+ "eval_samples_per_second": 6.246,
119
+ "eval_steps_per_second": 0.25,
120
  "step": 32
121
  },
122
  {
123
+ "epoch": 10.76923076923077,
124
+ "eval_accuracy": 0.47,
125
+ "eval_loss": 1.401109218597412,
126
+ "eval_runtime": 1.2056,
127
+ "eval_samples_per_second": 82.944,
128
+ "eval_steps_per_second": 3.318,
129
+ "step": 35
130
  },
131
  {
132
+ "epoch": 12.0,
133
+ "eval_accuracy": 0.45,
134
+ "eval_loss": 1.759374737739563,
135
+ "eval_runtime": 16.745,
136
+ "eval_samples_per_second": 5.972,
137
+ "eval_steps_per_second": 0.239,
138
+ "step": 39
139
+ },
140
+ {
141
+ "epoch": 12.307692307692308,
142
+ "grad_norm": 9.834226608276367,
143
+ "learning_rate": 0.0024444444444444444,
144
+ "loss": 1.4817,
145
  "step": 40
146
  },
147
  {
148
+ "epoch": 12.923076923076923,
149
+ "eval_accuracy": 0.46,
150
+ "eval_loss": 1.431288719177246,
151
+ "eval_runtime": 0.4722,
152
+ "eval_samples_per_second": 211.785,
153
+ "eval_steps_per_second": 8.471,
154
+ "step": 42
155
  },
156
  {
157
+ "epoch": 13.846153846153847,
158
+ "eval_accuracy": 0.49,
159
+ "eval_loss": 1.2744395732879639,
160
+ "eval_runtime": 6.1902,
161
+ "eval_samples_per_second": 16.155,
162
+ "eval_steps_per_second": 0.646,
163
+ "step": 45
164
  },
165
  {
166
+ "epoch": 14.76923076923077,
167
+ "eval_accuracy": 0.46,
168
+ "eval_loss": 1.4527015686035156,
169
+ "eval_runtime": 0.337,
170
+ "eval_samples_per_second": 296.7,
171
+ "eval_steps_per_second": 11.868,
172
  "step": 48
173
  },
174
  {
175
+ "epoch": 15.384615384615385,
176
+ "grad_norm": 3.165754795074463,
177
+ "learning_rate": 0.0022222222222222222,
178
+ "loss": 1.2872,
179
  "step": 50
180
  },
181
  {
182
+ "epoch": 16.0,
183
+ "eval_accuracy": 0.42,
184
+ "eval_loss": 1.5522328615188599,
185
+ "eval_runtime": 0.4566,
186
+ "eval_samples_per_second": 219.028,
187
+ "eval_steps_per_second": 8.761,
188
  "step": 52
189
  },
190
  {
191
+ "epoch": 16.923076923076923,
192
+ "eval_accuracy": 0.38,
193
+ "eval_loss": 1.7599867582321167,
194
+ "eval_runtime": 15.1941,
195
+ "eval_samples_per_second": 6.582,
196
+ "eval_steps_per_second": 0.263,
197
+ "step": 55
198
  },
199
  {
200
+ "epoch": 17.846153846153847,
201
+ "eval_accuracy": 0.53,
202
+ "eval_loss": 1.3952364921569824,
203
+ "eval_runtime": 0.4698,
204
+ "eval_samples_per_second": 212.838,
205
+ "eval_steps_per_second": 8.514,
206
+ "step": 58
207
  },
208
  {
209
+ "epoch": 18.46153846153846,
210
+ "grad_norm": 3.935255765914917,
211
+ "learning_rate": 0.002,
212
+ "loss": 1.2307,
 
 
213
  "step": 60
214
  },
215
  {
216
+ "epoch": 18.76923076923077,
217
+ "eval_accuracy": 0.51,
218
+ "eval_loss": 1.5067017078399658,
219
+ "eval_runtime": 8.3399,
220
+ "eval_samples_per_second": 11.991,
221
+ "eval_steps_per_second": 0.48,
222
+ "step": 61
223
+ },
224
+ {
225
+ "epoch": 20.0,
226
+ "eval_accuracy": 0.51,
227
+ "eval_loss": 1.631130576133728,
228
+ "eval_runtime": 14.6358,
229
+ "eval_samples_per_second": 6.833,
230
+ "eval_steps_per_second": 0.273,
231
+ "step": 65
232
+ },
233
+ {
234
+ "epoch": 20.923076923076923,
235
+ "eval_accuracy": 0.58,
236
+ "eval_loss": 1.3986690044403076,
237
+ "eval_runtime": 0.4557,
238
+ "eval_samples_per_second": 219.438,
239
+ "eval_steps_per_second": 8.778,
240
  "step": 68
241
  },
242
  {
243
+ "epoch": 21.53846153846154,
244
+ "grad_norm": 6.055632591247559,
245
+ "learning_rate": 0.0017777777777777776,
246
+ "loss": 1.0604,
247
  "step": 70
248
  },
249
  {
250
+ "epoch": 21.846153846153847,
251
+ "eval_accuracy": 0.54,
252
+ "eval_loss": 1.4127693176269531,
253
+ "eval_runtime": 0.4206,
254
+ "eval_samples_per_second": 237.781,
255
+ "eval_steps_per_second": 9.511,
256
+ "step": 71
257
  },
258
  {
259
+ "epoch": 22.76923076923077,
260
+ "eval_accuracy": 0.56,
261
+ "eval_loss": 1.1939380168914795,
262
+ "eval_runtime": 0.4239,
263
+ "eval_samples_per_second": 235.924,
264
+ "eval_steps_per_second": 9.437,
265
+ "step": 74
266
  },
267
  {
268
+ "epoch": 24.0,
269
+ "eval_accuracy": 0.49,
270
+ "eval_loss": 1.4323681592941284,
271
+ "eval_runtime": 0.3707,
272
+ "eval_samples_per_second": 269.772,
273
+ "eval_steps_per_second": 10.791,
274
+ "step": 78
275
+ },
276
+ {
277
+ "epoch": 24.615384615384617,
278
+ "grad_norm": 8.553020477294922,
279
+ "learning_rate": 0.0015555555555555555,
280
+ "loss": 1.0053,
281
  "step": 80
282
  },
283
  {
284
+ "epoch": 24.923076923076923,
285
+ "eval_accuracy": 0.53,
286
+ "eval_loss": 1.3661165237426758,
287
+ "eval_runtime": 0.4573,
288
+ "eval_samples_per_second": 218.668,
289
+ "eval_steps_per_second": 8.747,
290
+ "step": 81
291
  },
292
  {
293
+ "epoch": 25.846153846153847,
294
+ "eval_accuracy": 0.53,
295
+ "eval_loss": 1.25278902053833,
296
+ "eval_runtime": 0.3451,
297
+ "eval_samples_per_second": 289.798,
298
+ "eval_steps_per_second": 11.592,
299
  "step": 84
300
  },
301
  {
302
+ "epoch": 26.76923076923077,
303
+ "eval_accuracy": 0.57,
304
+ "eval_loss": 1.204034447669983,
305
+ "eval_runtime": 2.5122,
306
+ "eval_samples_per_second": 39.806,
307
+ "eval_steps_per_second": 1.592,
308
+ "step": 87
309
  },
310
  {
311
+ "epoch": 27.692307692307693,
312
+ "grad_norm": 3.5659632682800293,
313
+ "learning_rate": 0.0013333333333333333,
314
+ "loss": 0.8327,
315
  "step": 90
316
  },
317
  {
318
+ "epoch": 28.0,
319
+ "eval_accuracy": 0.61,
320
+ "eval_loss": 1.1885842084884644,
321
+ "eval_runtime": 0.5132,
322
+ "eval_samples_per_second": 194.851,
323
+ "eval_steps_per_second": 7.794,
324
+ "step": 91
325
  },
326
  {
327
+ "epoch": 28.923076923076923,
328
+ "eval_accuracy": 0.66,
329
+ "eval_loss": 1.0320532321929932,
330
+ "eval_runtime": 17.1732,
331
+ "eval_samples_per_second": 5.823,
332
+ "eval_steps_per_second": 0.233,
333
+ "step": 94
334
+ },
335
+ {
336
+ "epoch": 29.846153846153847,
337
+ "eval_accuracy": 0.61,
338
+ "eval_loss": 1.1496257781982422,
339
+ "eval_runtime": 0.4295,
340
+ "eval_samples_per_second": 232.828,
341
+ "eval_steps_per_second": 9.313,
342
+ "step": 97
343
  },
344
  {
345
+ "epoch": 30.76923076923077,
346
+ "grad_norm": 6.044861793518066,
347
+ "learning_rate": 0.0011111111111111111,
348
+ "loss": 0.7456,
 
 
349
  "step": 100
350
  },
351
  {
352
+ "epoch": 30.76923076923077,
353
  "eval_accuracy": 0.61,
354
+ "eval_loss": 1.1801018714904785,
355
+ "eval_runtime": 0.8649,
356
+ "eval_samples_per_second": 115.616,
357
+ "eval_steps_per_second": 4.625,
358
+ "step": 100
359
+ },
360
+ {
361
+ "epoch": 32.0,
362
+ "eval_accuracy": 0.53,
363
+ "eval_loss": 1.288604736328125,
364
+ "eval_runtime": 0.3454,
365
+ "eval_samples_per_second": 289.55,
366
+ "eval_steps_per_second": 11.582,
367
  "step": 104
368
  },
369
  {
370
+ "epoch": 32.92307692307692,
371
+ "eval_accuracy": 0.6,
372
+ "eval_loss": 1.2215436697006226,
373
+ "eval_runtime": 0.3906,
374
+ "eval_samples_per_second": 256.03,
375
+ "eval_steps_per_second": 10.241,
376
+ "step": 107
377
  },
378
  {
379
+ "epoch": 33.84615384615385,
380
+ "grad_norm": 6.93057107925415,
381
+ "learning_rate": 0.0008888888888888888,
382
+ "loss": 0.7106,
383
  "step": 110
384
  },
385
  {
386
+ "epoch": 33.84615384615385,
387
+ "eval_accuracy": 0.55,
388
+ "eval_loss": 1.2372339963912964,
389
+ "eval_runtime": 11.0141,
390
+ "eval_samples_per_second": 9.079,
391
+ "eval_steps_per_second": 0.363,
392
+ "step": 110
393
  },
394
  {
395
+ "epoch": 34.76923076923077,
396
+ "eval_accuracy": 0.62,
397
+ "eval_loss": 1.1833901405334473,
398
+ "eval_runtime": 0.3401,
399
+ "eval_samples_per_second": 294.013,
400
+ "eval_steps_per_second": 11.761,
401
+ "step": 113
402
  },
403
  {
404
+ "epoch": 36.0,
405
+ "eval_accuracy": 0.64,
406
+ "eval_loss": 1.2001395225524902,
407
+ "eval_runtime": 0.4622,
408
+ "eval_samples_per_second": 216.349,
409
+ "eval_steps_per_second": 8.654,
410
+ "step": 117
411
+ },
412
+ {
413
+ "epoch": 36.92307692307692,
414
+ "grad_norm": 4.3674445152282715,
415
+ "learning_rate": 0.0006666666666666666,
416
+ "loss": 0.5914,
417
  "step": 120
418
  },
419
  {
420
+ "epoch": 36.92307692307692,
421
+ "eval_accuracy": 0.63,
422
+ "eval_loss": 1.1181855201721191,
423
+ "eval_runtime": 8.8978,
424
+ "eval_samples_per_second": 11.239,
425
+ "eval_steps_per_second": 0.45,
426
  "step": 120
427
  },
428
  {
429
+ "epoch": 37.84615384615385,
430
+ "eval_accuracy": 0.62,
431
+ "eval_loss": 1.2223446369171143,
432
+ "eval_runtime": 0.4156,
433
+ "eval_samples_per_second": 240.596,
434
+ "eval_steps_per_second": 9.624,
435
+ "step": 123
436
+ },
437
+ {
438
+ "epoch": 38.76923076923077,
439
+ "eval_accuracy": 0.59,
440
+ "eval_loss": 1.244262456893921,
441
+ "eval_runtime": 0.4328,
442
+ "eval_samples_per_second": 231.05,
443
+ "eval_steps_per_second": 9.242,
444
+ "step": 126
445
+ },
446
+ {
447
+ "epoch": 40.0,
448
+ "grad_norm": 3.360128402709961,
449
+ "learning_rate": 0.0004444444444444444,
450
+ "loss": 0.5601,
451
+ "step": 130
452
+ },
453
+ {
454
+ "epoch": 40.0,
455
+ "eval_accuracy": 0.59,
456
+ "eval_loss": 1.2605319023132324,
457
+ "eval_runtime": 13.3957,
458
+ "eval_samples_per_second": 7.465,
459
+ "eval_steps_per_second": 0.299,
460
+ "step": 130
461
+ },
462
+ {
463
+ "epoch": 40.92307692307692,
464
+ "eval_accuracy": 0.6,
465
+ "eval_loss": 1.2939895391464233,
466
+ "eval_runtime": 5.4663,
467
+ "eval_samples_per_second": 18.294,
468
+ "eval_steps_per_second": 0.732,
469
+ "step": 133
470
+ },
471
+ {
472
+ "epoch": 41.84615384615385,
473
+ "eval_accuracy": 0.59,
474
+ "eval_loss": 1.2283549308776855,
475
+ "eval_runtime": 0.4128,
476
+ "eval_samples_per_second": 242.228,
477
+ "eval_steps_per_second": 9.689,
478
+ "step": 136
479
+ },
480
+ {
481
+ "epoch": 42.76923076923077,
482
+ "eval_accuracy": 0.61,
483
+ "eval_loss": 1.1883938312530518,
484
+ "eval_runtime": 0.3379,
485
+ "eval_samples_per_second": 295.934,
486
+ "eval_steps_per_second": 11.837,
487
+ "step": 139
488
+ },
489
+ {
490
+ "epoch": 43.07692307692308,
491
+ "grad_norm": 7.6419358253479,
492
+ "learning_rate": 0.0002222222222222222,
493
+ "loss": 0.5151,
494
+ "step": 140
495
+ },
496
+ {
497
+ "epoch": 44.0,
498
+ "eval_accuracy": 0.63,
499
+ "eval_loss": 1.1789089441299438,
500
+ "eval_runtime": 0.4065,
501
+ "eval_samples_per_second": 245.995,
502
+ "eval_steps_per_second": 9.84,
503
+ "step": 143
504
+ },
505
+ {
506
+ "epoch": 44.92307692307692,
507
+ "eval_accuracy": 0.62,
508
+ "eval_loss": 1.129704475402832,
509
+ "eval_runtime": 0.4164,
510
+ "eval_samples_per_second": 240.144,
511
+ "eval_steps_per_second": 9.606,
512
+ "step": 146
513
+ },
514
+ {
515
+ "epoch": 45.84615384615385,
516
+ "eval_accuracy": 0.63,
517
+ "eval_loss": 1.0879005193710327,
518
+ "eval_runtime": 0.6008,
519
+ "eval_samples_per_second": 166.444,
520
+ "eval_steps_per_second": 6.658,
521
+ "step": 149
522
+ },
523
+ {
524
+ "epoch": 46.15384615384615,
525
+ "grad_norm": 2.6147122383117676,
526
+ "learning_rate": 0.0,
527
+ "loss": 0.4531,
528
+ "step": 150
529
+ },
530
+ {
531
+ "epoch": 46.15384615384615,
532
+ "eval_accuracy": 0.64,
533
+ "eval_loss": 1.068832516670227,
534
+ "eval_runtime": 8.4147,
535
+ "eval_samples_per_second": 11.884,
536
+ "eval_steps_per_second": 0.475,
537
+ "step": 150
538
+ },
539
+ {
540
+ "epoch": 46.15384615384615,
541
+ "step": 150,
542
+ "total_flos": 2.630499107615539e+16,
543
+ "train_loss": 1.076003630956014,
544
+ "train_runtime": 895.7496,
545
+ "train_samples_per_second": 22.328,
546
+ "train_steps_per_second": 0.167
547
  }
548
  ],
549
  "logging_steps": 10,
550
+ "max_steps": 150,
551
  "num_input_tokens_seen": 0,
552
+ "num_train_epochs": 50,
553
  "save_steps": 500,
554
  "stateful_callbacks": {
555
  "TrainerControl": {
 
563
  "attributes": {}
564
  }
565
  },
566
+ "total_flos": 2.630499107615539e+16,
567
  "train_batch_size": 32,
568
  "trial_name": null,
569
  "trial_params": null