alkiskoudounas commited on
Commit
aab5c8a
·
verified ·
1 Parent(s): 581fa25

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +296 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +1308 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-large-960h-lv60-self",
3
+ "activation_dropout": 0.1,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.1,
56
+ "gradient_checkpointing": false,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_dropout_prob": 0.1,
60
+ "hidden_size": 1024,
61
+ "id2label": {
62
+ "0": "calendar_set",
63
+ "1": "wavs/audio_volume_up",
64
+ "10": "qa_stock",
65
+ "11": "play_music",
66
+ "12": "recommendation_events",
67
+ "13": "qa_definition",
68
+ "14": "alarm_remove",
69
+ "15": "play_podcasts",
70
+ "16": "social_query",
71
+ "17": "email_addcontact",
72
+ "18": "news_query",
73
+ "19": "calendar_query",
74
+ "2": "iot_hue_lightup",
75
+ "20": "music_likeness",
76
+ "21": "general_quirky",
77
+ "22": "qa_factoid",
78
+ "23": "takeaway_order",
79
+ "24": "play_wavs/audiobook",
80
+ "25": "iot_cleaning",
81
+ "26": "general_greet",
82
+ "27": "transport_query",
83
+ "28": "transport_taxi",
84
+ "29": "email_sendemail",
85
+ "3": "weather_query",
86
+ "30": "general_joke",
87
+ "31": "qa_maths",
88
+ "32": "social_post",
89
+ "33": "transport_ticket",
90
+ "34": "cooking_recipe",
91
+ "35": "music_settings",
92
+ "36": "calendar_remove",
93
+ "37": "iot_wemo_on",
94
+ "38": "iot_hue_lightchange",
95
+ "39": "play_radio",
96
+ "4": "iot_coffee",
97
+ "40": "email_querycontact",
98
+ "41": "transport_traffic",
99
+ "42": "qa_currency",
100
+ "43": "datetime_query",
101
+ "44": "iot_hue_lightoff",
102
+ "45": "takeaway_query",
103
+ "46": "lists_createoradd",
104
+ "47": "music_query",
105
+ "48": "recommendation_locations",
106
+ "49": "lists_query",
107
+ "5": "wavs/audio_volume_mute",
108
+ "50": "recommendation_movies",
109
+ "51": "iot_wemo_off",
110
+ "52": "iot_hue_lighton",
111
+ "53": "sendemail",
112
+ "54": "wavs/audio_volume_down",
113
+ "55": "play_game",
114
+ "56": "music",
115
+ "57": "datetime_convert",
116
+ "58": "iot_hue_lightdim",
117
+ "59": "query",
118
+ "6": "lists_remove",
119
+ "60": "createoradd",
120
+ "61": "music_dislikeness",
121
+ "62": "podcasts",
122
+ "63": "joke",
123
+ "64": "set",
124
+ "65": "hue_lightup",
125
+ "66": "factoid",
126
+ "67": "wavs/audio_volume_other",
127
+ "68": "hue_lightoff",
128
+ "69": "quirky",
129
+ "7": "email_query",
130
+ "70": "querycontact",
131
+ "71": "radio",
132
+ "72": "addcontact",
133
+ "73": "greet",
134
+ "74": "ticket",
135
+ "75": "traffic",
136
+ "76": "cooking_query",
137
+ "77": "remove",
138
+ "78": "currency",
139
+ "79": "coffee",
140
+ "8": "alarm_set",
141
+ "80": "game",
142
+ "81": "wemo_on",
143
+ "82": "definition",
144
+ "83": "events",
145
+ "84": "post",
146
+ "85": "hue_lightdim",
147
+ "86": "convert",
148
+ "87": "wemo_off",
149
+ "88": "cleaning",
150
+ "89": "settings",
151
+ "9": "alarm_query",
152
+ "90": "volume_other"
153
+ },
154
+ "initializer_range": 0.02,
155
+ "intermediate_size": 4096,
156
+ "label2id": {
157
+ "addcontact": "72",
158
+ "alarm_query": "9",
159
+ "alarm_remove": "14",
160
+ "alarm_set": "8",
161
+ "calendar_query": "19",
162
+ "calendar_remove": "36",
163
+ "calendar_set": "0",
164
+ "cleaning": "88",
165
+ "coffee": "79",
166
+ "convert": "86",
167
+ "cooking_query": "76",
168
+ "cooking_recipe": "34",
169
+ "createoradd": "60",
170
+ "currency": "78",
171
+ "datetime_convert": "57",
172
+ "datetime_query": "43",
173
+ "definition": "82",
174
+ "email_addcontact": "17",
175
+ "email_query": "7",
176
+ "email_querycontact": "40",
177
+ "email_sendemail": "29",
178
+ "events": "83",
179
+ "factoid": "66",
180
+ "game": "80",
181
+ "general_greet": "26",
182
+ "general_joke": "30",
183
+ "general_quirky": "21",
184
+ "greet": "73",
185
+ "hue_lightdim": "85",
186
+ "hue_lightoff": "68",
187
+ "hue_lightup": "65",
188
+ "iot_cleaning": "25",
189
+ "iot_coffee": "4",
190
+ "iot_hue_lightchange": "38",
191
+ "iot_hue_lightdim": "58",
192
+ "iot_hue_lightoff": "44",
193
+ "iot_hue_lighton": "52",
194
+ "iot_hue_lightup": "2",
195
+ "iot_wemo_off": "51",
196
+ "iot_wemo_on": "37",
197
+ "joke": "63",
198
+ "lists_createoradd": "46",
199
+ "lists_query": "49",
200
+ "lists_remove": "6",
201
+ "music": "56",
202
+ "music_dislikeness": "61",
203
+ "music_likeness": "20",
204
+ "music_query": "47",
205
+ "music_settings": "35",
206
+ "news_query": "18",
207
+ "play_game": "55",
208
+ "play_music": "11",
209
+ "play_podcasts": "15",
210
+ "play_radio": "39",
211
+ "play_wavs/audiobook": "24",
212
+ "podcasts": "62",
213
+ "post": "84",
214
+ "qa_currency": "42",
215
+ "qa_definition": "13",
216
+ "qa_factoid": "22",
217
+ "qa_maths": "31",
218
+ "qa_stock": "10",
219
+ "query": "59",
220
+ "querycontact": "70",
221
+ "quirky": "69",
222
+ "radio": "71",
223
+ "recommendation_events": "12",
224
+ "recommendation_locations": "48",
225
+ "recommendation_movies": "50",
226
+ "remove": "77",
227
+ "sendemail": "53",
228
+ "set": "64",
229
+ "settings": "89",
230
+ "social_post": "32",
231
+ "social_query": "16",
232
+ "takeaway_order": "23",
233
+ "takeaway_query": "45",
234
+ "ticket": "74",
235
+ "traffic": "75",
236
+ "transport_query": "27",
237
+ "transport_taxi": "28",
238
+ "transport_ticket": "33",
239
+ "transport_traffic": "41",
240
+ "volume_other": "90",
241
+ "wavs/audio_volume_down": "54",
242
+ "wavs/audio_volume_mute": "5",
243
+ "wavs/audio_volume_other": "67",
244
+ "wavs/audio_volume_up": "1",
245
+ "weather_query": "3",
246
+ "wemo_off": "87",
247
+ "wemo_on": "81"
248
+ },
249
+ "layer_norm_eps": 1e-05,
250
+ "layerdrop": 0.1,
251
+ "mask_feature_length": 10,
252
+ "mask_feature_min_masks": 0,
253
+ "mask_feature_prob": 0.0,
254
+ "mask_time_length": 10,
255
+ "mask_time_min_masks": 2,
256
+ "mask_time_prob": 0.05,
257
+ "model_type": "wav2vec2",
258
+ "num_adapter_layers": 3,
259
+ "num_attention_heads": 16,
260
+ "num_codevector_groups": 2,
261
+ "num_codevectors_per_group": 320,
262
+ "num_conv_pos_embedding_groups": 16,
263
+ "num_conv_pos_embeddings": 128,
264
+ "num_feat_extract_layers": 7,
265
+ "num_hidden_layers": 24,
266
+ "num_negatives": 100,
267
+ "output_hidden_size": 1024,
268
+ "pad_token_id": 0,
269
+ "proj_codevector_dim": 256,
270
+ "tdnn_dilation": [
271
+ 1,
272
+ 2,
273
+ 3,
274
+ 1,
275
+ 1
276
+ ],
277
+ "tdnn_dim": [
278
+ 512,
279
+ 512,
280
+ 512,
281
+ 512,
282
+ 1500
283
+ ],
284
+ "tdnn_kernel": [
285
+ 5,
286
+ 3,
287
+ 3,
288
+ 1,
289
+ 1
290
+ ],
291
+ "torch_dtype": "float32",
292
+ "transformers_version": "4.45.2",
293
+ "use_weighted_layer_sum": false,
294
+ "vocab_size": 32,
295
+ "xvector_output_dim": 512
296
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c489d880e03d06dc00b5272bac80748e203e42508bda2332d59f1e3233119d38
3
+ size 1262950804
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0cabc3951b5d157753d58f27b7a7de2d0dc27a03c2c999d6240a522e19e6073
3
+ size 2526152656
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e570cd5f9d55b7177de3ebb2d7cae79bc4c203b31bae473d44da24a40539c0aa
3
+ size 14308
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:417c32b40ce266eae8ab814d0af12409a7770b12ed2c5afd1b1ffbf9fa27b42f
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,1308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8005753739930955,
3
+ "best_model_checkpoint": "results/facebook/wav2vec2-large-960h-lv60-self/42/_retain/checkpoint-30000",
4
+ "epoch": 75.80543272267846,
5
+ "eval_steps": 400,
6
+ "global_step": 30000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.010739102969046,
13
+ "grad_norm": 3.2389800548553467,
14
+ "learning_rate": 6.666666666666667e-05,
15
+ "loss": 4.0919,
16
+ "step": 400
17
+ },
18
+ {
19
+ "epoch": 1.010739102969046,
20
+ "eval_accuracy": 0.10586881472957423,
21
+ "eval_f1_macro": 0.008128718856806105,
22
+ "eval_loss": 3.68546199798584,
23
+ "eval_runtime": 133.4265,
24
+ "eval_samples_per_second": 65.129,
25
+ "eval_steps_per_second": 2.039,
26
+ "step": 400
27
+ },
28
+ {
29
+ "epoch": 2.021478205938092,
30
+ "grad_norm": 5.276159286499023,
31
+ "learning_rate": 0.00013333333333333334,
32
+ "loss": 2.9391,
33
+ "step": 800
34
+ },
35
+ {
36
+ "epoch": 2.021478205938092,
37
+ "eval_accuracy": 0.5268124280782509,
38
+ "eval_f1_macro": 0.2773414885221941,
39
+ "eval_loss": 1.907711386680603,
40
+ "eval_runtime": 132.9453,
41
+ "eval_samples_per_second": 65.365,
42
+ "eval_steps_per_second": 2.046,
43
+ "step": 800
44
+ },
45
+ {
46
+ "epoch": 3.0322173089071383,
47
+ "grad_norm": 5.944188117980957,
48
+ "learning_rate": 0.0002,
49
+ "loss": 1.583,
50
+ "step": 1200
51
+ },
52
+ {
53
+ "epoch": 3.0322173089071383,
54
+ "eval_accuracy": 0.6894131185270426,
55
+ "eval_f1_macro": 0.48707209156248815,
56
+ "eval_loss": 1.2798452377319336,
57
+ "eval_runtime": 133.055,
58
+ "eval_samples_per_second": 65.311,
59
+ "eval_steps_per_second": 2.044,
60
+ "step": 1200
61
+ },
62
+ {
63
+ "epoch": 4.042956411876184,
64
+ "grad_norm": 6.609740257263184,
65
+ "learning_rate": 0.0002666666666666667,
66
+ "loss": 1.0089,
67
+ "step": 1600
68
+ },
69
+ {
70
+ "epoch": 4.042956411876184,
71
+ "eval_accuracy": 0.7447640966628308,
72
+ "eval_f1_macro": 0.5630866427455141,
73
+ "eval_loss": 1.1743698120117188,
74
+ "eval_runtime": 132.7655,
75
+ "eval_samples_per_second": 65.454,
76
+ "eval_steps_per_second": 2.049,
77
+ "step": 1600
78
+ },
79
+ {
80
+ "epoch": 5.053695514845231,
81
+ "grad_norm": 9.530195236206055,
82
+ "learning_rate": 0.0003333333333333333,
83
+ "loss": 0.7348,
84
+ "step": 2000
85
+ },
86
+ {
87
+ "epoch": 5.053695514845231,
88
+ "eval_accuracy": 0.7604142692750288,
89
+ "eval_f1_macro": 0.5961285021365654,
90
+ "eval_loss": 1.1527246236801147,
91
+ "eval_runtime": 127.8747,
92
+ "eval_samples_per_second": 67.957,
93
+ "eval_steps_per_second": 2.127,
94
+ "step": 2000
95
+ },
96
+ {
97
+ "epoch": 6.0644346178142765,
98
+ "grad_norm": 6.680343151092529,
99
+ "learning_rate": 0.0004,
100
+ "loss": 0.5957,
101
+ "step": 2400
102
+ },
103
+ {
104
+ "epoch": 6.0644346178142765,
105
+ "eval_accuracy": 0.7676639815880322,
106
+ "eval_f1_macro": 0.6054951189790404,
107
+ "eval_loss": 1.198480248451233,
108
+ "eval_runtime": 128.28,
109
+ "eval_samples_per_second": 67.742,
110
+ "eval_steps_per_second": 2.12,
111
+ "step": 2400
112
+ },
113
+ {
114
+ "epoch": 7.075173720783323,
115
+ "grad_norm": 3.774092435836792,
116
+ "learning_rate": 0.00046666666666666666,
117
+ "loss": 0.521,
118
+ "step": 2800
119
+ },
120
+ {
121
+ "epoch": 7.075173720783323,
122
+ "eval_accuracy": 0.7630609896432681,
123
+ "eval_f1_macro": 0.5903658522565237,
124
+ "eval_loss": 1.1921718120574951,
125
+ "eval_runtime": 128.2033,
126
+ "eval_samples_per_second": 67.783,
127
+ "eval_steps_per_second": 2.122,
128
+ "step": 2800
129
+ },
130
+ {
131
+ "epoch": 8.085912823752368,
132
+ "grad_norm": 3.719675302505493,
133
+ "learning_rate": 0.0004962962962962963,
134
+ "loss": 0.4667,
135
+ "step": 3200
136
+ },
137
+ {
138
+ "epoch": 8.085912823752368,
139
+ "eval_accuracy": 0.7619102416570771,
140
+ "eval_f1_macro": 0.6061718024259425,
141
+ "eval_loss": 1.2508888244628906,
142
+ "eval_runtime": 109.5839,
143
+ "eval_samples_per_second": 79.3,
144
+ "eval_steps_per_second": 2.482,
145
+ "step": 3200
146
+ },
147
+ {
148
+ "epoch": 9.096651926721416,
149
+ "grad_norm": 3.703678607940674,
150
+ "learning_rate": 0.0004888888888888889,
151
+ "loss": 0.3861,
152
+ "step": 3600
153
+ },
154
+ {
155
+ "epoch": 9.096651926721416,
156
+ "eval_accuracy": 0.7640966628308401,
157
+ "eval_f1_macro": 0.5910106640214171,
158
+ "eval_loss": 1.2851234674453735,
159
+ "eval_runtime": 109.2588,
160
+ "eval_samples_per_second": 79.536,
161
+ "eval_steps_per_second": 2.49,
162
+ "step": 3600
163
+ },
164
+ {
165
+ "epoch": 10.107391029690461,
166
+ "grad_norm": 5.4869585037231445,
167
+ "learning_rate": 0.00048148148148148144,
168
+ "loss": 0.32,
169
+ "step": 4000
170
+ },
171
+ {
172
+ "epoch": 10.107391029690461,
173
+ "eval_accuracy": 0.7590333716915996,
174
+ "eval_f1_macro": 0.5804751345832923,
175
+ "eval_loss": 1.4432213306427002,
176
+ "eval_runtime": 109.3455,
177
+ "eval_samples_per_second": 79.473,
178
+ "eval_steps_per_second": 2.488,
179
+ "step": 4000
180
+ },
181
+ {
182
+ "epoch": 11.118130132659507,
183
+ "grad_norm": 2.1531548500061035,
184
+ "learning_rate": 0.0004740740740740741,
185
+ "loss": 0.2828,
186
+ "step": 4400
187
+ },
188
+ {
189
+ "epoch": 11.118130132659507,
190
+ "eval_accuracy": 0.7590333716915996,
191
+ "eval_f1_macro": 0.6021086310983942,
192
+ "eval_loss": 1.3173363208770752,
193
+ "eval_runtime": 109.3574,
194
+ "eval_samples_per_second": 79.464,
195
+ "eval_steps_per_second": 2.487,
196
+ "step": 4400
197
+ },
198
+ {
199
+ "epoch": 12.128869235628553,
200
+ "grad_norm": 2.9061076641082764,
201
+ "learning_rate": 0.00046666666666666666,
202
+ "loss": 0.2367,
203
+ "step": 4800
204
+ },
205
+ {
206
+ "epoch": 12.128869235628553,
207
+ "eval_accuracy": 0.7543153049482163,
208
+ "eval_f1_macro": 0.6092446104843484,
209
+ "eval_loss": 1.4384377002716064,
210
+ "eval_runtime": 109.3136,
211
+ "eval_samples_per_second": 79.496,
212
+ "eval_steps_per_second": 2.488,
213
+ "step": 4800
214
+ },
215
+ {
216
+ "epoch": 13.139608338597599,
217
+ "grad_norm": 2.8866333961486816,
218
+ "learning_rate": 0.00045925925925925925,
219
+ "loss": 0.2187,
220
+ "step": 5200
221
+ },
222
+ {
223
+ "epoch": 13.139608338597599,
224
+ "eval_accuracy": 0.7654775604142693,
225
+ "eval_f1_macro": 0.5880603922791815,
226
+ "eval_loss": 1.4380950927734375,
227
+ "eval_runtime": 109.4554,
228
+ "eval_samples_per_second": 79.393,
229
+ "eval_steps_per_second": 2.485,
230
+ "step": 5200
231
+ },
232
+ {
233
+ "epoch": 14.150347441566646,
234
+ "grad_norm": 1.7574183940887451,
235
+ "learning_rate": 0.00045185185185185183,
236
+ "loss": 0.1847,
237
+ "step": 5600
238
+ },
239
+ {
240
+ "epoch": 14.150347441566646,
241
+ "eval_accuracy": 0.7730724971231301,
242
+ "eval_f1_macro": 0.5690127519635726,
243
+ "eval_loss": 1.4231289625167847,
244
+ "eval_runtime": 109.3887,
245
+ "eval_samples_per_second": 79.441,
246
+ "eval_steps_per_second": 2.487,
247
+ "step": 5600
248
+ },
249
+ {
250
+ "epoch": 15.161086544535692,
251
+ "grad_norm": 1.8373284339904785,
252
+ "learning_rate": 0.0004444444444444444,
253
+ "loss": 0.1701,
254
+ "step": 6000
255
+ },
256
+ {
257
+ "epoch": 15.161086544535692,
258
+ "eval_accuracy": 0.7680092059838896,
259
+ "eval_f1_macro": 0.5878361109327175,
260
+ "eval_loss": 1.5120900869369507,
261
+ "eval_runtime": 109.6944,
262
+ "eval_samples_per_second": 79.22,
263
+ "eval_steps_per_second": 2.48,
264
+ "step": 6000
265
+ },
266
+ {
267
+ "epoch": 16.171825647504736,
268
+ "grad_norm": 2.9617397785186768,
269
+ "learning_rate": 0.00043703703703703705,
270
+ "loss": 0.1504,
271
+ "step": 6400
272
+ },
273
+ {
274
+ "epoch": 16.171825647504736,
275
+ "eval_accuracy": 0.7609896432681242,
276
+ "eval_f1_macro": 0.6017434401264726,
277
+ "eval_loss": 1.5701994895935059,
278
+ "eval_runtime": 108.7867,
279
+ "eval_samples_per_second": 79.881,
280
+ "eval_steps_per_second": 2.5,
281
+ "step": 6400
282
+ },
283
+ {
284
+ "epoch": 17.182564750473784,
285
+ "grad_norm": 1.9067094326019287,
286
+ "learning_rate": 0.00042962962962962963,
287
+ "loss": 0.1416,
288
+ "step": 6800
289
+ },
290
+ {
291
+ "epoch": 17.182564750473784,
292
+ "eval_accuracy": 0.7680092059838896,
293
+ "eval_f1_macro": 0.5846132297229183,
294
+ "eval_loss": 1.6262372732162476,
295
+ "eval_runtime": 109.3355,
296
+ "eval_samples_per_second": 79.48,
297
+ "eval_steps_per_second": 2.488,
298
+ "step": 6800
299
+ },
300
+ {
301
+ "epoch": 18.19330385344283,
302
+ "grad_norm": 1.788485050201416,
303
+ "learning_rate": 0.0004222222222222222,
304
+ "loss": 0.1345,
305
+ "step": 7200
306
+ },
307
+ {
308
+ "epoch": 18.19330385344283,
309
+ "eval_accuracy": 0.7582278481012659,
310
+ "eval_f1_macro": 0.606730101292868,
311
+ "eval_loss": 1.6317014694213867,
312
+ "eval_runtime": 109.1193,
313
+ "eval_samples_per_second": 79.638,
314
+ "eval_steps_per_second": 2.493,
315
+ "step": 7200
316
+ },
317
+ {
318
+ "epoch": 19.204042956411875,
319
+ "grad_norm": 3.0378000736236572,
320
+ "learning_rate": 0.0004148148148148148,
321
+ "loss": 0.1226,
322
+ "step": 7600
323
+ },
324
+ {
325
+ "epoch": 19.204042956411875,
326
+ "eval_accuracy": 0.7739930955120828,
327
+ "eval_f1_macro": 0.6193094447560485,
328
+ "eval_loss": 1.486433982849121,
329
+ "eval_runtime": 109.0558,
330
+ "eval_samples_per_second": 79.684,
331
+ "eval_steps_per_second": 2.494,
332
+ "step": 7600
333
+ },
334
+ {
335
+ "epoch": 20.214782059380923,
336
+ "grad_norm": 3.1991524696350098,
337
+ "learning_rate": 0.0004074074074074074,
338
+ "loss": 0.114,
339
+ "step": 8000
340
+ },
341
+ {
342
+ "epoch": 20.214782059380923,
343
+ "eval_accuracy": 0.774108170310702,
344
+ "eval_f1_macro": 0.6157091732739274,
345
+ "eval_loss": 1.5931099653244019,
346
+ "eval_runtime": 109.0943,
347
+ "eval_samples_per_second": 79.656,
348
+ "eval_steps_per_second": 2.493,
349
+ "step": 8000
350
+ },
351
+ {
352
+ "epoch": 21.225521162349967,
353
+ "grad_norm": 2.1036899089813232,
354
+ "learning_rate": 0.0004,
355
+ "loss": 0.1064,
356
+ "step": 8400
357
+ },
358
+ {
359
+ "epoch": 21.225521162349967,
360
+ "eval_accuracy": 0.7730724971231301,
361
+ "eval_f1_macro": 0.6020232192562277,
362
+ "eval_loss": 1.7101207971572876,
363
+ "eval_runtime": 108.899,
364
+ "eval_samples_per_second": 79.799,
365
+ "eval_steps_per_second": 2.498,
366
+ "step": 8400
367
+ },
368
+ {
369
+ "epoch": 22.236260265319014,
370
+ "grad_norm": 2.786360025405884,
371
+ "learning_rate": 0.0003925925925925926,
372
+ "loss": 0.1009,
373
+ "step": 8800
374
+ },
375
+ {
376
+ "epoch": 22.236260265319014,
377
+ "eval_accuracy": 0.7655926352128883,
378
+ "eval_f1_macro": 0.5794753743607411,
379
+ "eval_loss": 1.6664392948150635,
380
+ "eval_runtime": 109.2502,
381
+ "eval_samples_per_second": 79.542,
382
+ "eval_steps_per_second": 2.49,
383
+ "step": 8800
384
+ },
385
+ {
386
+ "epoch": 23.246999368288062,
387
+ "grad_norm": 1.0751720666885376,
388
+ "learning_rate": 0.0003851851851851852,
389
+ "loss": 0.0941,
390
+ "step": 9200
391
+ },
392
+ {
393
+ "epoch": 23.246999368288062,
394
+ "eval_accuracy": 0.7772151898734178,
395
+ "eval_f1_macro": 0.5717636011134882,
396
+ "eval_loss": 1.5253993272781372,
397
+ "eval_runtime": 109.0143,
398
+ "eval_samples_per_second": 79.714,
399
+ "eval_steps_per_second": 2.495,
400
+ "step": 9200
401
+ },
402
+ {
403
+ "epoch": 24.257738471257106,
404
+ "grad_norm": 1.744019865989685,
405
+ "learning_rate": 0.00037777777777777777,
406
+ "loss": 0.0861,
407
+ "step": 9600
408
+ },
409
+ {
410
+ "epoch": 24.257738471257106,
411
+ "eval_accuracy": 0.777445339470656,
412
+ "eval_f1_macro": 0.625140306336925,
413
+ "eval_loss": 1.6324084997177124,
414
+ "eval_runtime": 108.6336,
415
+ "eval_samples_per_second": 79.994,
416
+ "eval_steps_per_second": 2.504,
417
+ "step": 9600
418
+ },
419
+ {
420
+ "epoch": 25.268477574226154,
421
+ "grad_norm": 1.838752269744873,
422
+ "learning_rate": 0.00037037037037037035,
423
+ "loss": 0.0807,
424
+ "step": 10000
425
+ },
426
+ {
427
+ "epoch": 25.268477574226154,
428
+ "eval_accuracy": 0.7728423475258919,
429
+ "eval_f1_macro": 0.5870939911644882,
430
+ "eval_loss": 1.7057673931121826,
431
+ "eval_runtime": 108.6842,
432
+ "eval_samples_per_second": 79.956,
433
+ "eval_steps_per_second": 2.503,
434
+ "step": 10000
435
+ },
436
+ {
437
+ "epoch": 26.279216677195198,
438
+ "grad_norm": 2.3391871452331543,
439
+ "learning_rate": 0.000362962962962963,
440
+ "loss": 0.0739,
441
+ "step": 10400
442
+ },
443
+ {
444
+ "epoch": 26.279216677195198,
445
+ "eval_accuracy": 0.774108170310702,
446
+ "eval_f1_macro": 0.6190123341706849,
447
+ "eval_loss": 1.6950148344039917,
448
+ "eval_runtime": 108.9167,
449
+ "eval_samples_per_second": 79.786,
450
+ "eval_steps_per_second": 2.497,
451
+ "step": 10400
452
+ },
453
+ {
454
+ "epoch": 27.289955780164245,
455
+ "grad_norm": 1.3197505474090576,
456
+ "learning_rate": 0.00035555555555555557,
457
+ "loss": 0.0685,
458
+ "step": 10800
459
+ },
460
+ {
461
+ "epoch": 27.289955780164245,
462
+ "eval_accuracy": 0.7652474108170311,
463
+ "eval_f1_macro": 0.5984200620053731,
464
+ "eval_loss": 1.8148038387298584,
465
+ "eval_runtime": 108.998,
466
+ "eval_samples_per_second": 79.726,
467
+ "eval_steps_per_second": 2.495,
468
+ "step": 10800
469
+ },
470
+ {
471
+ "epoch": 28.300694883133293,
472
+ "grad_norm": 0.8027063608169556,
473
+ "learning_rate": 0.00034814814814814816,
474
+ "loss": 0.0692,
475
+ "step": 11200
476
+ },
477
+ {
478
+ "epoch": 28.300694883133293,
479
+ "eval_accuracy": 0.776409666283084,
480
+ "eval_f1_macro": 0.6002766778970904,
481
+ "eval_loss": 1.6219606399536133,
482
+ "eval_runtime": 108.9613,
483
+ "eval_samples_per_second": 79.753,
484
+ "eval_steps_per_second": 2.496,
485
+ "step": 11200
486
+ },
487
+ {
488
+ "epoch": 29.311433986102337,
489
+ "grad_norm": 0.8713662028312683,
490
+ "learning_rate": 0.00034074074074074074,
491
+ "loss": 0.0662,
492
+ "step": 11600
493
+ },
494
+ {
495
+ "epoch": 29.311433986102337,
496
+ "eval_accuracy": 0.7794016110471806,
497
+ "eval_f1_macro": 0.6123819840203646,
498
+ "eval_loss": 1.6953762769699097,
499
+ "eval_runtime": 109.1585,
500
+ "eval_samples_per_second": 79.609,
501
+ "eval_steps_per_second": 2.492,
502
+ "step": 11600
503
+ },
504
+ {
505
+ "epoch": 30.322173089071384,
506
+ "grad_norm": 0.9094525575637817,
507
+ "learning_rate": 0.0003333333333333333,
508
+ "loss": 0.0639,
509
+ "step": 12000
510
+ },
511
+ {
512
+ "epoch": 30.322173089071384,
513
+ "eval_accuracy": 0.7785960874568469,
514
+ "eval_f1_macro": 0.5900178041075752,
515
+ "eval_loss": 1.7562154531478882,
516
+ "eval_runtime": 108.917,
517
+ "eval_samples_per_second": 79.786,
518
+ "eval_steps_per_second": 2.497,
519
+ "step": 12000
520
+ },
521
+ {
522
+ "epoch": 31.33291219204043,
523
+ "grad_norm": 2.3824515342712402,
524
+ "learning_rate": 0.00032592592592592596,
525
+ "loss": 0.0613,
526
+ "step": 12400
527
+ },
528
+ {
529
+ "epoch": 31.33291219204043,
530
+ "eval_accuracy": 0.7708860759493671,
531
+ "eval_f1_macro": 0.5886611331241638,
532
+ "eval_loss": 1.7263332605361938,
533
+ "eval_runtime": 109.2037,
534
+ "eval_samples_per_second": 79.576,
535
+ "eval_steps_per_second": 2.491,
536
+ "step": 12400
537
+ },
538
+ {
539
+ "epoch": 32.34365129500947,
540
+ "grad_norm": 1.1265066862106323,
541
+ "learning_rate": 0.00031851851851851854,
542
+ "loss": 0.0562,
543
+ "step": 12800
544
+ },
545
+ {
546
+ "epoch": 32.34365129500947,
547
+ "eval_accuracy": 0.777445339470656,
548
+ "eval_f1_macro": 0.6069323146272442,
549
+ "eval_loss": 1.595489263534546,
550
+ "eval_runtime": 110.1086,
551
+ "eval_samples_per_second": 78.922,
552
+ "eval_steps_per_second": 2.47,
553
+ "step": 12800
554
+ },
555
+ {
556
+ "epoch": 33.35439039797852,
557
+ "grad_norm": 0.765870988368988,
558
+ "learning_rate": 0.0003111111111111111,
559
+ "loss": 0.0482,
560
+ "step": 13200
561
+ },
562
+ {
563
+ "epoch": 33.35439039797852,
564
+ "eval_accuracy": 0.7858457997698504,
565
+ "eval_f1_macro": 0.6152260699722518,
566
+ "eval_loss": 1.6528053283691406,
567
+ "eval_runtime": 109.0363,
568
+ "eval_samples_per_second": 79.698,
569
+ "eval_steps_per_second": 2.495,
570
+ "step": 13200
571
+ },
572
+ {
573
+ "epoch": 34.36512950094757,
574
+ "grad_norm": 2.386359930038452,
575
+ "learning_rate": 0.0003037037037037037,
576
+ "loss": 0.0516,
577
+ "step": 13600
578
+ },
579
+ {
580
+ "epoch": 34.36512950094757,
581
+ "eval_accuracy": 0.7713463751438435,
582
+ "eval_f1_macro": 0.5894778786253475,
583
+ "eval_loss": 1.65277099609375,
584
+ "eval_runtime": 109.1673,
585
+ "eval_samples_per_second": 79.603,
586
+ "eval_steps_per_second": 2.492,
587
+ "step": 13600
588
+ },
589
+ {
590
+ "epoch": 35.375868603916615,
591
+ "grad_norm": 1.8987774848937988,
592
+ "learning_rate": 0.0002962962962962963,
593
+ "loss": 0.0447,
594
+ "step": 14000
595
+ },
596
+ {
597
+ "epoch": 35.375868603916615,
598
+ "eval_accuracy": 0.7799769850402761,
599
+ "eval_f1_macro": 0.6297477374058172,
600
+ "eval_loss": 1.813390851020813,
601
+ "eval_runtime": 109.6977,
602
+ "eval_samples_per_second": 79.218,
603
+ "eval_steps_per_second": 2.48,
604
+ "step": 14000
605
+ },
606
+ {
607
+ "epoch": 36.38660770688566,
608
+ "grad_norm": 1.353411078453064,
609
+ "learning_rate": 0.0002888888888888889,
610
+ "loss": 0.047,
611
+ "step": 14400
612
+ },
613
+ {
614
+ "epoch": 36.38660770688566,
615
+ "eval_accuracy": 0.7795166858457998,
616
+ "eval_f1_macro": 0.5795862617467612,
617
+ "eval_loss": 1.663203477859497,
618
+ "eval_runtime": 109.0323,
619
+ "eval_samples_per_second": 79.701,
620
+ "eval_steps_per_second": 2.495,
621
+ "step": 14400
622
+ },
623
+ {
624
+ "epoch": 37.3973468098547,
625
+ "grad_norm": 1.1114296913146973,
626
+ "learning_rate": 0.0002814814814814815,
627
+ "loss": 0.0436,
628
+ "step": 14800
629
+ },
630
+ {
631
+ "epoch": 37.3973468098547,
632
+ "eval_accuracy": 0.784234752589183,
633
+ "eval_f1_macro": 0.5995152264247978,
634
+ "eval_loss": 1.783818006515503,
635
+ "eval_runtime": 109.4106,
636
+ "eval_samples_per_second": 79.426,
637
+ "eval_steps_per_second": 2.486,
638
+ "step": 14800
639
+ },
640
+ {
641
+ "epoch": 38.40808591282375,
642
+ "grad_norm": 1.3422303199768066,
643
+ "learning_rate": 0.0002740740740740741,
644
+ "loss": 0.0422,
645
+ "step": 15200
646
+ },
647
+ {
648
+ "epoch": 38.40808591282375,
649
+ "eval_accuracy": 0.7838895281933257,
650
+ "eval_f1_macro": 0.6189287691248615,
651
+ "eval_loss": 1.7172709703445435,
652
+ "eval_runtime": 108.6629,
653
+ "eval_samples_per_second": 79.972,
654
+ "eval_steps_per_second": 2.503,
655
+ "step": 15200
656
+ },
657
+ {
658
+ "epoch": 39.4188250157928,
659
+ "grad_norm": 1.8279023170471191,
660
+ "learning_rate": 0.0002666666666666667,
661
+ "loss": 0.0377,
662
+ "step": 15600
663
+ },
664
+ {
665
+ "epoch": 39.4188250157928,
666
+ "eval_accuracy": 0.7834292289988493,
667
+ "eval_f1_macro": 0.5814739153081228,
668
+ "eval_loss": 1.7523770332336426,
669
+ "eval_runtime": 108.9839,
670
+ "eval_samples_per_second": 79.737,
671
+ "eval_steps_per_second": 2.496,
672
+ "step": 15600
673
+ },
674
+ {
675
+ "epoch": 40.429564118761846,
676
+ "grad_norm": 2.154459238052368,
677
+ "learning_rate": 0.00025925925925925926,
678
+ "loss": 0.0359,
679
+ "step": 16000
680
+ },
681
+ {
682
+ "epoch": 40.429564118761846,
683
+ "eval_accuracy": 0.7886075949367088,
684
+ "eval_f1_macro": 0.6293741181702724,
685
+ "eval_loss": 1.623598337173462,
686
+ "eval_runtime": 108.8195,
687
+ "eval_samples_per_second": 79.857,
688
+ "eval_steps_per_second": 2.5,
689
+ "step": 16000
690
+ },
691
+ {
692
+ "epoch": 41.44030322173089,
693
+ "grad_norm": 0.8551483154296875,
694
+ "learning_rate": 0.00025185185185185185,
695
+ "loss": 0.0344,
696
+ "step": 16400
697
+ },
698
+ {
699
+ "epoch": 41.44030322173089,
700
+ "eval_accuracy": 0.7815880322209436,
701
+ "eval_f1_macro": 0.6087804648227756,
702
+ "eval_loss": 1.7353272438049316,
703
+ "eval_runtime": 109.2273,
704
+ "eval_samples_per_second": 79.559,
705
+ "eval_steps_per_second": 2.49,
706
+ "step": 16400
707
+ },
708
+ {
709
+ "epoch": 42.451042324699934,
710
+ "grad_norm": 0.5178919434547424,
711
+ "learning_rate": 0.00024444444444444443,
712
+ "loss": 0.033,
713
+ "step": 16800
714
+ },
715
+ {
716
+ "epoch": 42.451042324699934,
717
+ "eval_accuracy": 0.7820483314154201,
718
+ "eval_f1_macro": 0.6001569016578011,
719
+ "eval_loss": 1.727620244026184,
720
+ "eval_runtime": 109.4385,
721
+ "eval_samples_per_second": 79.405,
722
+ "eval_steps_per_second": 2.485,
723
+ "step": 16800
724
+ },
725
+ {
726
+ "epoch": 43.46178142766898,
727
+ "grad_norm": 0.4940205514431,
728
+ "learning_rate": 0.00023703703703703704,
729
+ "loss": 0.0325,
730
+ "step": 17200
731
+ },
732
+ {
733
+ "epoch": 43.46178142766898,
734
+ "eval_accuracy": 0.7783659378596087,
735
+ "eval_f1_macro": 0.6283289368126677,
736
+ "eval_loss": 1.7798371315002441,
737
+ "eval_runtime": 109.2576,
738
+ "eval_samples_per_second": 79.537,
739
+ "eval_steps_per_second": 2.49,
740
+ "step": 17200
741
+ },
742
+ {
743
+ "epoch": 44.47252053063803,
744
+ "grad_norm": 0.8661497235298157,
745
+ "learning_rate": 0.00022962962962962962,
746
+ "loss": 0.0302,
747
+ "step": 17600
748
+ },
749
+ {
750
+ "epoch": 44.47252053063803,
751
+ "eval_accuracy": 0.7828538550057538,
752
+ "eval_f1_macro": 0.6164776778280789,
753
+ "eval_loss": 1.7507109642028809,
754
+ "eval_runtime": 109.1869,
755
+ "eval_samples_per_second": 79.588,
756
+ "eval_steps_per_second": 2.491,
757
+ "step": 17600
758
+ },
759
+ {
760
+ "epoch": 45.48325963360708,
761
+ "grad_norm": 0.015332411043345928,
762
+ "learning_rate": 0.0002222222222222222,
763
+ "loss": 0.0268,
764
+ "step": 18000
765
+ },
766
+ {
767
+ "epoch": 45.48325963360708,
768
+ "eval_accuracy": 0.7826237054085156,
769
+ "eval_f1_macro": 0.6031617249417177,
770
+ "eval_loss": 1.7825220823287964,
771
+ "eval_runtime": 109.3518,
772
+ "eval_samples_per_second": 79.468,
773
+ "eval_steps_per_second": 2.487,
774
+ "step": 18000
775
+ },
776
+ {
777
+ "epoch": 46.493998736576124,
778
+ "grad_norm": 0.5325392484664917,
779
+ "learning_rate": 0.00021481481481481482,
780
+ "loss": 0.0287,
781
+ "step": 18400
782
+ },
783
+ {
784
+ "epoch": 46.493998736576124,
785
+ "eval_accuracy": 0.7882623705408516,
786
+ "eval_f1_macro": 0.6256320010133759,
787
+ "eval_loss": 1.6932624578475952,
788
+ "eval_runtime": 108.513,
789
+ "eval_samples_per_second": 80.083,
790
+ "eval_steps_per_second": 2.507,
791
+ "step": 18400
792
+ },
793
+ {
794
+ "epoch": 47.504737839545164,
795
+ "grad_norm": 0.5086055994033813,
796
+ "learning_rate": 0.0002074074074074074,
797
+ "loss": 0.0252,
798
+ "step": 18800
799
+ },
800
+ {
801
+ "epoch": 47.504737839545164,
802
+ "eval_accuracy": 0.7856156501726121,
803
+ "eval_f1_macro": 0.6143416230351354,
804
+ "eval_loss": 1.7501070499420166,
805
+ "eval_runtime": 109.2365,
806
+ "eval_samples_per_second": 79.552,
807
+ "eval_steps_per_second": 2.49,
808
+ "step": 18800
809
+ },
810
+ {
811
+ "epoch": 48.51547694251421,
812
+ "grad_norm": 1.229317545890808,
813
+ "learning_rate": 0.0002,
814
+ "loss": 0.0283,
815
+ "step": 19200
816
+ },
817
+ {
818
+ "epoch": 48.51547694251421,
819
+ "eval_accuracy": 0.7843498273878021,
820
+ "eval_f1_macro": 0.6189575264715401,
821
+ "eval_loss": 1.9032423496246338,
822
+ "eval_runtime": 108.2906,
823
+ "eval_samples_per_second": 80.247,
824
+ "eval_steps_per_second": 2.512,
825
+ "step": 19200
826
+ },
827
+ {
828
+ "epoch": 49.52621604548326,
829
+ "grad_norm": 0.05275914818048477,
830
+ "learning_rate": 0.0001925925925925926,
831
+ "loss": 0.024,
832
+ "step": 19600
833
+ },
834
+ {
835
+ "epoch": 49.52621604548326,
836
+ "eval_accuracy": 0.7874568469505179,
837
+ "eval_f1_macro": 0.6393370936978522,
838
+ "eval_loss": 1.8691409826278687,
839
+ "eval_runtime": 108.1545,
840
+ "eval_samples_per_second": 80.348,
841
+ "eval_steps_per_second": 2.515,
842
+ "step": 19600
843
+ },
844
+ {
845
+ "epoch": 50.53695514845231,
846
+ "grad_norm": 0.9653208255767822,
847
+ "learning_rate": 0.00018518518518518518,
848
+ "loss": 0.0229,
849
+ "step": 20000
850
+ },
851
+ {
852
+ "epoch": 50.53695514845231,
853
+ "eval_accuracy": 0.786536248561565,
854
+ "eval_f1_macro": 0.6026385719720891,
855
+ "eval_loss": 1.7541390657424927,
856
+ "eval_runtime": 107.9085,
857
+ "eval_samples_per_second": 80.531,
858
+ "eval_steps_per_second": 2.521,
859
+ "step": 20000
860
+ },
861
+ {
862
+ "epoch": 51.547694251421355,
863
+ "grad_norm": 0.4658529758453369,
864
+ "learning_rate": 0.00017777777777777779,
865
+ "loss": 0.0219,
866
+ "step": 20400
867
+ },
868
+ {
869
+ "epoch": 51.547694251421355,
870
+ "eval_accuracy": 0.7872266973532797,
871
+ "eval_f1_macro": 0.6309747652348119,
872
+ "eval_loss": 1.7537351846694946,
873
+ "eval_runtime": 107.7743,
874
+ "eval_samples_per_second": 80.632,
875
+ "eval_steps_per_second": 2.524,
876
+ "step": 20400
877
+ },
878
+ {
879
+ "epoch": 52.558433354390395,
880
+ "grad_norm": 0.32756420969963074,
881
+ "learning_rate": 0.00017037037037037037,
882
+ "loss": 0.0211,
883
+ "step": 20800
884
+ },
885
+ {
886
+ "epoch": 52.558433354390395,
887
+ "eval_accuracy": 0.7934407364787112,
888
+ "eval_f1_macro": 0.6206166338546538,
889
+ "eval_loss": 1.6842619180679321,
890
+ "eval_runtime": 107.7209,
891
+ "eval_samples_per_second": 80.671,
892
+ "eval_steps_per_second": 2.525,
893
+ "step": 20800
894
+ },
895
+ {
896
+ "epoch": 53.56917245735944,
897
+ "grad_norm": 0.584701418876648,
898
+ "learning_rate": 0.00016296296296296298,
899
+ "loss": 0.0203,
900
+ "step": 21200
901
+ },
902
+ {
903
+ "epoch": 53.56917245735944,
904
+ "eval_accuracy": 0.7950517836593786,
905
+ "eval_f1_macro": 0.6206542591204762,
906
+ "eval_loss": 1.699610710144043,
907
+ "eval_runtime": 107.6954,
908
+ "eval_samples_per_second": 80.691,
909
+ "eval_steps_per_second": 2.526,
910
+ "step": 21200
911
+ },
912
+ {
913
+ "epoch": 54.57991156032849,
914
+ "grad_norm": 0.0553191676735878,
915
+ "learning_rate": 0.00015555555555555556,
916
+ "loss": 0.0174,
917
+ "step": 21600
918
+ },
919
+ {
920
+ "epoch": 54.57991156032849,
921
+ "eval_accuracy": 0.7894131185270425,
922
+ "eval_f1_macro": 0.6214961351780512,
923
+ "eval_loss": 1.8445045948028564,
924
+ "eval_runtime": 107.7853,
925
+ "eval_samples_per_second": 80.623,
926
+ "eval_steps_per_second": 2.524,
927
+ "step": 21600
928
+ },
929
+ {
930
+ "epoch": 55.59065066329754,
931
+ "grad_norm": 0.4328874945640564,
932
+ "learning_rate": 0.00014814814814814815,
933
+ "loss": 0.0197,
934
+ "step": 22000
935
+ },
936
+ {
937
+ "epoch": 55.59065066329754,
938
+ "eval_accuracy": 0.792059838895282,
939
+ "eval_f1_macro": 0.6308138834712996,
940
+ "eval_loss": 1.8310879468917847,
941
+ "eval_runtime": 107.7421,
942
+ "eval_samples_per_second": 80.656,
943
+ "eval_steps_per_second": 2.525,
944
+ "step": 22000
945
+ },
946
+ {
947
+ "epoch": 56.601389766266585,
948
+ "grad_norm": 0.02704198658466339,
949
+ "learning_rate": 0.00014074074074074076,
950
+ "loss": 0.0169,
951
+ "step": 22400
952
+ },
953
+ {
954
+ "epoch": 56.601389766266585,
955
+ "eval_accuracy": 0.7879171461449942,
956
+ "eval_f1_macro": 0.5896127682611725,
957
+ "eval_loss": 1.8162003755569458,
958
+ "eval_runtime": 107.8141,
959
+ "eval_samples_per_second": 80.602,
960
+ "eval_steps_per_second": 2.523,
961
+ "step": 22400
962
+ },
963
+ {
964
+ "epoch": 57.612128869235626,
965
+ "grad_norm": 0.2748865485191345,
966
+ "learning_rate": 0.00013333333333333334,
967
+ "loss": 0.0121,
968
+ "step": 22800
969
+ },
970
+ {
971
+ "epoch": 57.612128869235626,
972
+ "eval_accuracy": 0.7852704257767549,
973
+ "eval_f1_macro": 0.5951106108532582,
974
+ "eval_loss": 1.924727201461792,
975
+ "eval_runtime": 107.712,
976
+ "eval_samples_per_second": 80.678,
977
+ "eval_steps_per_second": 2.525,
978
+ "step": 22800
979
+ },
980
+ {
981
+ "epoch": 58.62286797220467,
982
+ "grad_norm": 0.0328911654651165,
983
+ "learning_rate": 0.00012592592592592592,
984
+ "loss": 0.0152,
985
+ "step": 23200
986
+ },
987
+ {
988
+ "epoch": 58.62286797220467,
989
+ "eval_accuracy": 0.7881472957422324,
990
+ "eval_f1_macro": 0.6063430405057288,
991
+ "eval_loss": 1.8502182960510254,
992
+ "eval_runtime": 107.788,
993
+ "eval_samples_per_second": 80.621,
994
+ "eval_steps_per_second": 2.523,
995
+ "step": 23200
996
+ },
997
+ {
998
+ "epoch": 59.63360707517372,
999
+ "grad_norm": 0.00955616869032383,
1000
+ "learning_rate": 0.00011851851851851852,
1001
+ "loss": 0.0142,
1002
+ "step": 23600
1003
+ },
1004
+ {
1005
+ "epoch": 59.63360707517372,
1006
+ "eval_accuracy": 0.789873417721519,
1007
+ "eval_f1_macro": 0.617993825444742,
1008
+ "eval_loss": 1.7803289890289307,
1009
+ "eval_runtime": 107.8043,
1010
+ "eval_samples_per_second": 80.609,
1011
+ "eval_steps_per_second": 2.523,
1012
+ "step": 23600
1013
+ },
1014
+ {
1015
+ "epoch": 60.64434617814277,
1016
+ "grad_norm": 0.06125176325440407,
1017
+ "learning_rate": 0.0001111111111111111,
1018
+ "loss": 0.0105,
1019
+ "step": 24000
1020
+ },
1021
+ {
1022
+ "epoch": 60.64434617814277,
1023
+ "eval_accuracy": 0.7861910241657077,
1024
+ "eval_f1_macro": 0.6254018987758924,
1025
+ "eval_loss": 1.916595458984375,
1026
+ "eval_runtime": 107.7673,
1027
+ "eval_samples_per_second": 80.637,
1028
+ "eval_steps_per_second": 2.524,
1029
+ "step": 24000
1030
+ },
1031
+ {
1032
+ "epoch": 61.655085281111816,
1033
+ "grad_norm": 0.10605888813734055,
1034
+ "learning_rate": 0.0001037037037037037,
1035
+ "loss": 0.0116,
1036
+ "step": 24400
1037
+ },
1038
+ {
1039
+ "epoch": 61.655085281111816,
1040
+ "eval_accuracy": 0.7858457997698504,
1041
+ "eval_f1_macro": 0.5961002471321352,
1042
+ "eval_loss": 1.9204109907150269,
1043
+ "eval_runtime": 107.7648,
1044
+ "eval_samples_per_second": 80.639,
1045
+ "eval_steps_per_second": 2.524,
1046
+ "step": 24400
1047
+ },
1048
+ {
1049
+ "epoch": 62.66582438408086,
1050
+ "grad_norm": 0.044181693345308304,
1051
+ "learning_rate": 9.62962962962963e-05,
1052
+ "loss": 0.0112,
1053
+ "step": 24800
1054
+ },
1055
+ {
1056
+ "epoch": 62.66582438408086,
1057
+ "eval_accuracy": 0.7878020713463751,
1058
+ "eval_f1_macro": 0.6235710102313945,
1059
+ "eval_loss": 1.9822152853012085,
1060
+ "eval_runtime": 107.735,
1061
+ "eval_samples_per_second": 80.661,
1062
+ "eval_steps_per_second": 2.525,
1063
+ "step": 24800
1064
+ },
1065
+ {
1066
+ "epoch": 63.676563487049904,
1067
+ "grad_norm": 0.023459970951080322,
1068
+ "learning_rate": 8.888888888888889e-05,
1069
+ "loss": 0.0102,
1070
+ "step": 25200
1071
+ },
1072
+ {
1073
+ "epoch": 63.676563487049904,
1074
+ "eval_accuracy": 0.7840046029919447,
1075
+ "eval_f1_macro": 0.6155669395709024,
1076
+ "eval_loss": 1.9653674364089966,
1077
+ "eval_runtime": 107.7821,
1078
+ "eval_samples_per_second": 80.626,
1079
+ "eval_steps_per_second": 2.524,
1080
+ "step": 25200
1081
+ },
1082
+ {
1083
+ "epoch": 64.68730259001894,
1084
+ "grad_norm": 1.9076263904571533,
1085
+ "learning_rate": 8.148148148148149e-05,
1086
+ "loss": 0.01,
1087
+ "step": 25600
1088
+ },
1089
+ {
1090
+ "epoch": 64.68730259001894,
1091
+ "eval_accuracy": 0.7880322209436134,
1092
+ "eval_f1_macro": 0.6226637633596005,
1093
+ "eval_loss": 1.938231348991394,
1094
+ "eval_runtime": 107.7205,
1095
+ "eval_samples_per_second": 80.672,
1096
+ "eval_steps_per_second": 2.525,
1097
+ "step": 25600
1098
+ },
1099
+ {
1100
+ "epoch": 65.698041692988,
1101
+ "grad_norm": 0.4948989748954773,
1102
+ "learning_rate": 7.407407407407407e-05,
1103
+ "loss": 0.0101,
1104
+ "step": 26000
1105
+ },
1106
+ {
1107
+ "epoch": 65.698041692988,
1108
+ "eval_accuracy": 0.7960874568469505,
1109
+ "eval_f1_macro": 0.6277935659004009,
1110
+ "eval_loss": 1.8299671411514282,
1111
+ "eval_runtime": 107.7348,
1112
+ "eval_samples_per_second": 80.661,
1113
+ "eval_steps_per_second": 2.525,
1114
+ "step": 26000
1115
+ },
1116
+ {
1117
+ "epoch": 66.70878079595704,
1118
+ "grad_norm": 0.00608784519135952,
1119
+ "learning_rate": 6.666666666666667e-05,
1120
+ "loss": 0.0086,
1121
+ "step": 26400
1122
+ },
1123
+ {
1124
+ "epoch": 66.70878079595704,
1125
+ "eval_accuracy": 0.7968929804372842,
1126
+ "eval_f1_macro": 0.6234372893298947,
1127
+ "eval_loss": 1.9254202842712402,
1128
+ "eval_runtime": 108.035,
1129
+ "eval_samples_per_second": 80.437,
1130
+ "eval_steps_per_second": 2.518,
1131
+ "step": 26400
1132
+ },
1133
+ {
1134
+ "epoch": 67.7195198989261,
1135
+ "grad_norm": 0.08328448981046677,
1136
+ "learning_rate": 5.925925925925926e-05,
1137
+ "loss": 0.0073,
1138
+ "step": 26800
1139
+ },
1140
+ {
1141
+ "epoch": 67.7195198989261,
1142
+ "eval_accuracy": 0.7915995397008055,
1143
+ "eval_f1_macro": 0.6320923241131308,
1144
+ "eval_loss": 1.8887046575546265,
1145
+ "eval_runtime": 107.8399,
1146
+ "eval_samples_per_second": 80.582,
1147
+ "eval_steps_per_second": 2.522,
1148
+ "step": 26800
1149
+ },
1150
+ {
1151
+ "epoch": 68.73025900189513,
1152
+ "grad_norm": 0.02061997540295124,
1153
+ "learning_rate": 5.185185185185185e-05,
1154
+ "loss": 0.0069,
1155
+ "step": 27200
1156
+ },
1157
+ {
1158
+ "epoch": 68.73025900189513,
1159
+ "eval_accuracy": 0.794361334867664,
1160
+ "eval_f1_macro": 0.636665979654867,
1161
+ "eval_loss": 1.9074466228485107,
1162
+ "eval_runtime": 107.6829,
1163
+ "eval_samples_per_second": 80.7,
1164
+ "eval_steps_per_second": 2.526,
1165
+ "step": 27200
1166
+ },
1167
+ {
1168
+ "epoch": 69.74099810486418,
1169
+ "grad_norm": 0.012987918220460415,
1170
+ "learning_rate": 4.4444444444444447e-05,
1171
+ "loss": 0.0059,
1172
+ "step": 27600
1173
+ },
1174
+ {
1175
+ "epoch": 69.74099810486418,
1176
+ "eval_accuracy": 0.792059838895282,
1177
+ "eval_f1_macro": 0.6315720450251525,
1178
+ "eval_loss": 1.9398057460784912,
1179
+ "eval_runtime": 107.8991,
1180
+ "eval_samples_per_second": 80.538,
1181
+ "eval_steps_per_second": 2.521,
1182
+ "step": 27600
1183
+ },
1184
+ {
1185
+ "epoch": 70.75173720783323,
1186
+ "grad_norm": 0.005101632326841354,
1187
+ "learning_rate": 3.7037037037037037e-05,
1188
+ "loss": 0.0066,
1189
+ "step": 28000
1190
+ },
1191
+ {
1192
+ "epoch": 70.75173720783323,
1193
+ "eval_accuracy": 0.794361334867664,
1194
+ "eval_f1_macro": 0.6349818220797456,
1195
+ "eval_loss": 1.8699119091033936,
1196
+ "eval_runtime": 109.2809,
1197
+ "eval_samples_per_second": 79.52,
1198
+ "eval_steps_per_second": 2.489,
1199
+ "step": 28000
1200
+ },
1201
+ {
1202
+ "epoch": 71.76247631080227,
1203
+ "grad_norm": 0.6047748923301697,
1204
+ "learning_rate": 2.962962962962963e-05,
1205
+ "loss": 0.0062,
1206
+ "step": 28400
1207
+ },
1208
+ {
1209
+ "epoch": 71.76247631080227,
1210
+ "eval_accuracy": 0.7951668584579977,
1211
+ "eval_f1_macro": 0.6343250573277666,
1212
+ "eval_loss": 1.8893409967422485,
1213
+ "eval_runtime": 109.2978,
1214
+ "eval_samples_per_second": 79.508,
1215
+ "eval_steps_per_second": 2.489,
1216
+ "step": 28400
1217
+ },
1218
+ {
1219
+ "epoch": 72.77321541377133,
1220
+ "grad_norm": 0.012553258799016476,
1221
+ "learning_rate": 2.2222222222222223e-05,
1222
+ "loss": 0.0058,
1223
+ "step": 28800
1224
+ },
1225
+ {
1226
+ "epoch": 72.77321541377133,
1227
+ "eval_accuracy": 0.7982738780207135,
1228
+ "eval_f1_macro": 0.6409643965446785,
1229
+ "eval_loss": 1.883091926574707,
1230
+ "eval_runtime": 109.2468,
1231
+ "eval_samples_per_second": 79.545,
1232
+ "eval_steps_per_second": 2.49,
1233
+ "step": 28800
1234
+ },
1235
+ {
1236
+ "epoch": 73.78395451674037,
1237
+ "grad_norm": 0.0007793375989422202,
1238
+ "learning_rate": 1.4814814814814815e-05,
1239
+ "loss": 0.0056,
1240
+ "step": 29200
1241
+ },
1242
+ {
1243
+ "epoch": 73.78395451674037,
1244
+ "eval_accuracy": 0.7958573072497123,
1245
+ "eval_f1_macro": 0.6356613761441215,
1246
+ "eval_loss": 1.8901586532592773,
1247
+ "eval_runtime": 108.6154,
1248
+ "eval_samples_per_second": 80.007,
1249
+ "eval_steps_per_second": 2.504,
1250
+ "step": 29200
1251
+ },
1252
+ {
1253
+ "epoch": 74.7946936197094,
1254
+ "grad_norm": 0.14352725446224213,
1255
+ "learning_rate": 7.4074074074074075e-06,
1256
+ "loss": 0.0053,
1257
+ "step": 29600
1258
+ },
1259
+ {
1260
+ "epoch": 74.7946936197094,
1261
+ "eval_accuracy": 0.7991944764096662,
1262
+ "eval_f1_macro": 0.643747242061282,
1263
+ "eval_loss": 1.888542890548706,
1264
+ "eval_runtime": 108.5316,
1265
+ "eval_samples_per_second": 80.069,
1266
+ "eval_steps_per_second": 2.506,
1267
+ "step": 29600
1268
+ },
1269
+ {
1270
+ "epoch": 75.80543272267846,
1271
+ "grad_norm": 0.9781034588813782,
1272
+ "learning_rate": 0.0,
1273
+ "loss": 0.0046,
1274
+ "step": 30000
1275
+ },
1276
+ {
1277
+ "epoch": 75.80543272267846,
1278
+ "eval_accuracy": 0.8005753739930955,
1279
+ "eval_f1_macro": 0.6435443913467072,
1280
+ "eval_loss": 1.888439655303955,
1281
+ "eval_runtime": 108.5256,
1282
+ "eval_samples_per_second": 80.073,
1283
+ "eval_steps_per_second": 2.506,
1284
+ "step": 30000
1285
+ }
1286
+ ],
1287
+ "logging_steps": 400,
1288
+ "max_steps": 30000,
1289
+ "num_input_tokens_seen": 0,
1290
+ "num_train_epochs": 76,
1291
+ "save_steps": 1200,
1292
+ "stateful_callbacks": {
1293
+ "TrainerControl": {
1294
+ "args": {
1295
+ "should_epoch_stop": false,
1296
+ "should_evaluate": false,
1297
+ "should_log": false,
1298
+ "should_save": true,
1299
+ "should_training_stop": true
1300
+ },
1301
+ "attributes": {}
1302
+ }
1303
+ },
1304
+ "total_flos": 5.8164789316384843e+20,
1305
+ "train_batch_size": 32,
1306
+ "trial_name": null,
1307
+ "trial_params": null
1308
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93195b6ce37c327e37144d872765300fb413254de9caea3613dca78eed8e3139
3
+ size 5304