winnieyangwannan commited on
Commit
464b62a
·
verified ·
1 Parent(s): 5026196

Training in progress, step 500, checkpoint

Browse files
checkpoint-500/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "q_proj",
28
- "gate_proj",
29
  "up_proj",
30
  "o_proj",
 
 
 
31
  "k_proj",
32
- "down_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
 
26
  "up_proj",
27
  "o_proj",
28
+ "v_proj",
29
+ "q_proj",
30
+ "down_proj",
31
  "k_proj",
32
+ "gate_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-500/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:882c6a449a15a7c89b405a9b2bb953973aabb46d5bb332fd97e2636db7009c10
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77ad03a26563fb97d39b54488373b39fc25f88d455d31ef54c5aab28afa972cc
3
  size 167832240
checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec4b78ace1156f82710f9323015277767c2c7913e75b033470080d89dafce4ea
3
  size 335922386
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5341d64ff65b343938bba55a46153c0ffa1516010b29f36f738cdc6ed7a1a7f8
3
  size 335922386
checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f45fad1928be6a3cba447ba135798b1f889eac81fc87eb617b20212d56ec355
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec4f25bf25febb815eb9a87cc696ea933d62d4744509c6c7ee46b8ee52dfff33
3
  size 14244
checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d3dd7bf6e53c14f5c31b3d93195825c804547bfedb7e1ffecff6ed079275331
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb33de8afdd2f737b169d16a4787977e578963170b5dd53b43167e251c838ee3
3
  size 1064
checkpoint-500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5252100840336135,
5
  "eval_steps": 50,
6
  "global_step": 500,
7
  "is_hyper_param_search": false,
@@ -9,438 +9,438 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01050420168067227,
13
- "grad_norm": 2.5582146644592285,
14
- "learning_rate": 4.98249299719888e-05,
15
- "loss": 1.6787,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.02100840336134454,
20
- "grad_norm": 0.9345070719718933,
21
- "learning_rate": 4.96498599439776e-05,
22
- "loss": 0.518,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.031512605042016806,
27
- "grad_norm": 1.6358414888381958,
28
- "learning_rate": 4.947478991596639e-05,
29
- "loss": 0.4604,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.04201680672268908,
34
- "grad_norm": 0.7778844237327576,
35
- "learning_rate": 4.9299719887955186e-05,
36
- "loss": 0.3771,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.052521008403361345,
41
- "grad_norm": 0.7006077766418457,
42
- "learning_rate": 4.912464985994398e-05,
43
- "loss": 0.3842,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.052521008403361345,
48
- "eval_loss": 0.42603224515914917,
49
- "eval_runtime": 13.5673,
50
- "eval_samples_per_second": 35.379,
51
- "eval_steps_per_second": 2.211,
52
  "step": 50
53
  },
54
  {
55
- "epoch": 0.06302521008403361,
56
- "grad_norm": 0.6415153741836548,
57
- "learning_rate": 4.8949579831932775e-05,
58
- "loss": 0.3399,
59
  "step": 60
60
  },
61
  {
62
- "epoch": 0.07352941176470588,
63
- "grad_norm": 0.6030780076980591,
64
- "learning_rate": 4.877450980392157e-05,
65
- "loss": 0.3447,
66
  "step": 70
67
  },
68
  {
69
- "epoch": 0.08403361344537816,
70
- "grad_norm": 0.688852071762085,
71
- "learning_rate": 4.859943977591036e-05,
72
- "loss": 0.3219,
73
  "step": 80
74
  },
75
  {
76
- "epoch": 0.09453781512605042,
77
- "grad_norm": 0.6371557712554932,
78
- "learning_rate": 4.8424369747899164e-05,
79
- "loss": 0.3379,
80
  "step": 90
81
  },
82
  {
83
- "epoch": 0.10504201680672269,
84
- "grad_norm": 0.7739270329475403,
85
- "learning_rate": 4.824929971988796e-05,
86
- "loss": 0.3177,
87
  "step": 100
88
  },
89
  {
90
- "epoch": 0.10504201680672269,
91
- "eval_loss": 0.3801896274089813,
92
- "eval_runtime": 13.6107,
93
- "eval_samples_per_second": 35.266,
94
- "eval_steps_per_second": 2.204,
95
  "step": 100
96
  },
97
  {
98
- "epoch": 0.11554621848739496,
99
- "grad_norm": 0.649507462978363,
100
- "learning_rate": 4.807422969187675e-05,
101
- "loss": 0.3415,
102
  "step": 110
103
  },
104
  {
105
- "epoch": 0.12605042016806722,
106
- "grad_norm": 0.594717264175415,
107
- "learning_rate": 4.7899159663865554e-05,
108
- "loss": 0.3325,
109
  "step": 120
110
  },
111
  {
112
- "epoch": 0.13655462184873948,
113
- "grad_norm": 0.627918541431427,
114
- "learning_rate": 4.772408963585435e-05,
115
- "loss": 0.3222,
116
  "step": 130
117
  },
118
  {
119
- "epoch": 0.14705882352941177,
120
- "grad_norm": 0.5384674668312073,
121
- "learning_rate": 4.7549019607843135e-05,
122
- "loss": 0.3426,
123
  "step": 140
124
  },
125
  {
126
- "epoch": 0.15756302521008403,
127
- "grad_norm": 0.5673420429229736,
128
- "learning_rate": 4.7373949579831936e-05,
129
- "loss": 0.3061,
130
  "step": 150
131
  },
132
  {
133
- "epoch": 0.15756302521008403,
134
- "eval_loss": 0.3653399348258972,
135
- "eval_runtime": 13.5947,
136
- "eval_samples_per_second": 35.308,
137
- "eval_steps_per_second": 2.207,
138
  "step": 150
139
  },
140
  {
141
- "epoch": 0.16806722689075632,
142
- "grad_norm": 0.6111018657684326,
143
- "learning_rate": 4.719887955182073e-05,
144
- "loss": 0.3271,
145
  "step": 160
146
  },
147
  {
148
- "epoch": 0.17857142857142858,
149
- "grad_norm": 0.7422594428062439,
150
- "learning_rate": 4.7023809523809525e-05,
151
- "loss": 0.315,
152
  "step": 170
153
  },
154
  {
155
- "epoch": 0.18907563025210083,
156
- "grad_norm": 0.7226534485816956,
157
- "learning_rate": 4.684873949579832e-05,
158
- "loss": 0.3031,
159
  "step": 180
160
  },
161
  {
162
- "epoch": 0.19957983193277312,
163
- "grad_norm": 0.6302976012229919,
164
- "learning_rate": 4.667366946778712e-05,
165
- "loss": 0.3161,
166
  "step": 190
167
  },
168
  {
169
- "epoch": 0.21008403361344538,
170
- "grad_norm": 0.6225076913833618,
171
- "learning_rate": 4.6498599439775914e-05,
172
- "loss": 0.3038,
173
  "step": 200
174
  },
175
  {
176
- "epoch": 0.21008403361344538,
177
- "eval_loss": 0.35061606764793396,
178
- "eval_runtime": 13.5616,
179
- "eval_samples_per_second": 35.394,
180
- "eval_steps_per_second": 2.212,
181
  "step": 200
182
  },
183
  {
184
- "epoch": 0.22058823529411764,
185
- "grad_norm": 0.6001319885253906,
186
- "learning_rate": 4.632352941176471e-05,
187
- "loss": 0.3129,
188
  "step": 210
189
  },
190
  {
191
- "epoch": 0.23109243697478993,
192
- "grad_norm": 0.5385990142822266,
193
- "learning_rate": 4.61484593837535e-05,
194
- "loss": 0.2991,
195
  "step": 220
196
  },
197
  {
198
- "epoch": 0.2415966386554622,
199
- "grad_norm": 0.4513624906539917,
200
- "learning_rate": 4.59733893557423e-05,
201
- "loss": 0.2896,
202
  "step": 230
203
  },
204
  {
205
- "epoch": 0.25210084033613445,
206
- "grad_norm": 0.6142160892486572,
207
- "learning_rate": 4.579831932773109e-05,
208
- "loss": 0.3059,
209
  "step": 240
210
  },
211
  {
212
- "epoch": 0.26260504201680673,
213
- "grad_norm": 0.6714802384376526,
214
- "learning_rate": 4.562324929971989e-05,
215
- "loss": 0.2897,
216
  "step": 250
217
  },
218
  {
219
- "epoch": 0.26260504201680673,
220
- "eval_loss": 0.3456435203552246,
221
- "eval_runtime": 13.5552,
222
- "eval_samples_per_second": 35.411,
223
- "eval_steps_per_second": 2.213,
224
  "step": 250
225
  },
226
  {
227
- "epoch": 0.27310924369747897,
228
- "grad_norm": 0.6518235206604004,
229
- "learning_rate": 4.5448179271708687e-05,
230
- "loss": 0.312,
231
  "step": 260
232
  },
233
  {
234
- "epoch": 0.28361344537815125,
235
- "grad_norm": 0.6250632405281067,
236
- "learning_rate": 4.527310924369748e-05,
237
- "loss": 0.2959,
238
  "step": 270
239
  },
240
  {
241
- "epoch": 0.29411764705882354,
242
- "grad_norm": 0.5683826804161072,
243
- "learning_rate": 4.5098039215686275e-05,
244
- "loss": 0.3027,
245
  "step": 280
246
  },
247
  {
248
- "epoch": 0.30462184873949577,
249
- "grad_norm": 0.560312807559967,
250
- "learning_rate": 4.4922969187675076e-05,
251
- "loss": 0.3002,
252
  "step": 290
253
  },
254
  {
255
- "epoch": 0.31512605042016806,
256
- "grad_norm": 0.66291743516922,
257
- "learning_rate": 4.474789915966387e-05,
258
- "loss": 0.2925,
259
  "step": 300
260
  },
261
  {
262
- "epoch": 0.31512605042016806,
263
- "eval_loss": 0.3431606888771057,
264
- "eval_runtime": 13.5629,
265
- "eval_samples_per_second": 35.391,
266
- "eval_steps_per_second": 2.212,
267
  "step": 300
268
  },
269
  {
270
- "epoch": 0.32563025210084034,
271
- "grad_norm": 0.6478439569473267,
272
- "learning_rate": 4.4572829131652665e-05,
273
- "loss": 0.2893,
274
  "step": 310
275
  },
276
  {
277
- "epoch": 0.33613445378151263,
278
- "grad_norm": 0.5832348465919495,
279
- "learning_rate": 4.439775910364146e-05,
280
- "loss": 0.2842,
281
  "step": 320
282
  },
283
  {
284
- "epoch": 0.34663865546218486,
285
- "grad_norm": 0.525932252407074,
286
- "learning_rate": 4.422268907563025e-05,
287
- "loss": 0.2837,
288
  "step": 330
289
  },
290
  {
291
- "epoch": 0.35714285714285715,
292
- "grad_norm": 0.5487508177757263,
293
- "learning_rate": 4.404761904761905e-05,
294
- "loss": 0.2706,
295
  "step": 340
296
  },
297
  {
298
- "epoch": 0.36764705882352944,
299
- "grad_norm": 0.5392388701438904,
300
- "learning_rate": 4.387254901960784e-05,
301
- "loss": 0.2835,
302
  "step": 350
303
  },
304
  {
305
- "epoch": 0.36764705882352944,
306
- "eval_loss": 0.33528536558151245,
307
- "eval_runtime": 13.5508,
308
- "eval_samples_per_second": 35.422,
309
- "eval_steps_per_second": 2.214,
310
  "step": 350
311
  },
312
  {
313
- "epoch": 0.37815126050420167,
314
- "grad_norm": 0.6706260442733765,
315
- "learning_rate": 4.369747899159664e-05,
316
- "loss": 0.2844,
317
  "step": 360
318
  },
319
  {
320
- "epoch": 0.38865546218487396,
321
- "grad_norm": 0.6042625904083252,
322
- "learning_rate": 4.352240896358544e-05,
323
- "loss": 0.2758,
324
  "step": 370
325
  },
326
  {
327
- "epoch": 0.39915966386554624,
328
- "grad_norm": 0.534008264541626,
329
- "learning_rate": 4.334733893557423e-05,
330
- "loss": 0.2918,
331
  "step": 380
332
  },
333
  {
334
- "epoch": 0.4096638655462185,
335
- "grad_norm": 0.48162588477134705,
336
- "learning_rate": 4.317226890756303e-05,
337
- "loss": 0.273,
338
  "step": 390
339
  },
340
  {
341
- "epoch": 0.42016806722689076,
342
- "grad_norm": 0.5669644474983215,
343
- "learning_rate": 4.2997198879551826e-05,
344
- "loss": 0.285,
345
  "step": 400
346
  },
347
  {
348
- "epoch": 0.42016806722689076,
349
- "eval_loss": 0.3348632752895355,
350
- "eval_runtime": 13.5507,
351
- "eval_samples_per_second": 35.423,
352
- "eval_steps_per_second": 2.214,
353
  "step": 400
354
  },
355
  {
356
- "epoch": 0.43067226890756305,
357
- "grad_norm": 0.6257824897766113,
358
- "learning_rate": 4.2822128851540614e-05,
359
- "loss": 0.299,
360
  "step": 410
361
  },
362
  {
363
- "epoch": 0.4411764705882353,
364
- "grad_norm": 0.5430576205253601,
365
- "learning_rate": 4.2647058823529415e-05,
366
- "loss": 0.2868,
367
  "step": 420
368
  },
369
  {
370
- "epoch": 0.45168067226890757,
371
- "grad_norm": 0.5633955597877502,
372
- "learning_rate": 4.247198879551821e-05,
373
- "loss": 0.2589,
374
  "step": 430
375
  },
376
  {
377
- "epoch": 0.46218487394957986,
378
- "grad_norm": 0.5294789671897888,
379
- "learning_rate": 4.2296918767507e-05,
380
- "loss": 0.2777,
381
  "step": 440
382
  },
383
  {
384
- "epoch": 0.4726890756302521,
385
- "grad_norm": 0.5480856895446777,
386
- "learning_rate": 4.21218487394958e-05,
387
- "loss": 0.2704,
388
  "step": 450
389
  },
390
  {
391
- "epoch": 0.4726890756302521,
392
- "eval_loss": 0.329515278339386,
393
- "eval_runtime": 13.5423,
394
- "eval_samples_per_second": 35.445,
395
- "eval_steps_per_second": 2.215,
396
  "step": 450
397
  },
398
  {
399
- "epoch": 0.4831932773109244,
400
- "grad_norm": 0.5051332116127014,
401
- "learning_rate": 4.19467787114846e-05,
402
- "loss": 0.2438,
403
  "step": 460
404
  },
405
  {
406
- "epoch": 0.49369747899159666,
407
- "grad_norm": 0.6251511573791504,
408
- "learning_rate": 4.177170868347339e-05,
409
- "loss": 0.2748,
410
  "step": 470
411
  },
412
  {
413
- "epoch": 0.5042016806722689,
414
- "grad_norm": 0.4729413092136383,
415
- "learning_rate": 4.159663865546219e-05,
416
- "loss": 0.2689,
417
  "step": 480
418
  },
419
  {
420
- "epoch": 0.5147058823529411,
421
- "grad_norm": 0.5220003724098206,
422
- "learning_rate": 4.142156862745099e-05,
423
- "loss": 0.2899,
424
  "step": 490
425
  },
426
  {
427
- "epoch": 0.5252100840336135,
428
- "grad_norm": 0.54283207654953,
429
- "learning_rate": 4.1246498599439776e-05,
430
- "loss": 0.272,
431
  "step": 500
432
  },
433
  {
434
- "epoch": 0.5252100840336135,
435
- "eval_loss": 0.32714489102363586,
436
- "eval_runtime": 13.5497,
437
- "eval_samples_per_second": 35.425,
438
- "eval_steps_per_second": 2.214,
439
  "step": 500
440
  }
441
  ],
442
  "logging_steps": 10,
443
- "max_steps": 2856,
444
  "num_input_tokens_seen": 0,
445
  "num_train_epochs": 3,
446
  "save_steps": 100,
@@ -456,7 +456,7 @@
456
  "attributes": {}
457
  }
458
  },
459
- "total_flos": 5.977409027624141e+16,
460
  "train_batch_size": 16,
461
  "trial_name": null,
462
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5268703898840885,
5
  "eval_steps": 50,
6
  "global_step": 500,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01053740779768177,
13
+ "grad_norm": 3.070249080657959,
14
+ "learning_rate": 4.982437653670531e-05,
15
+ "loss": 1.7879,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.02107481559536354,
20
+ "grad_norm": 1.702326774597168,
21
+ "learning_rate": 4.964875307341061e-05,
22
+ "loss": 0.5567,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.03161222339304531,
27
+ "grad_norm": 1.1947294473648071,
28
+ "learning_rate": 4.947312961011591e-05,
29
+ "loss": 0.4493,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.04214963119072708,
34
+ "grad_norm": 0.9556658267974854,
35
+ "learning_rate": 4.929750614682122e-05,
36
+ "loss": 0.3728,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.05268703898840885,
41
+ "grad_norm": 0.7952510714530945,
42
+ "learning_rate": 4.9121882683526524e-05,
43
+ "loss": 0.3535,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.05268703898840885,
48
+ "eval_loss": 0.4311191439628601,
49
+ "eval_runtime": 13.6539,
50
+ "eval_samples_per_second": 35.155,
51
+ "eval_steps_per_second": 2.197,
52
  "step": 50
53
  },
54
  {
55
+ "epoch": 0.06322444678609063,
56
+ "grad_norm": 0.6962826251983643,
57
+ "learning_rate": 4.894625922023183e-05,
58
+ "loss": 0.3507,
59
  "step": 60
60
  },
61
  {
62
+ "epoch": 0.0737618545837724,
63
+ "grad_norm": 0.6941961646080017,
64
+ "learning_rate": 4.877063575693713e-05,
65
+ "loss": 0.3585,
66
  "step": 70
67
  },
68
  {
69
+ "epoch": 0.08429926238145416,
70
+ "grad_norm": 0.6864392757415771,
71
+ "learning_rate": 4.8595012293642434e-05,
72
+ "loss": 0.3496,
73
  "step": 80
74
  },
75
  {
76
+ "epoch": 0.09483667017913593,
77
+ "grad_norm": 0.7322937846183777,
78
+ "learning_rate": 4.841938883034774e-05,
79
+ "loss": 0.3295,
80
  "step": 90
81
  },
82
  {
83
+ "epoch": 0.1053740779768177,
84
+ "grad_norm": 0.6921488046646118,
85
+ "learning_rate": 4.824376536705304e-05,
86
+ "loss": 0.3357,
87
  "step": 100
88
  },
89
  {
90
+ "epoch": 0.1053740779768177,
91
+ "eval_loss": 0.39120009541511536,
92
+ "eval_runtime": 13.7031,
93
+ "eval_samples_per_second": 35.029,
94
+ "eval_steps_per_second": 2.189,
95
  "step": 100
96
  },
97
  {
98
+ "epoch": 0.11591148577449947,
99
+ "grad_norm": 0.6553240418434143,
100
+ "learning_rate": 4.8068141903758344e-05,
101
+ "loss": 0.3105,
102
  "step": 110
103
  },
104
  {
105
+ "epoch": 0.12644889357218125,
106
+ "grad_norm": 0.5637819170951843,
107
+ "learning_rate": 4.789251844046364e-05,
108
+ "loss": 0.3164,
109
  "step": 120
110
  },
111
  {
112
+ "epoch": 0.136986301369863,
113
+ "grad_norm": 0.6341928839683533,
114
+ "learning_rate": 4.7716894977168955e-05,
115
+ "loss": 0.304,
116
  "step": 130
117
  },
118
  {
119
+ "epoch": 0.1475237091675448,
120
+ "grad_norm": 0.5917785167694092,
121
+ "learning_rate": 4.754127151387426e-05,
122
+ "loss": 0.3234,
123
  "step": 140
124
  },
125
  {
126
+ "epoch": 0.15806111696522657,
127
+ "grad_norm": 0.5884453654289246,
128
+ "learning_rate": 4.736564805057956e-05,
129
+ "loss": 0.317,
130
  "step": 150
131
  },
132
  {
133
+ "epoch": 0.15806111696522657,
134
+ "eval_loss": 0.37688738107681274,
135
+ "eval_runtime": 13.6535,
136
+ "eval_samples_per_second": 35.156,
137
+ "eval_steps_per_second": 2.197,
138
  "step": 150
139
  },
140
  {
141
+ "epoch": 0.16859852476290832,
142
+ "grad_norm": 0.5819964408874512,
143
+ "learning_rate": 4.7190024587284866e-05,
144
+ "loss": 0.2992,
145
  "step": 160
146
  },
147
  {
148
+ "epoch": 0.1791359325605901,
149
+ "grad_norm": 0.689468264579773,
150
+ "learning_rate": 4.7014401123990165e-05,
151
+ "loss": 0.3168,
152
  "step": 170
153
  },
154
  {
155
+ "epoch": 0.18967334035827185,
156
+ "grad_norm": 0.6950872540473938,
157
+ "learning_rate": 4.683877766069547e-05,
158
+ "loss": 0.3041,
159
  "step": 180
160
  },
161
  {
162
+ "epoch": 0.20021074815595363,
163
+ "grad_norm": 0.8322122097015381,
164
+ "learning_rate": 4.6663154197400776e-05,
165
+ "loss": 0.3028,
166
  "step": 190
167
  },
168
  {
169
+ "epoch": 0.2107481559536354,
170
+ "grad_norm": 0.5850774645805359,
171
+ "learning_rate": 4.6487530734106075e-05,
172
+ "loss": 0.2992,
173
  "step": 200
174
  },
175
  {
176
+ "epoch": 0.2107481559536354,
177
+ "eval_loss": 0.36230018734931946,
178
+ "eval_runtime": 13.6165,
179
+ "eval_samples_per_second": 35.251,
180
+ "eval_steps_per_second": 2.203,
181
  "step": 200
182
  },
183
  {
184
+ "epoch": 0.22128556375131717,
185
+ "grad_norm": 0.6668715476989746,
186
+ "learning_rate": 4.631190727081138e-05,
187
+ "loss": 0.2924,
188
  "step": 210
189
  },
190
  {
191
+ "epoch": 0.23182297154899895,
192
+ "grad_norm": 0.4749641418457031,
193
+ "learning_rate": 4.6136283807516686e-05,
194
+ "loss": 0.3017,
195
  "step": 220
196
  },
197
  {
198
+ "epoch": 0.24236037934668073,
199
+ "grad_norm": 0.6381515860557556,
200
+ "learning_rate": 4.596066034422199e-05,
201
+ "loss": 0.2887,
202
  "step": 230
203
  },
204
  {
205
+ "epoch": 0.2528977871443625,
206
+ "grad_norm": 0.49952977895736694,
207
+ "learning_rate": 4.57850368809273e-05,
208
+ "loss": 0.2833,
209
  "step": 240
210
  },
211
  {
212
+ "epoch": 0.26343519494204426,
213
+ "grad_norm": 0.699518084526062,
214
+ "learning_rate": 4.5609413417632596e-05,
215
+ "loss": 0.2859,
216
  "step": 250
217
  },
218
  {
219
+ "epoch": 0.26343519494204426,
220
+ "eval_loss": 0.3563433885574341,
221
+ "eval_runtime": 13.6275,
222
+ "eval_samples_per_second": 35.223,
223
+ "eval_steps_per_second": 2.201,
224
  "step": 250
225
  },
226
  {
227
+ "epoch": 0.273972602739726,
228
+ "grad_norm": 0.5912085175514221,
229
+ "learning_rate": 4.54337899543379e-05,
230
+ "loss": 0.302,
231
  "step": 260
232
  },
233
  {
234
+ "epoch": 0.2845100105374078,
235
+ "grad_norm": 0.6353363990783691,
236
+ "learning_rate": 4.525816649104321e-05,
237
+ "loss": 0.3112,
238
  "step": 270
239
  },
240
  {
241
+ "epoch": 0.2950474183350896,
242
+ "grad_norm": 0.5483567118644714,
243
+ "learning_rate": 4.5082543027748506e-05,
244
+ "loss": 0.2808,
245
  "step": 280
246
  },
247
  {
248
+ "epoch": 0.3055848261327713,
249
+ "grad_norm": 0.5003193616867065,
250
+ "learning_rate": 4.490691956445381e-05,
251
+ "loss": 0.3185,
252
  "step": 290
253
  },
254
  {
255
+ "epoch": 0.31612223393045313,
256
+ "grad_norm": 0.4919240176677704,
257
+ "learning_rate": 4.473129610115912e-05,
258
+ "loss": 0.2883,
259
  "step": 300
260
  },
261
  {
262
+ "epoch": 0.31612223393045313,
263
+ "eval_loss": 0.35315924882888794,
264
+ "eval_runtime": 13.6133,
265
+ "eval_samples_per_second": 35.26,
266
+ "eval_steps_per_second": 2.204,
267
  "step": 300
268
  },
269
  {
270
+ "epoch": 0.3266596417281349,
271
+ "grad_norm": 0.5145038366317749,
272
+ "learning_rate": 4.455567263786442e-05,
273
+ "loss": 0.3084,
274
  "step": 310
275
  },
276
  {
277
+ "epoch": 0.33719704952581664,
278
+ "grad_norm": 0.6343855261802673,
279
+ "learning_rate": 4.438004917456973e-05,
280
+ "loss": 0.295,
281
  "step": 320
282
  },
283
  {
284
+ "epoch": 0.34773445732349845,
285
+ "grad_norm": 0.5336400270462036,
286
+ "learning_rate": 4.420442571127503e-05,
287
+ "loss": 0.2882,
288
  "step": 330
289
  },
290
  {
291
+ "epoch": 0.3582718651211802,
292
+ "grad_norm": 0.49547308683395386,
293
+ "learning_rate": 4.4028802247980333e-05,
294
+ "loss": 0.3084,
295
  "step": 340
296
  },
297
  {
298
+ "epoch": 0.36880927291886195,
299
+ "grad_norm": 0.5037292242050171,
300
+ "learning_rate": 4.385317878468563e-05,
301
+ "loss": 0.3084,
302
  "step": 350
303
  },
304
  {
305
+ "epoch": 0.36880927291886195,
306
+ "eval_loss": 0.34608179330825806,
307
+ "eval_runtime": 13.6537,
308
+ "eval_samples_per_second": 35.155,
309
+ "eval_steps_per_second": 2.197,
310
  "step": 350
311
  },
312
  {
313
+ "epoch": 0.3793466807165437,
314
+ "grad_norm": 0.5956543684005737,
315
+ "learning_rate": 4.367755532139094e-05,
316
+ "loss": 0.2763,
317
  "step": 360
318
  },
319
  {
320
+ "epoch": 0.3898840885142255,
321
+ "grad_norm": 0.6263634562492371,
322
+ "learning_rate": 4.3501931858096244e-05,
323
+ "loss": 0.3025,
324
  "step": 370
325
  },
326
  {
327
+ "epoch": 0.40042149631190727,
328
+ "grad_norm": 0.4832920730113983,
329
+ "learning_rate": 4.332630839480154e-05,
330
+ "loss": 0.2792,
331
  "step": 380
332
  },
333
  {
334
+ "epoch": 0.410958904109589,
335
+ "grad_norm": 0.4969714879989624,
336
+ "learning_rate": 4.3150684931506855e-05,
337
+ "loss": 0.2704,
338
  "step": 390
339
  },
340
  {
341
+ "epoch": 0.4214963119072708,
342
+ "grad_norm": 0.568900465965271,
343
+ "learning_rate": 4.297506146821216e-05,
344
+ "loss": 0.2734,
345
  "step": 400
346
  },
347
  {
348
+ "epoch": 0.4214963119072708,
349
+ "eval_loss": 0.34151414036750793,
350
+ "eval_runtime": 13.664,
351
+ "eval_samples_per_second": 35.129,
352
+ "eval_steps_per_second": 2.196,
353
  "step": 400
354
  },
355
  {
356
+ "epoch": 0.4320337197049526,
357
+ "grad_norm": 0.5023282170295715,
358
+ "learning_rate": 4.279943800491746e-05,
359
+ "loss": 0.2845,
360
  "step": 410
361
  },
362
  {
363
+ "epoch": 0.44257112750263433,
364
+ "grad_norm": 0.531538188457489,
365
+ "learning_rate": 4.2623814541622765e-05,
366
+ "loss": 0.276,
367
  "step": 420
368
  },
369
  {
370
+ "epoch": 0.45310853530031614,
371
+ "grad_norm": 0.6686979532241821,
372
+ "learning_rate": 4.2448191078328064e-05,
373
+ "loss": 0.283,
374
  "step": 430
375
  },
376
  {
377
+ "epoch": 0.4636459430979979,
378
+ "grad_norm": 0.5194190144538879,
379
+ "learning_rate": 4.227256761503337e-05,
380
+ "loss": 0.2636,
381
  "step": 440
382
  },
383
  {
384
+ "epoch": 0.47418335089567965,
385
+ "grad_norm": 0.6105541586875916,
386
+ "learning_rate": 4.2096944151738675e-05,
387
+ "loss": 0.2713,
388
  "step": 450
389
  },
390
  {
391
+ "epoch": 0.47418335089567965,
392
+ "eval_loss": 0.33879777789115906,
393
+ "eval_runtime": 13.6666,
394
+ "eval_samples_per_second": 35.122,
395
+ "eval_steps_per_second": 2.195,
396
  "step": 450
397
  },
398
  {
399
+ "epoch": 0.48472075869336145,
400
+ "grad_norm": 0.4929138123989105,
401
+ "learning_rate": 4.1921320688443974e-05,
402
+ "loss": 0.261,
403
  "step": 460
404
  },
405
  {
406
+ "epoch": 0.4952581664910432,
407
+ "grad_norm": 0.584095299243927,
408
+ "learning_rate": 4.174569722514928e-05,
409
+ "loss": 0.2714,
410
  "step": 470
411
  },
412
  {
413
+ "epoch": 0.505795574288725,
414
+ "grad_norm": 0.5386167764663696,
415
+ "learning_rate": 4.1570073761854585e-05,
416
+ "loss": 0.2855,
417
  "step": 480
418
  },
419
  {
420
+ "epoch": 0.5163329820864068,
421
+ "grad_norm": 0.5819774270057678,
422
+ "learning_rate": 4.139445029855989e-05,
423
+ "loss": 0.2911,
424
  "step": 490
425
  },
426
  {
427
+ "epoch": 0.5268703898840885,
428
+ "grad_norm": 0.5875944495201111,
429
+ "learning_rate": 4.12188268352652e-05,
430
+ "loss": 0.2619,
431
  "step": 500
432
  },
433
  {
434
+ "epoch": 0.5268703898840885,
435
+ "eval_loss": 0.3364439010620117,
436
+ "eval_runtime": 13.6731,
437
+ "eval_samples_per_second": 35.105,
438
+ "eval_steps_per_second": 2.194,
439
  "step": 500
440
  }
441
  ],
442
  "logging_steps": 10,
443
+ "max_steps": 2847,
444
  "num_input_tokens_seen": 0,
445
  "num_train_epochs": 3,
446
  "save_steps": 100,
 
456
  "attributes": {}
457
  }
458
  },
459
+ "total_flos": 5.975090629863014e+16,
460
  "train_batch_size": 16,
461
  "trial_name": null,
462
  "trial_params": null
checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5894a90f0aacad19de132730666f8b4647a0c4aa14309866a5f87d3723ce6a7
3
  size 5880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5fa9d8b62d1ebe6967a504a7decdb5eeee2bb4aac96e7414f3930f9adcff095
3
  size 5880