mrferr3t commited on
Commit
7276f39
·
verified ·
1 Parent(s): 00a5ce8

Training in progress, step 99, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,9 +20,9 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "Wqkv",
24
- "layer",
25
- "out_proj"
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "out_proj",
24
  "Wqkv",
25
+ "layer"
 
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3769974cce0839f437263f1fcb4c6672f0dba9f91b8100d002793da1f223679f
3
  size 5752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe3a9f1e13fc722192c8866e2604075b038e5ff2515d3b8dc7ccb353997e493d
3
  size 5752
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c2a3e1850347c1cfb4067884be7df755a904a7316830846ceb0ce3bba1400d7
3
  size 15814
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3e09bf1bdfbf403552733db55f2cce3f7c6b52579e7d1bab8f62474f3a3a59d
3
  size 15814
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:424f739be4501cfe7d354e89eb7c2636e769fe1c10cff142c347b71521f5cff2
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd7949088d7233b9cf27831b62c9b3897ec0e564f2b4de82b056a51e812411f4
3
  size 14244
last-checkpoint/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.15396578538102643,
5
  "eval_steps": 50,
6
  "global_step": 99,
7
  "is_hyper_param_search": false,
@@ -9,716 +9,51 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0015552099533437014,
13
- "grad_norm": 2.7270019927527755e-05,
14
- "learning_rate": 5e-05,
15
- "loss": 46.0,
16
- "step": 1
17
- },
18
- {
19
- "epoch": 0.0015552099533437014,
20
  "eval_loss": 11.5,
21
- "eval_runtime": 4.1326,
22
- "eval_samples_per_second": 65.575,
23
- "eval_steps_per_second": 32.909,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.003110419906687403,
28
- "grad_norm": 3.0478742701234296e-05,
29
- "learning_rate": 0.0001,
30
- "loss": 46.0,
31
- "step": 2
32
- },
33
- {
34
- "epoch": 0.004665629860031105,
35
- "grad_norm": 3.66069762094412e-05,
36
- "learning_rate": 0.00015,
37
- "loss": 46.0,
38
- "step": 3
39
- },
40
- {
41
- "epoch": 0.006220839813374806,
42
- "grad_norm": 1.8183880456490442e-05,
43
- "learning_rate": 0.0002,
44
- "loss": 46.0,
45
- "step": 4
46
- },
47
- {
48
- "epoch": 0.007776049766718507,
49
- "grad_norm": 3.452602322795428e-05,
50
- "learning_rate": 0.00025,
51
- "loss": 46.0,
52
- "step": 5
53
- },
54
- {
55
- "epoch": 0.00933125972006221,
56
- "grad_norm": 4.146944411331788e-05,
57
- "learning_rate": 0.0003,
58
- "loss": 46.0,
59
- "step": 6
60
- },
61
- {
62
- "epoch": 0.01088646967340591,
63
- "grad_norm": 2.2120035282569006e-05,
64
- "learning_rate": 0.00035,
65
- "loss": 46.0,
66
- "step": 7
67
- },
68
- {
69
- "epoch": 0.012441679626749611,
70
- "grad_norm": 3.082965122302994e-05,
71
- "learning_rate": 0.0004,
72
- "loss": 46.0,
73
- "step": 8
74
- },
75
- {
76
- "epoch": 0.013996889580093312,
77
- "grad_norm": 2.718676842050627e-05,
78
- "learning_rate": 0.00045000000000000004,
79
- "loss": 46.0,
80
- "step": 9
81
- },
82
- {
83
- "epoch": 0.015552099533437015,
84
- "grad_norm": 5.129418786964379e-05,
85
  "learning_rate": 0.0005,
86
- "loss": 46.0,
87
- "step": 10
88
- },
89
- {
90
- "epoch": 0.017107309486780714,
91
- "grad_norm": 3.0237812097766437e-05,
92
- "learning_rate": 0.0004998442655654946,
93
- "loss": 46.0,
94
- "step": 11
95
- },
96
- {
97
- "epoch": 0.01866251944012442,
98
- "grad_norm": 3.804243533522822e-05,
99
- "learning_rate": 0.0004993772562876909,
100
- "loss": 46.0,
101
- "step": 12
102
- },
103
- {
104
- "epoch": 0.02021772939346812,
105
- "grad_norm": 4.027284376206808e-05,
106
- "learning_rate": 0.0004985995540019955,
107
- "loss": 46.0,
108
- "step": 13
109
- },
110
- {
111
- "epoch": 0.02177293934681182,
112
- "grad_norm": 3.257339994888753e-05,
113
- "learning_rate": 0.0004975121276286136,
114
- "loss": 46.0,
115
- "step": 14
116
- },
117
- {
118
- "epoch": 0.02332814930015552,
119
- "grad_norm": 2.8166376068838872e-05,
120
- "learning_rate": 0.0004961163319653958,
121
- "loss": 46.0,
122
- "step": 15
123
- },
124
- {
125
- "epoch": 0.024883359253499222,
126
- "grad_norm": 3.2243387977359816e-05,
127
- "learning_rate": 0.0004944139059999286,
128
- "loss": 46.0,
129
- "step": 16
130
- },
131
- {
132
- "epoch": 0.026438569206842923,
133
- "grad_norm": 2.9551172701758333e-05,
134
- "learning_rate": 0.000492406970742972,
135
- "loss": 46.0,
136
- "step": 17
137
- },
138
- {
139
- "epoch": 0.027993779160186624,
140
- "grad_norm": 3.1649913580622524e-05,
141
- "learning_rate": 0.0004900980265859448,
142
- "loss": 46.0,
143
- "step": 18
144
- },
145
- {
146
- "epoch": 0.029548989113530325,
147
- "grad_norm": 3.053872205782682e-05,
148
- "learning_rate": 0.0004874899501857477,
149
- "loss": 46.0,
150
- "step": 19
151
- },
152
- {
153
- "epoch": 0.03110419906687403,
154
- "grad_norm": 3.0835763027425855e-05,
155
- "learning_rate": 0.00048458599088080736,
156
- "loss": 46.0,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 0.03265940902021773,
161
- "grad_norm": 2.700198456295766e-05,
162
- "learning_rate": 0.0004813897666428053,
163
- "loss": 46.0,
164
- "step": 21
165
- },
166
- {
167
- "epoch": 0.03421461897356143,
168
- "grad_norm": 3.770717012230307e-05,
169
- "learning_rate": 0.00047790525956913543,
170
- "loss": 46.0,
171
- "step": 22
172
- },
173
- {
174
- "epoch": 0.03576982892690513,
175
- "grad_norm": 3.4990694985026494e-05,
176
- "learning_rate": 0.0004741368109217071,
177
- "loss": 46.0,
178
- "step": 23
179
- },
180
- {
181
- "epoch": 0.03732503888024884,
182
- "grad_norm": 2.7294441679259762e-05,
183
- "learning_rate": 0.00047008911571827283,
184
- "loss": 46.0,
185
- "step": 24
186
- },
187
- {
188
- "epoch": 0.038880248833592534,
189
- "grad_norm": 2.366018816246651e-05,
190
- "learning_rate": 0.00046576721688302105,
191
- "loss": 46.0,
192
- "step": 25
193
- },
194
- {
195
- "epoch": 0.04043545878693624,
196
- "grad_norm": 3.290159293101169e-05,
197
- "learning_rate": 0.0004611764989637205,
198
- "loss": 46.0,
199
- "step": 26
200
- },
201
- {
202
- "epoch": 0.041990668740279936,
203
- "grad_norm": 2.414073605905287e-05,
204
- "learning_rate": 0.0004563226814232444,
205
- "loss": 46.0,
206
- "step": 27
207
- },
208
- {
209
- "epoch": 0.04354587869362364,
210
- "grad_norm": 5.248881279840134e-05,
211
- "learning_rate": 0.0004512118115138315,
212
- "loss": 46.0,
213
- "step": 28
214
- },
215
- {
216
- "epoch": 0.04510108864696734,
217
- "grad_norm": 3.5208511690143496e-05,
218
- "learning_rate": 0.0004458502567429631,
219
- "loss": 46.0,
220
- "step": 29
221
- },
222
- {
223
- "epoch": 0.04665629860031104,
224
- "grad_norm": 2.0407787815202028e-05,
225
- "learning_rate": 0.00044024469694024196,
226
- "loss": 46.0,
227
- "step": 30
228
- },
229
- {
230
- "epoch": 0.04821150855365474,
231
- "grad_norm": 3.576717062969692e-05,
232
- "learning_rate": 0.00043440211593515554,
233
- "loss": 46.0,
234
- "step": 31
235
- },
236
- {
237
- "epoch": 0.049766718506998445,
238
- "grad_norm": 3.6498990084510297e-05,
239
- "learning_rate": 0.0004283297928560951,
240
- "loss": 46.0,
241
- "step": 32
242
- },
243
- {
244
- "epoch": 0.05132192846034215,
245
- "grad_norm": 4.3066760554211214e-05,
246
- "learning_rate": 0.0004220352930614672,
247
- "loss": 46.0,
248
- "step": 33
249
- },
250
- {
251
- "epoch": 0.05287713841368585,
252
- "grad_norm": 3.5059842048212886e-05,
253
- "learning_rate": 0.00041552645871420013,
254
- "loss": 46.0,
255
- "step": 34
256
- },
257
- {
258
- "epoch": 0.05443234836702955,
259
- "grad_norm": 4.324290421209298e-05,
260
- "learning_rate": 0.00040881139901138467,
261
- "loss": 46.0,
262
- "step": 35
263
- },
264
- {
265
- "epoch": 0.05598755832037325,
266
- "grad_norm": 3.7152800359763205e-05,
267
- "learning_rate": 0.00040189848008122475,
268
- "loss": 46.0,
269
- "step": 36
270
- },
271
- {
272
- "epoch": 0.05754276827371695,
273
- "grad_norm": 5.383112875279039e-05,
274
- "learning_rate": 0.00039479631455988334,
275
- "loss": 46.0,
276
- "step": 37
277
- },
278
- {
279
- "epoch": 0.05909797822706065,
280
- "grad_norm": 4.21604527218733e-05,
281
- "learning_rate": 0.0003875137508612103,
282
- "loss": 46.0,
283
- "step": 38
284
- },
285
- {
286
- "epoch": 0.060653188180404355,
287
- "grad_norm": 5.1183244067942724e-05,
288
- "learning_rate": 0.00038005986215272055,
289
- "loss": 46.0,
290
- "step": 39
291
- },
292
- {
293
- "epoch": 0.06220839813374806,
294
- "grad_norm": 5.168653660803102e-05,
295
- "learning_rate": 0.0003724439350515571,
296
- "loss": 46.0,
297
  "step": 40
298
  },
299
  {
300
- "epoch": 0.06376360808709176,
301
- "grad_norm": 4.977727439836599e-05,
302
- "learning_rate": 0.0003646754580545226,
303
- "loss": 46.0,
304
- "step": 41
305
- },
306
- {
307
- "epoch": 0.06531881804043546,
308
- "grad_norm": 3.4292854252271354e-05,
309
- "learning_rate": 0.000356764109716594,
310
- "loss": 46.0,
311
- "step": 42
312
- },
313
- {
314
- "epoch": 0.06687402799377916,
315
- "grad_norm": 5.1678060117410496e-05,
316
- "learning_rate": 0.00034871974659264783,
317
- "loss": 46.0,
318
- "step": 43
319
- },
320
- {
321
- "epoch": 0.06842923794712286,
322
- "grad_norm": 4.2354509787401184e-05,
323
- "learning_rate": 0.0003405523909574206,
324
- "loss": 46.0,
325
- "step": 44
326
- },
327
- {
328
- "epoch": 0.06998444790046657,
329
- "grad_norm": 4.2241175833623856e-05,
330
- "learning_rate": 0.0003322722183190025,
331
- "loss": 46.0,
332
- "step": 45
333
- },
334
- {
335
- "epoch": 0.07153965785381027,
336
- "grad_norm": 4.768397775478661e-05,
337
- "learning_rate": 0.0003238895447414211,
338
- "loss": 46.0,
339
- "step": 46
340
- },
341
- {
342
- "epoch": 0.07309486780715396,
343
- "grad_norm": 3.0143904950818978e-05,
344
- "learning_rate": 0.0003154148139921102,
345
- "loss": 46.0,
346
- "step": 47
347
- },
348
- {
349
- "epoch": 0.07465007776049767,
350
- "grad_norm": 4.8561680159764364e-05,
351
- "learning_rate": 0.00030685858453027663,
352
- "loss": 46.0,
353
- "step": 48
354
- },
355
- {
356
- "epoch": 0.07620528771384137,
357
- "grad_norm": 2.9335944418562576e-05,
358
- "learning_rate": 0.0002982315163523742,
359
- "loss": 46.0,
360
- "step": 49
361
- },
362
- {
363
- "epoch": 0.07776049766718507,
364
- "grad_norm": 3.3190059184562415e-05,
365
- "learning_rate": 0.000289544357711076,
366
- "loss": 46.0,
367
- "step": 50
368
- },
369
- {
370
- "epoch": 0.07776049766718507,
371
  "eval_loss": 11.5,
372
- "eval_runtime": 4.0908,
373
- "eval_samples_per_second": 66.246,
374
- "eval_steps_per_second": 33.245,
375
  "step": 50
376
  },
377
  {
378
- "epoch": 0.07931570762052877,
379
- "grad_norm": 3.197933256160468e-05,
380
- "learning_rate": 0.0002808079317242896,
381
- "loss": 46.0,
382
- "step": 51
383
- },
384
- {
385
- "epoch": 0.08087091757387248,
386
- "grad_norm": 4.568342774291523e-05,
387
- "learning_rate": 0.0002720331228909005,
388
- "loss": 46.0,
389
- "step": 52
390
- },
391
- {
392
- "epoch": 0.08242612752721618,
393
- "grad_norm": 3.215171454939991e-05,
394
- "learning_rate": 0.00026323086353004075,
395
- "loss": 46.0,
396
- "step": 53
397
- },
398
- {
399
- "epoch": 0.08398133748055987,
400
- "grad_norm": 2.91961150651332e-05,
401
- "learning_rate": 0.0002544121201607822,
402
- "loss": 46.0,
403
- "step": 54
404
- },
405
- {
406
- "epoch": 0.08553654743390357,
407
- "grad_norm": 7.603203994221985e-05,
408
- "learning_rate": 0.00024558787983921783,
409
- "loss": 46.0,
410
- "step": 55
411
- },
412
- {
413
- "epoch": 0.08709175738724728,
414
- "grad_norm": 5.6477994803572074e-05,
415
- "learning_rate": 0.0002367691364699592,
416
- "loss": 46.0,
417
- "step": 56
418
- },
419
- {
420
- "epoch": 0.08864696734059098,
421
- "grad_norm": 3.0733051971765235e-05,
422
- "learning_rate": 0.00022796687710909964,
423
- "loss": 46.0,
424
- "step": 57
425
- },
426
- {
427
- "epoch": 0.09020217729393468,
428
- "grad_norm": 3.2454528991365805e-05,
429
- "learning_rate": 0.00021919206827571036,
430
- "loss": 46.0,
431
- "step": 58
432
- },
433
- {
434
- "epoch": 0.09175738724727839,
435
- "grad_norm": 7.772906974423677e-05,
436
- "learning_rate": 0.00021045564228892402,
437
- "loss": 46.0,
438
- "step": 59
439
- },
440
- {
441
- "epoch": 0.09331259720062209,
442
- "grad_norm": 4.852826532442123e-05,
443
- "learning_rate": 0.00020176848364762578,
444
- "loss": 46.0,
445
  "step": 60
446
  },
447
  {
448
- "epoch": 0.09486780715396578,
449
- "grad_norm": 3.3706204703776166e-05,
450
- "learning_rate": 0.00019314141546972343,
451
- "loss": 46.0,
452
- "step": 61
453
- },
454
- {
455
- "epoch": 0.09642301710730948,
456
- "grad_norm": 4.146797800785862e-05,
457
- "learning_rate": 0.00018458518600788986,
458
- "loss": 46.0,
459
- "step": 62
460
- },
461
- {
462
- "epoch": 0.09797822706065319,
463
- "grad_norm": 5.218683509156108e-05,
464
- "learning_rate": 0.00017611045525857898,
465
- "loss": 46.0,
466
- "step": 63
467
- },
468
- {
469
- "epoch": 0.09953343701399689,
470
- "grad_norm": 6.322271656244993e-05,
471
- "learning_rate": 0.0001677277816809975,
472
- "loss": 46.0,
473
- "step": 64
474
- },
475
- {
476
- "epoch": 0.10108864696734059,
477
- "grad_norm": 3.459403887973167e-05,
478
- "learning_rate": 0.00015944760904257942,
479
- "loss": 46.0,
480
- "step": 65
481
- },
482
- {
483
- "epoch": 0.1026438569206843,
484
- "grad_norm": 5.7651352108223364e-05,
485
- "learning_rate": 0.0001512802534073522,
486
- "loss": 46.0,
487
- "step": 66
488
- },
489
- {
490
- "epoch": 0.104199066874028,
491
- "grad_norm": 5.704832437913865e-05,
492
- "learning_rate": 0.00014323589028340596,
493
- "loss": 46.0,
494
- "step": 67
495
- },
496
- {
497
- "epoch": 0.1057542768273717,
498
- "grad_norm": 3.9827515138313174e-05,
499
- "learning_rate": 0.00013532454194547733,
500
- "loss": 46.0,
501
- "step": 68
502
- },
503
- {
504
- "epoch": 0.10730948678071539,
505
- "grad_norm": 7.096585613908246e-05,
506
- "learning_rate": 0.00012755606494844294,
507
- "loss": 46.0,
508
- "step": 69
509
- },
510
- {
511
- "epoch": 0.1088646967340591,
512
- "grad_norm": 5.004853301215917e-05,
513
- "learning_rate": 0.00011994013784727947,
514
- "loss": 46.0,
515
- "step": 70
516
- },
517
- {
518
- "epoch": 0.1104199066874028,
519
- "grad_norm": 3.065542477997951e-05,
520
- "learning_rate": 0.00011248624913878966,
521
- "loss": 46.0,
522
- "step": 71
523
- },
524
- {
525
- "epoch": 0.1119751166407465,
526
- "grad_norm": 3.73735892935656e-05,
527
- "learning_rate": 0.0001052036854401166,
528
- "loss": 46.0,
529
- "step": 72
530
- },
531
- {
532
- "epoch": 0.11353032659409021,
533
- "grad_norm": 8.166713814716786e-05,
534
- "learning_rate": 9.810151991877531e-05,
535
- "loss": 46.0,
536
- "step": 73
537
- },
538
- {
539
- "epoch": 0.1150855365474339,
540
- "grad_norm": 5.165397305972874e-05,
541
- "learning_rate": 9.118860098861537e-05,
542
- "loss": 46.0,
543
- "step": 74
544
- },
545
- {
546
- "epoch": 0.1166407465007776,
547
- "grad_norm": 4.3509502575034276e-05,
548
- "learning_rate": 8.44735412857999e-05,
549
- "loss": 46.0,
550
- "step": 75
551
- },
552
- {
553
- "epoch": 0.1181959564541213,
554
- "grad_norm": 3.0016582968528382e-05,
555
- "learning_rate": 7.79647069385328e-05,
556
- "loss": 46.0,
557
- "step": 76
558
- },
559
- {
560
- "epoch": 0.11975116640746501,
561
- "grad_norm": 4.617219383362681e-05,
562
- "learning_rate": 7.167020714390501e-05,
563
- "loss": 46.0,
564
- "step": 77
565
- },
566
- {
567
- "epoch": 0.12130637636080871,
568
- "grad_norm": 6.536076398333535e-05,
569
- "learning_rate": 6.559788406484446e-05,
570
- "loss": 46.0,
571
- "step": 78
572
- },
573
- {
574
- "epoch": 0.12286158631415241,
575
- "grad_norm": 5.4571206419495866e-05,
576
- "learning_rate": 5.975530305975807e-05,
577
- "loss": 46.0,
578
- "step": 79
579
- },
580
- {
581
- "epoch": 0.12441679626749612,
582
- "grad_norm": 4.393588096718304e-05,
583
- "learning_rate": 5.414974325703686e-05,
584
- "loss": 46.0,
585
  "step": 80
586
- },
587
- {
588
- "epoch": 0.12597200622083982,
589
- "grad_norm": 0.00010264776210533455,
590
- "learning_rate": 4.8788188486168616e-05,
591
- "loss": 46.0,
592
- "step": 81
593
- },
594
- {
595
- "epoch": 0.12752721617418353,
596
- "grad_norm": 8.71722077135928e-05,
597
- "learning_rate": 4.367731857675569e-05,
598
- "loss": 46.0,
599
- "step": 82
600
- },
601
- {
602
- "epoch": 0.1290824261275272,
603
- "grad_norm": 8.793560118647292e-05,
604
- "learning_rate": 3.882350103627952e-05,
605
- "loss": 46.0,
606
- "step": 83
607
- },
608
- {
609
- "epoch": 0.13063763608087092,
610
- "grad_norm": 6.0609807405853644e-05,
611
- "learning_rate": 3.423278311697897e-05,
612
- "loss": 46.0,
613
- "step": 84
614
- },
615
- {
616
- "epoch": 0.1321928460342146,
617
- "grad_norm": 4.071524745086208e-05,
618
- "learning_rate": 2.9910884281727225e-05,
619
- "loss": 46.0,
620
- "step": 85
621
- },
622
- {
623
- "epoch": 0.13374805598755832,
624
- "grad_norm": 4.141516546951607e-05,
625
- "learning_rate": 2.586318907829291e-05,
626
- "loss": 46.0,
627
- "step": 86
628
- },
629
- {
630
- "epoch": 0.13530326594090203,
631
- "grad_norm": 5.893910929444246e-05,
632
- "learning_rate": 2.209474043086457e-05,
633
- "loss": 46.0,
634
- "step": 87
635
- },
636
- {
637
- "epoch": 0.1368584758942457,
638
- "grad_norm": 3.4431377571308985e-05,
639
- "learning_rate": 1.861023335719475e-05,
640
- "loss": 46.0,
641
- "step": 88
642
- },
643
- {
644
- "epoch": 0.13841368584758942,
645
- "grad_norm": 8.926719601731747e-05,
646
- "learning_rate": 1.5414009119192633e-05,
647
- "loss": 46.0,
648
- "step": 89
649
- },
650
- {
651
- "epoch": 0.13996889580093314,
652
- "grad_norm": 5.9101457736687735e-05,
653
- "learning_rate": 1.25100498142523e-05,
654
- "loss": 46.0,
655
- "step": 90
656
- },
657
- {
658
- "epoch": 0.14152410575427682,
659
- "grad_norm": 5.563394006458111e-05,
660
- "learning_rate": 9.901973414055187e-06,
661
- "loss": 46.0,
662
- "step": 91
663
- },
664
- {
665
- "epoch": 0.14307931570762053,
666
- "grad_norm": 3.268073487561196e-05,
667
- "learning_rate": 7.593029257027956e-06,
668
- "loss": 46.0,
669
- "step": 92
670
- },
671
- {
672
- "epoch": 0.14463452566096424,
673
- "grad_norm": 4.3672669562511146e-05,
674
- "learning_rate": 5.5860940000714015e-06,
675
- "loss": 46.0,
676
- "step": 93
677
- },
678
- {
679
- "epoch": 0.14618973561430793,
680
- "grad_norm": 6.923845648998395e-05,
681
- "learning_rate": 3.8836680346041594e-06,
682
- "loss": 46.0,
683
- "step": 94
684
- },
685
- {
686
- "epoch": 0.14774494556765164,
687
- "grad_norm": 8.794792665867135e-05,
688
- "learning_rate": 2.487872371386424e-06,
689
- "loss": 46.0,
690
- "step": 95
691
- },
692
- {
693
- "epoch": 0.14930015552099535,
694
- "grad_norm": 5.365633114706725e-05,
695
- "learning_rate": 1.4004459980045125e-06,
696
- "loss": 46.0,
697
- "step": 96
698
- },
699
- {
700
- "epoch": 0.15085536547433903,
701
- "grad_norm": 4.416866795509122e-05,
702
- "learning_rate": 6.22743712309054e-07,
703
- "loss": 46.0,
704
- "step": 97
705
- },
706
- {
707
- "epoch": 0.15241057542768274,
708
- "grad_norm": 9.724321716930717e-05,
709
- "learning_rate": 1.557344345054501e-07,
710
- "loss": 46.0,
711
- "step": 98
712
- },
713
- {
714
- "epoch": 0.15396578538102643,
715
- "grad_norm": 4.769267616211437e-05,
716
- "learning_rate": 0.0,
717
- "loss": 46.0,
718
- "step": 99
719
  }
720
  ],
721
- "logging_steps": 1,
722
  "max_steps": 99,
723
  "num_input_tokens_seen": 0,
724
  "num_train_epochs": 1,
@@ -735,8 +70,8 @@
735
  "attributes": {}
736
  }
737
  },
738
- "total_flos": 1972364967936.0,
739
- "train_batch_size": 2,
740
  "trial_name": null,
741
  "trial_params": null
742
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.30745341614906835,
5
  "eval_steps": 50,
6
  "global_step": 99,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.003105590062111801,
 
 
 
 
 
 
 
13
  "eval_loss": 11.5,
14
+ "eval_runtime": 0.766,
15
+ "eval_samples_per_second": 353.798,
16
+ "eval_steps_per_second": 22.194,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.062111801242236024,
21
+ "grad_norm": 8.48894524096977e-06,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "learning_rate": 0.0005,
23
+ "loss": 11.5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  "step": 20
25
  },
26
  {
27
+ "epoch": 0.12422360248447205,
28
+ "grad_norm": 4.7956536946003325e-06,
29
+ "learning_rate": 0.00042501051864235636,
30
+ "loss": 11.5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  "step": 40
32
  },
33
  {
34
+ "epoch": 0.15527950310559005,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  "eval_loss": 11.5,
36
+ "eval_runtime": 0.6897,
37
+ "eval_samples_per_second": 392.927,
38
+ "eval_steps_per_second": 24.649,
39
  "step": 50
40
  },
41
  {
42
+ "epoch": 0.18633540372670807,
43
+ "grad_norm": 8.120113307086285e-06,
44
+ "learning_rate": 0.00024502945308373244,
45
+ "loss": 11.5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  "step": 60
47
  },
48
  {
49
+ "epoch": 0.2484472049689441,
50
+ "grad_norm": 8.012266334844753e-06,
51
+ "learning_rate": 6.803029740762648e-05,
52
+ "loss": 11.5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  }
55
  ],
56
+ "logging_steps": 20,
57
  "max_steps": 99,
58
  "num_input_tokens_seen": 0,
59
  "num_train_epochs": 1,
 
70
  "attributes": {}
71
  }
72
  },
73
+ "total_flos": 3944729935872.0,
74
+ "train_batch_size": 16,
75
  "trial_name": null,
76
  "trial_params": null
77
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:613de94b40ae6b244db74ea2a6e74db03ab415e2565b42e9e7749c6efa71c8fb
3
- size 6712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c894c75a1cba5d37858d46fa46d51225ce080fd0ac9d5db564f04c7de591383
3
+ size 6776