error577 commited on
Commit
3c06fcf
·
verified ·
1 Parent(s): 4ec891c

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "dense_4h_to_h",
24
- "query_key_value",
25
  "dense",
26
- "dense_h_to_4h"
 
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "dense",
24
+ "dense_4h_to_h",
25
+ "dense_h_to_4h",
26
+ "query_key_value"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:800aee9135c82e7383fe0f102af822e1bde43c1c0bbfd0f4735a6c53573ff11e
3
  size 1579384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6122248efad15f08bb35ef89dbfb4db02d69860b1a9b820e62900a2fb0d6353
3
  size 1579384
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3b8855ae5dc728c904dd8a326be7481b54ef94eb72bd7f239ffbcb5235e93ba
3
  size 857338
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6ba50451f2f4098994e44c0276099046264f1481fc025e6ae04dd5638e8e654
3
  size 857338
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab45fcec9a67a4f49e7f54aed12e065949f921379b9ff9cd3c68696ffda141d0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27038ab56031f405c10aacd6fb0edec15d29b713b4ea2a4062288739d9c449ed
3
  size 14244
last-checkpoint/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0569494882799604,
5
  "eval_steps": 200,
6
  "global_step": 400,
7
  "is_hyper_param_search": false,
@@ -10,2826 +10,2826 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.002641135688345989,
13
- "grad_norm": 36301.09765625,
14
  "learning_rate": 1.0000000000000002e-06,
15
- "loss": 77.7289,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.002641135688345989,
20
- "eval_loss": 10.947957992553711,
21
- "eval_runtime": 2.8903,
22
- "eval_samples_per_second": 171.262,
23
- "eval_steps_per_second": 42.902,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.005282271376691978,
28
- "grad_norm": 58047.49609375,
29
  "learning_rate": 2.0000000000000003e-06,
30
- "loss": 73.7497,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.007923407065037967,
35
- "grad_norm": 99760.890625,
36
  "learning_rate": 3e-06,
37
- "loss": 79.9203,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.010564542753383956,
42
- "grad_norm": 60740.26953125,
43
  "learning_rate": 4.000000000000001e-06,
44
- "loss": 76.8553,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.013205678441729944,
49
- "grad_norm": 80668.09375,
50
  "learning_rate": 5e-06,
51
- "loss": 75.6527,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.015846814130075933,
56
- "grad_norm": 77119.0078125,
57
  "learning_rate": 6e-06,
58
- "loss": 77.4062,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.01848794981842192,
63
- "grad_norm": 110103.203125,
64
  "learning_rate": 7.000000000000001e-06,
65
- "loss": 76.1566,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.02112908550676791,
70
- "grad_norm": 46966.25390625,
71
  "learning_rate": 8.000000000000001e-06,
72
- "loss": 75.0136,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.0237702211951139,
77
- "grad_norm": 52369.109375,
78
  "learning_rate": 9e-06,
79
- "loss": 75.3808,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.02641135688345989,
84
- "grad_norm": 67832.140625,
85
  "learning_rate": 1e-05,
86
- "loss": 74.4742,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.029052492571805876,
91
- "grad_norm": 58381.0078125,
92
  "learning_rate": 1.1000000000000001e-05,
93
- "loss": 72.1378,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.03169362826015187,
98
- "grad_norm": 32421.669921875,
99
  "learning_rate": 1.2e-05,
100
- "loss": 74.1041,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.034334763948497854,
105
- "grad_norm": 53718.72265625,
106
  "learning_rate": 1.3000000000000001e-05,
107
- "loss": 74.6332,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.03697589963684384,
112
- "grad_norm": 96267.078125,
113
  "learning_rate": 1.4000000000000001e-05,
114
- "loss": 75.1989,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.03961703532518983,
119
- "grad_norm": 76299.265625,
120
  "learning_rate": 1.5e-05,
121
- "loss": 71.9961,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.04225817101353582,
126
- "grad_norm": 95350.3671875,
127
  "learning_rate": 1.6000000000000003e-05,
128
- "loss": 75.3688,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.04489930670188181,
133
- "grad_norm": 27829.681640625,
134
  "learning_rate": 1.7000000000000003e-05,
135
- "loss": 71.3448,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.0475404423902278,
140
- "grad_norm": 82636.9609375,
141
  "learning_rate": 1.8e-05,
142
- "loss": 70.2817,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.050181578078573784,
147
- "grad_norm": 32414.7890625,
148
  "learning_rate": 1.9e-05,
149
- "loss": 72.0135,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.05282271376691978,
154
- "grad_norm": 47665.40234375,
155
  "learning_rate": 2e-05,
156
- "loss": 69.6301,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.055463849455265765,
161
- "grad_norm": 32801.06640625,
162
  "learning_rate": 2.1e-05,
163
- "loss": 69.1858,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.05810498514361175,
168
- "grad_norm": 57841.734375,
169
  "learning_rate": 2.2000000000000003e-05,
170
- "loss": 74.4269,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.06074612083195774,
175
- "grad_norm": 977927.5,
176
  "learning_rate": 2.3000000000000003e-05,
177
- "loss": 374.1096,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.06338725652030373,
182
- "grad_norm": 822052.8125,
183
  "learning_rate": 2.4e-05,
184
- "loss": 593.3242,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.06602839220864971,
189
- "grad_norm": 1799951.0,
190
  "learning_rate": 2.5e-05,
191
- "loss": 679.3867,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.06866952789699571,
196
- "grad_norm": 1745750.875,
197
  "learning_rate": 2.6000000000000002e-05,
198
- "loss": 576.8527,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.0713106635853417,
203
- "grad_norm": 1711292.125,
204
  "learning_rate": 2.7000000000000002e-05,
205
- "loss": 607.4575,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.07395179927368768,
210
- "grad_norm": 963344.875,
211
  "learning_rate": 2.8000000000000003e-05,
212
- "loss": 617.1985,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.07659293496203368,
217
- "grad_norm": 1535239.125,
218
  "learning_rate": 2.9e-05,
219
- "loss": 783.793,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.07923407065037966,
224
- "grad_norm": 1122964.75,
225
  "learning_rate": 3e-05,
226
- "loss": 647.9023,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.08187520633872565,
231
- "grad_norm": 969634.125,
232
  "learning_rate": 3.1e-05,
233
- "loss": 517.2833,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.08451634202707164,
238
- "grad_norm": 889313.3125,
239
  "learning_rate": 3.2000000000000005e-05,
240
- "loss": 533.6053,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.08715747771541763,
245
- "grad_norm": 295694.34375,
246
  "learning_rate": 3.3e-05,
247
- "loss": 150.9701,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.08979861340376362,
252
- "grad_norm": 17467.146484375,
253
  "learning_rate": 3.4000000000000007e-05,
254
- "loss": 67.8817,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.09243974909210961,
259
- "grad_norm": 37610.78515625,
260
  "learning_rate": 3.5e-05,
261
- "loss": 68.1266,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.0950808847804556,
266
- "grad_norm": 23183.30859375,
267
  "learning_rate": 3.6e-05,
268
- "loss": 66.7708,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.09772202046880159,
273
- "grad_norm": 21793.3984375,
274
  "learning_rate": 3.7e-05,
275
- "loss": 69.4378,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.10036315615714757,
280
- "grad_norm": 25680.82421875,
281
  "learning_rate": 3.8e-05,
282
- "loss": 67.6288,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.10300429184549356,
287
- "grad_norm": 20375.560546875,
288
  "learning_rate": 3.9000000000000006e-05,
289
- "loss": 68.1002,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.10564542753383956,
294
- "grad_norm": 20276.283203125,
295
  "learning_rate": 4e-05,
296
- "loss": 64.9515,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.10828656322218554,
301
- "grad_norm": 22337.115234375,
302
  "learning_rate": 4.1e-05,
303
- "loss": 66.2982,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.11092769891053153,
308
- "grad_norm": 36014.4765625,
309
  "learning_rate": 4.2e-05,
310
- "loss": 67.0856,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.11356883459887751,
315
- "grad_norm": 55459.51171875,
316
  "learning_rate": 4.3e-05,
317
- "loss": 64.725,
318
  "step": 43
319
  },
320
  {
321
  "epoch": 0.1162099702872235,
322
- "grad_norm": 28144.5546875,
323
  "learning_rate": 4.4000000000000006e-05,
324
- "loss": 66.0296,
325
  "step": 44
326
  },
327
  {
328
  "epoch": 0.1188511059755695,
329
- "grad_norm": 27595.615234375,
330
  "learning_rate": 4.5e-05,
331
- "loss": 64.3844,
332
  "step": 45
333
  },
334
  {
335
  "epoch": 0.12149224166391548,
336
- "grad_norm": 35889.7265625,
337
  "learning_rate": 4.600000000000001e-05,
338
- "loss": 66.3218,
339
  "step": 46
340
  },
341
  {
342
  "epoch": 0.12413337735226147,
343
- "grad_norm": 26046.734375,
344
  "learning_rate": 4.7e-05,
345
- "loss": 66.0898,
346
  "step": 47
347
  },
348
  {
349
  "epoch": 0.12677451304060747,
350
- "grad_norm": 34353.89453125,
351
  "learning_rate": 4.8e-05,
352
- "loss": 67.0278,
353
  "step": 48
354
  },
355
  {
356
  "epoch": 0.12941564872895345,
357
- "grad_norm": 43143.78515625,
358
  "learning_rate": 4.9e-05,
359
- "loss": 71.7618,
360
  "step": 49
361
  },
362
  {
363
  "epoch": 0.13205678441729943,
364
- "grad_norm": 71989.6796875,
365
  "learning_rate": 5e-05,
366
- "loss": 79.8577,
367
  "step": 50
368
  },
369
  {
370
  "epoch": 0.13469792010564544,
371
- "grad_norm": 8369.36328125,
372
  "learning_rate": 5.1000000000000006e-05,
373
- "loss": 68.7658,
374
  "step": 51
375
  },
376
  {
377
  "epoch": 0.13733905579399142,
378
- "grad_norm": 18901.203125,
379
  "learning_rate": 5.2000000000000004e-05,
380
- "loss": 68.0294,
381
  "step": 52
382
  },
383
  {
384
  "epoch": 0.1399801914823374,
385
- "grad_norm": 21011.810546875,
386
  "learning_rate": 5.300000000000001e-05,
387
- "loss": 65.6121,
388
  "step": 53
389
  },
390
  {
391
  "epoch": 0.1426213271706834,
392
- "grad_norm": 50907.6875,
393
  "learning_rate": 5.4000000000000005e-05,
394
- "loss": 67.7626,
395
  "step": 54
396
  },
397
  {
398
  "epoch": 0.14526246285902938,
399
- "grad_norm": 18672.953125,
400
  "learning_rate": 5.500000000000001e-05,
401
- "loss": 67.0266,
402
  "step": 55
403
  },
404
  {
405
  "epoch": 0.14790359854737536,
406
- "grad_norm": 21917.009765625,
407
  "learning_rate": 5.6000000000000006e-05,
408
- "loss": 67.1658,
409
  "step": 56
410
  },
411
  {
412
  "epoch": 0.15054473423572137,
413
- "grad_norm": 15709.1328125,
414
  "learning_rate": 5.6999999999999996e-05,
415
- "loss": 66.1013,
416
  "step": 57
417
  },
418
  {
419
  "epoch": 0.15318586992406735,
420
- "grad_norm": 14906.9873046875,
421
  "learning_rate": 5.8e-05,
422
- "loss": 65.4285,
423
  "step": 58
424
  },
425
  {
426
  "epoch": 0.15582700561241333,
427
- "grad_norm": 14021.7392578125,
428
  "learning_rate": 5.9e-05,
429
- "loss": 67.0967,
430
  "step": 59
431
  },
432
  {
433
  "epoch": 0.1584681413007593,
434
- "grad_norm": 19881.90234375,
435
  "learning_rate": 6e-05,
436
- "loss": 62.4191,
437
  "step": 60
438
  },
439
  {
440
  "epoch": 0.16110927698910532,
441
- "grad_norm": 15729.5654296875,
442
  "learning_rate": 6.1e-05,
443
- "loss": 65.3837,
444
  "step": 61
445
  },
446
  {
447
  "epoch": 0.1637504126774513,
448
- "grad_norm": 15976.841796875,
449
  "learning_rate": 6.2e-05,
450
- "loss": 63.9247,
451
  "step": 62
452
  },
453
  {
454
  "epoch": 0.16639154836579728,
455
- "grad_norm": 17226.25390625,
456
  "learning_rate": 6.3e-05,
457
- "loss": 63.7733,
458
  "step": 63
459
  },
460
  {
461
  "epoch": 0.1690326840541433,
462
- "grad_norm": 12932.412109375,
463
  "learning_rate": 6.400000000000001e-05,
464
- "loss": 63.3991,
465
  "step": 64
466
  },
467
  {
468
  "epoch": 0.17167381974248927,
469
- "grad_norm": 11609.3994140625,
470
  "learning_rate": 6.500000000000001e-05,
471
- "loss": 62.8542,
472
  "step": 65
473
  },
474
  {
475
  "epoch": 0.17431495543083525,
476
- "grad_norm": 16818.75390625,
477
  "learning_rate": 6.6e-05,
478
- "loss": 62.5173,
479
  "step": 66
480
  },
481
  {
482
  "epoch": 0.17695609111918126,
483
- "grad_norm": 12031.3447265625,
484
  "learning_rate": 6.7e-05,
485
- "loss": 62.8865,
486
  "step": 67
487
  },
488
  {
489
  "epoch": 0.17959722680752724,
490
- "grad_norm": 17571.72265625,
491
  "learning_rate": 6.800000000000001e-05,
492
- "loss": 61.6707,
493
  "step": 68
494
  },
495
  {
496
  "epoch": 0.18223836249587322,
497
- "grad_norm": 15332.890625,
498
  "learning_rate": 6.9e-05,
499
- "loss": 62.2125,
500
  "step": 69
501
  },
502
  {
503
  "epoch": 0.18487949818421923,
504
- "grad_norm": 9654.8876953125,
505
  "learning_rate": 7e-05,
506
- "loss": 62.2438,
507
  "step": 70
508
  },
509
  {
510
  "epoch": 0.1875206338725652,
511
- "grad_norm": 14719.578125,
512
  "learning_rate": 7.1e-05,
513
- "loss": 62.446,
514
  "step": 71
515
  },
516
  {
517
  "epoch": 0.1901617695609112,
518
- "grad_norm": 130687.6328125,
519
  "learning_rate": 7.2e-05,
520
- "loss": 142.5886,
521
  "step": 72
522
  },
523
  {
524
  "epoch": 0.19280290524925717,
525
- "grad_norm": 940455.125,
526
  "learning_rate": 7.3e-05,
527
- "loss": 648.629,
528
  "step": 73
529
  },
530
  {
531
  "epoch": 0.19544404093760318,
532
- "grad_norm": 902928.3125,
533
  "learning_rate": 7.4e-05,
534
- "loss": 667.6523,
535
  "step": 74
536
  },
537
  {
538
  "epoch": 0.19808517662594916,
539
- "grad_norm": 666524.25,
540
  "learning_rate": 7.500000000000001e-05,
541
- "loss": 532.2306,
542
  "step": 75
543
  },
544
  {
545
  "epoch": 0.20072631231429514,
546
- "grad_norm": 988706.125,
547
  "learning_rate": 7.6e-05,
548
- "loss": 576.9766,
549
  "step": 76
550
  },
551
  {
552
  "epoch": 0.20336744800264114,
553
- "grad_norm": 883775.5625,
554
  "learning_rate": 7.7e-05,
555
- "loss": 546.8032,
556
  "step": 77
557
  },
558
  {
559
  "epoch": 0.20600858369098712,
560
- "grad_norm": 2083634.75,
561
  "learning_rate": 7.800000000000001e-05,
562
- "loss": 611.6584,
563
  "step": 78
564
  },
565
  {
566
  "epoch": 0.2086497193793331,
567
- "grad_norm": 775611.5625,
568
  "learning_rate": 7.900000000000001e-05,
569
- "loss": 548.3682,
570
  "step": 79
571
  },
572
  {
573
  "epoch": 0.2112908550676791,
574
- "grad_norm": 562781.5625,
575
  "learning_rate": 8e-05,
576
- "loss": 523.7821,
577
  "step": 80
578
  },
579
  {
580
  "epoch": 0.2139319907560251,
581
- "grad_norm": 421653.0625,
582
  "learning_rate": 8.1e-05,
583
- "loss": 449.1597,
584
  "step": 81
585
  },
586
  {
587
  "epoch": 0.21657312644437107,
588
- "grad_norm": 579641.8125,
589
  "learning_rate": 8.2e-05,
590
- "loss": 360.1368,
591
  "step": 82
592
  },
593
  {
594
  "epoch": 0.21921426213271708,
595
- "grad_norm": 230648.234375,
596
  "learning_rate": 8.3e-05,
597
- "loss": 72.1037,
598
  "step": 83
599
  },
600
  {
601
  "epoch": 0.22185539782106306,
602
- "grad_norm": 25170.8203125,
603
  "learning_rate": 8.4e-05,
604
- "loss": 63.1604,
605
  "step": 84
606
  },
607
  {
608
  "epoch": 0.22449653350940904,
609
- "grad_norm": 60256.33984375,
610
  "learning_rate": 8.5e-05,
611
- "loss": 66.941,
612
  "step": 85
613
  },
614
  {
615
  "epoch": 0.22713766919775502,
616
- "grad_norm": 27881.91015625,
617
  "learning_rate": 8.6e-05,
618
- "loss": 63.0213,
619
  "step": 86
620
  },
621
  {
622
  "epoch": 0.22977880488610103,
623
- "grad_norm": 14699.3232421875,
624
  "learning_rate": 8.7e-05,
625
- "loss": 63.4355,
626
  "step": 87
627
  },
628
  {
629
  "epoch": 0.232419940574447,
630
- "grad_norm": 27491.953125,
631
  "learning_rate": 8.800000000000001e-05,
632
- "loss": 60.2028,
633
  "step": 88
634
  },
635
  {
636
  "epoch": 0.235061076262793,
637
- "grad_norm": 15658.068359375,
638
  "learning_rate": 8.900000000000001e-05,
639
- "loss": 58.5889,
640
  "step": 89
641
  },
642
  {
643
  "epoch": 0.237702211951139,
644
- "grad_norm": 16560.603515625,
645
  "learning_rate": 9e-05,
646
- "loss": 60.2038,
647
  "step": 90
648
  },
649
  {
650
  "epoch": 0.24034334763948498,
651
- "grad_norm": 11106.2470703125,
652
  "learning_rate": 9.1e-05,
653
- "loss": 59.1099,
654
  "step": 91
655
  },
656
  {
657
  "epoch": 0.24298448332783096,
658
- "grad_norm": 13895.1904296875,
659
  "learning_rate": 9.200000000000001e-05,
660
- "loss": 60.0216,
661
  "step": 92
662
  },
663
  {
664
  "epoch": 0.24562561901617697,
665
- "grad_norm": 13232.1796875,
666
  "learning_rate": 9.300000000000001e-05,
667
- "loss": 60.0393,
668
  "step": 93
669
  },
670
  {
671
  "epoch": 0.24826675470452295,
672
- "grad_norm": 16614.0703125,
673
  "learning_rate": 9.4e-05,
674
- "loss": 59.1421,
675
  "step": 94
676
  },
677
  {
678
  "epoch": 0.2509078903928689,
679
- "grad_norm": 17079.46484375,
680
  "learning_rate": 9.5e-05,
681
- "loss": 59.6571,
682
  "step": 95
683
  },
684
  {
685
  "epoch": 0.25354902608121493,
686
- "grad_norm": 10043.0849609375,
687
  "learning_rate": 9.6e-05,
688
- "loss": 58.3455,
689
  "step": 96
690
  },
691
  {
692
  "epoch": 0.2561901617695609,
693
- "grad_norm": 13873.9345703125,
694
  "learning_rate": 9.7e-05,
695
- "loss": 61.7211,
696
  "step": 97
697
  },
698
  {
699
  "epoch": 0.2588312974579069,
700
- "grad_norm": 15096.0869140625,
701
  "learning_rate": 9.8e-05,
702
- "loss": 65.4602,
703
  "step": 98
704
  },
705
  {
706
  "epoch": 0.2614724331462529,
707
- "grad_norm": 16339.740234375,
708
  "learning_rate": 9.900000000000001e-05,
709
- "loss": 66.4909,
710
  "step": 99
711
  },
712
  {
713
  "epoch": 0.26411356883459886,
714
- "grad_norm": 60479.21875,
715
  "learning_rate": 0.0001,
716
- "loss": 75.1021,
717
  "step": 100
718
  },
719
  {
720
  "epoch": 0.26675470452294486,
721
- "grad_norm": 6854.955078125,
722
  "learning_rate": 9.99999993018897e-05,
723
- "loss": 58.5582,
724
  "step": 101
725
  },
726
  {
727
  "epoch": 0.26939584021129087,
728
- "grad_norm": 7745.33740234375,
729
  "learning_rate": 9.999999720755877e-05,
730
- "loss": 56.6808,
731
  "step": 102
732
  },
733
  {
734
  "epoch": 0.2720369758996368,
735
- "grad_norm": 9056.62109375,
736
  "learning_rate": 9.99999937170073e-05,
737
- "loss": 56.6537,
738
  "step": 103
739
  },
740
  {
741
  "epoch": 0.27467811158798283,
742
- "grad_norm": 9596.931640625,
743
  "learning_rate": 9.999998883023537e-05,
744
- "loss": 59.4572,
745
  "step": 104
746
  },
747
  {
748
  "epoch": 0.27731924727632884,
749
- "grad_norm": 9967.4287109375,
750
  "learning_rate": 9.999998254724313e-05,
751
- "loss": 60.9963,
752
  "step": 105
753
  },
754
  {
755
  "epoch": 0.2799603829646748,
756
- "grad_norm": 7881.37158203125,
757
  "learning_rate": 9.999997486803075e-05,
758
- "loss": 59.1903,
759
  "step": 106
760
  },
761
  {
762
  "epoch": 0.2826015186530208,
763
- "grad_norm": 7984.412109375,
764
  "learning_rate": 9.999996579259843e-05,
765
- "loss": 57.3962,
766
  "step": 107
767
  },
768
  {
769
  "epoch": 0.2852426543413668,
770
- "grad_norm": 8951.9453125,
771
  "learning_rate": 9.999995532094644e-05,
772
- "loss": 60.8434,
773
  "step": 108
774
  },
775
  {
776
  "epoch": 0.28788379002971276,
777
- "grad_norm": 9131.03515625,
778
  "learning_rate": 9.999994345307508e-05,
779
- "loss": 58.7974,
780
  "step": 109
781
  },
782
  {
783
  "epoch": 0.29052492571805877,
784
- "grad_norm": 7934.9453125,
785
  "learning_rate": 9.999993018898466e-05,
786
- "loss": 57.2941,
787
  "step": 110
788
  },
789
  {
790
  "epoch": 0.2931660614064048,
791
- "grad_norm": 7639.3134765625,
792
  "learning_rate": 9.999991552867558e-05,
793
- "loss": 57.9149,
794
  "step": 111
795
  },
796
  {
797
  "epoch": 0.29580719709475073,
798
- "grad_norm": 9613.30078125,
799
  "learning_rate": 9.99998994721482e-05,
800
- "loss": 59.5527,
801
  "step": 112
802
  },
803
  {
804
  "epoch": 0.29844833278309674,
805
- "grad_norm": 9338.255859375,
806
  "learning_rate": 9.999988201940302e-05,
807
- "loss": 59.147,
808
  "step": 113
809
  },
810
  {
811
  "epoch": 0.30108946847144274,
812
- "grad_norm": 7751.587890625,
813
  "learning_rate": 9.999986317044051e-05,
814
- "loss": 56.4785,
815
  "step": 114
816
  },
817
  {
818
  "epoch": 0.3037306041597887,
819
- "grad_norm": 10206.54296875,
820
  "learning_rate": 9.999984292526118e-05,
821
- "loss": 57.9974,
822
  "step": 115
823
  },
824
  {
825
  "epoch": 0.3063717398481347,
826
- "grad_norm": 8300.5419921875,
827
  "learning_rate": 9.999982128386562e-05,
828
- "loss": 58.3521,
829
  "step": 116
830
  },
831
  {
832
  "epoch": 0.3090128755364807,
833
- "grad_norm": 10024.880859375,
834
  "learning_rate": 9.99997982462544e-05,
835
- "loss": 57.092,
836
  "step": 117
837
  },
838
  {
839
  "epoch": 0.31165401122482667,
840
- "grad_norm": 11154.408203125,
841
  "learning_rate": 9.999977381242821e-05,
842
- "loss": 58.9941,
843
  "step": 118
844
  },
845
  {
846
  "epoch": 0.3142951469131727,
847
- "grad_norm": 9249.49609375,
848
  "learning_rate": 9.999974798238769e-05,
849
- "loss": 56.6213,
850
  "step": 119
851
  },
852
  {
853
  "epoch": 0.3169362826015186,
854
- "grad_norm": 8185.81689453125,
855
  "learning_rate": 9.99997207561336e-05,
856
- "loss": 58.972,
857
  "step": 120
858
  },
859
  {
860
  "epoch": 0.31957741828986463,
861
- "grad_norm": 8558.0712890625,
862
  "learning_rate": 9.999969213366667e-05,
863
- "loss": 57.1556,
864
  "step": 121
865
  },
866
  {
867
  "epoch": 0.32221855397821064,
868
- "grad_norm": 22493.716796875,
869
  "learning_rate": 9.99996621149877e-05,
870
- "loss": 72.2707,
871
  "step": 122
872
  },
873
  {
874
  "epoch": 0.3248596896665566,
875
- "grad_norm": 398594.625,
876
  "learning_rate": 9.999963070009755e-05,
877
- "loss": 406.1104,
878
  "step": 123
879
  },
880
  {
881
  "epoch": 0.3275008253549026,
882
- "grad_norm": 361898.625,
883
  "learning_rate": 9.999959788899706e-05,
884
- "loss": 667.9355,
885
  "step": 124
886
  },
887
  {
888
  "epoch": 0.3301419610432486,
889
- "grad_norm": 437867.09375,
890
  "learning_rate": 9.999956368168719e-05,
891
- "loss": 615.7256,
892
  "step": 125
893
  },
894
  {
895
  "epoch": 0.33278309673159456,
896
- "grad_norm": 301105.25,
897
  "learning_rate": 9.999952807816888e-05,
898
- "loss": 594.6784,
899
  "step": 126
900
  },
901
  {
902
  "epoch": 0.33542423241994057,
903
- "grad_norm": 792771.375,
904
  "learning_rate": 9.99994910784431e-05,
905
- "loss": 595.7666,
906
  "step": 127
907
  },
908
  {
909
  "epoch": 0.3380653681082866,
910
- "grad_norm": 329489.96875,
911
  "learning_rate": 9.999945268251092e-05,
912
- "loss": 508.584,
913
  "step": 128
914
  },
915
  {
916
  "epoch": 0.34070650379663253,
917
- "grad_norm": 470539.6875,
918
  "learning_rate": 9.999941289037338e-05,
919
- "loss": 558.1961,
920
  "step": 129
921
  },
922
  {
923
  "epoch": 0.34334763948497854,
924
- "grad_norm": 1048945.0,
925
  "learning_rate": 9.999937170203162e-05,
926
- "loss": 474.5325,
927
  "step": 130
928
  },
929
  {
930
  "epoch": 0.34598877517332455,
931
- "grad_norm": 778740.3125,
932
  "learning_rate": 9.999932911748678e-05,
933
- "loss": 475.9576,
934
  "step": 131
935
  },
936
  {
937
  "epoch": 0.3486299108616705,
938
- "grad_norm": 596922.1875,
939
  "learning_rate": 9.999928513674004e-05,
940
- "loss": 228.0436,
941
  "step": 132
942
  },
943
  {
944
  "epoch": 0.3512710465500165,
945
- "grad_norm": 14506.12890625,
946
  "learning_rate": 9.999923975979262e-05,
947
- "loss": 61.4294,
948
  "step": 133
949
  },
950
  {
951
  "epoch": 0.3539121822383625,
952
- "grad_norm": 18508.80859375,
953
  "learning_rate": 9.999919298664582e-05,
954
- "loss": 60.2299,
955
  "step": 134
956
  },
957
  {
958
  "epoch": 0.35655331792670847,
959
- "grad_norm": 24954.515625,
960
  "learning_rate": 9.999914481730092e-05,
961
- "loss": 60.3748,
962
  "step": 135
963
  },
964
  {
965
  "epoch": 0.3591944536150545,
966
- "grad_norm": 28892.380859375,
967
  "learning_rate": 9.999909525175927e-05,
968
- "loss": 59.6711,
969
  "step": 136
970
  },
971
  {
972
  "epoch": 0.3618355893034005,
973
- "grad_norm": 11845.759765625,
974
  "learning_rate": 9.999904429002225e-05,
975
- "loss": 59.5442,
976
  "step": 137
977
  },
978
  {
979
  "epoch": 0.36447672499174644,
980
- "grad_norm": 12252.9384765625,
981
  "learning_rate": 9.99989919320913e-05,
982
- "loss": 59.1238,
983
  "step": 138
984
  },
985
  {
986
  "epoch": 0.36711786068009244,
987
- "grad_norm": 14152.0126953125,
988
  "learning_rate": 9.999893817796786e-05,
989
- "loss": 57.509,
990
  "step": 139
991
  },
992
  {
993
  "epoch": 0.36975899636843845,
994
- "grad_norm": 18928.748046875,
995
  "learning_rate": 9.999888302765345e-05,
996
- "loss": 58.0275,
997
  "step": 140
998
  },
999
  {
1000
  "epoch": 0.3724001320567844,
1001
- "grad_norm": 15042.6201171875,
1002
  "learning_rate": 9.99988264811496e-05,
1003
- "loss": 57.775,
1004
  "step": 141
1005
  },
1006
  {
1007
  "epoch": 0.3750412677451304,
1008
- "grad_norm": 38463.7265625,
1009
  "learning_rate": 9.99987685384579e-05,
1010
- "loss": 57.786,
1011
  "step": 142
1012
  },
1013
  {
1014
  "epoch": 0.3776824034334764,
1015
- "grad_norm": 15558.212890625,
1016
  "learning_rate": 9.999870919957996e-05,
1017
- "loss": 57.7509,
1018
  "step": 143
1019
  },
1020
  {
1021
  "epoch": 0.3803235391218224,
1022
- "grad_norm": 12991.3076171875,
1023
  "learning_rate": 9.999864846451744e-05,
1024
- "loss": 59.5253,
1025
  "step": 144
1026
  },
1027
  {
1028
  "epoch": 0.3829646748101684,
1029
- "grad_norm": 23788.37890625,
1030
  "learning_rate": 9.999858633327201e-05,
1031
- "loss": 61.2206,
1032
  "step": 145
1033
  },
1034
  {
1035
  "epoch": 0.38560581049851433,
1036
- "grad_norm": 22398.08203125,
1037
  "learning_rate": 9.999852280584544e-05,
1038
- "loss": 60.5845,
1039
  "step": 146
1040
  },
1041
  {
1042
  "epoch": 0.38824694618686034,
1043
- "grad_norm": 14377.44921875,
1044
  "learning_rate": 9.999845788223949e-05,
1045
- "loss": 62.4098,
1046
  "step": 147
1047
  },
1048
  {
1049
  "epoch": 0.39088808187520635,
1050
- "grad_norm": 27776.720703125,
1051
  "learning_rate": 9.999839156245598e-05,
1052
- "loss": 65.2689,
1053
  "step": 148
1054
  },
1055
  {
1056
  "epoch": 0.3935292175635523,
1057
- "grad_norm": 51893.64453125,
1058
  "learning_rate": 9.999832384649674e-05,
1059
- "loss": 68.9281,
1060
  "step": 149
1061
  },
1062
  {
1063
  "epoch": 0.3961703532518983,
1064
- "grad_norm": 52230.5078125,
1065
  "learning_rate": 9.999825473436369e-05,
1066
- "loss": 80.9858,
1067
  "step": 150
1068
  },
1069
  {
1070
  "epoch": 0.3988114889402443,
1071
- "grad_norm": 5408.37353515625,
1072
  "learning_rate": 9.999818422605875e-05,
1073
- "loss": 62.4323,
1074
  "step": 151
1075
  },
1076
  {
1077
  "epoch": 0.40145262462859027,
1078
- "grad_norm": 10959.0517578125,
1079
  "learning_rate": 9.999811232158389e-05,
1080
- "loss": 58.8699,
1081
  "step": 152
1082
  },
1083
  {
1084
  "epoch": 0.4040937603169363,
1085
- "grad_norm": 8163.58935546875,
1086
  "learning_rate": 9.999803902094109e-05,
1087
- "loss": 60.9293,
1088
  "step": 153
1089
  },
1090
  {
1091
  "epoch": 0.4067348960052823,
1092
- "grad_norm": 9621.8291015625,
1093
  "learning_rate": 9.999796432413244e-05,
1094
- "loss": 62.2098,
1095
  "step": 154
1096
  },
1097
  {
1098
  "epoch": 0.40937603169362824,
1099
- "grad_norm": 11912.89453125,
1100
  "learning_rate": 9.999788823116001e-05,
1101
- "loss": 63.1052,
1102
  "step": 155
1103
  },
1104
  {
1105
  "epoch": 0.41201716738197425,
1106
- "grad_norm": 8239.365234375,
1107
  "learning_rate": 9.999781074202592e-05,
1108
- "loss": 62.9719,
1109
  "step": 156
1110
  },
1111
  {
1112
  "epoch": 0.41465830307032026,
1113
- "grad_norm": 26177.796875,
1114
  "learning_rate": 9.999773185673232e-05,
1115
- "loss": 66.0002,
1116
  "step": 157
1117
  },
1118
  {
1119
  "epoch": 0.4172994387586662,
1120
- "grad_norm": 9830.423828125,
1121
  "learning_rate": 9.999765157528145e-05,
1122
- "loss": 65.7361,
1123
  "step": 158
1124
  },
1125
  {
1126
  "epoch": 0.4199405744470122,
1127
- "grad_norm": 8552.87109375,
1128
  "learning_rate": 9.99975698976755e-05,
1129
- "loss": 64.1891,
1130
  "step": 159
1131
  },
1132
  {
1133
  "epoch": 0.4225817101353582,
1134
- "grad_norm": 10444.751953125,
1135
  "learning_rate": 9.99974868239168e-05,
1136
- "loss": 65.8855,
1137
  "step": 160
1138
  },
1139
  {
1140
  "epoch": 0.4252228458237042,
1141
- "grad_norm": 12173.095703125,
1142
  "learning_rate": 9.999740235400765e-05,
1143
- "loss": 63.3443,
1144
  "step": 161
1145
  },
1146
  {
1147
  "epoch": 0.4278639815120502,
1148
- "grad_norm": 11087.4599609375,
1149
  "learning_rate": 9.999731648795041e-05,
1150
- "loss": 65.6896,
1151
  "step": 162
1152
  },
1153
  {
1154
  "epoch": 0.4305051172003962,
1155
- "grad_norm": 10908.625,
1156
  "learning_rate": 9.999722922574749e-05,
1157
- "loss": 65.4028,
1158
  "step": 163
1159
  },
1160
  {
1161
  "epoch": 0.43314625288874214,
1162
- "grad_norm": 7291.02001953125,
1163
  "learning_rate": 9.999714056740129e-05,
1164
- "loss": 63.5922,
1165
  "step": 164
1166
  },
1167
  {
1168
  "epoch": 0.43578738857708815,
1169
- "grad_norm": 6860.20458984375,
1170
  "learning_rate": 9.999705051291432e-05,
1171
- "loss": 60.2327,
1172
  "step": 165
1173
  },
1174
  {
1175
  "epoch": 0.43842852426543416,
1176
- "grad_norm": 7382.8486328125,
1177
  "learning_rate": 9.999695906228908e-05,
1178
- "loss": 61.5298,
1179
  "step": 166
1180
  },
1181
  {
1182
  "epoch": 0.4410696599537801,
1183
- "grad_norm": 8748.15625,
1184
  "learning_rate": 9.999686621552813e-05,
1185
- "loss": 60.3499,
1186
  "step": 167
1187
  },
1188
  {
1189
  "epoch": 0.4437107956421261,
1190
- "grad_norm": 7557.123046875,
1191
  "learning_rate": 9.999677197263406e-05,
1192
- "loss": 61.2833,
1193
  "step": 168
1194
  },
1195
  {
1196
  "epoch": 0.44635193133047213,
1197
- "grad_norm": 9785.4970703125,
1198
  "learning_rate": 9.999667633360952e-05,
1199
- "loss": 59.1488,
1200
  "step": 169
1201
  },
1202
  {
1203
  "epoch": 0.4489930670188181,
1204
- "grad_norm": 6642.07568359375,
1205
  "learning_rate": 9.999657929845714e-05,
1206
- "loss": 60.2657,
1207
  "step": 170
1208
  },
1209
  {
1210
  "epoch": 0.4516342027071641,
1211
- "grad_norm": 7288.35791015625,
1212
  "learning_rate": 9.999648086717966e-05,
1213
- "loss": 58.5977,
1214
  "step": 171
1215
  },
1216
  {
1217
  "epoch": 0.45427533839551004,
1218
- "grad_norm": 9205.5859375,
1219
  "learning_rate": 9.999638103977982e-05,
1220
- "loss": 60.8046,
1221
  "step": 172
1222
  },
1223
  {
1224
  "epoch": 0.45691647408385605,
1225
- "grad_norm": 205730.28125,
1226
  "learning_rate": 9.999627981626041e-05,
1227
- "loss": 253.837,
1228
  "step": 173
1229
  },
1230
  {
1231
  "epoch": 0.45955760977220206,
1232
- "grad_norm": 292243.1875,
1233
  "learning_rate": 9.999617719662426e-05,
1234
- "loss": 577.5921,
1235
  "step": 174
1236
  },
1237
  {
1238
  "epoch": 0.462198745460548,
1239
- "grad_norm": 277600.3125,
1240
  "learning_rate": 9.999607318087423e-05,
1241
- "loss": 472.6887,
1242
  "step": 175
1243
  },
1244
  {
1245
  "epoch": 0.464839881148894,
1246
- "grad_norm": 338165.75,
1247
  "learning_rate": 9.999596776901322e-05,
1248
- "loss": 514.7128,
1249
  "step": 176
1250
  },
1251
  {
1252
  "epoch": 0.46748101683724,
1253
- "grad_norm": 315665.3125,
1254
  "learning_rate": 9.999586096104419e-05,
1255
- "loss": 483.1053,
1256
  "step": 177
1257
  },
1258
  {
1259
  "epoch": 0.470122152525586,
1260
- "grad_norm": 571927.125,
1261
  "learning_rate": 9.99957527569701e-05,
1262
- "loss": 511.9148,
1263
  "step": 178
1264
  },
1265
  {
1266
  "epoch": 0.472763288213932,
1267
- "grad_norm": 454109.53125,
1268
  "learning_rate": 9.999564315679398e-05,
1269
- "loss": 492.4402,
1270
  "step": 179
1271
  },
1272
  {
1273
  "epoch": 0.475404423902278,
1274
- "grad_norm": 706091.5,
1275
  "learning_rate": 9.99955321605189e-05,
1276
- "loss": 368.7538,
1277
  "step": 180
1278
  },
1279
  {
1280
  "epoch": 0.47804555959062395,
1281
- "grad_norm": 348701.1875,
1282
  "learning_rate": 9.999541976814796e-05,
1283
- "loss": 388.6287,
1284
  "step": 181
1285
  },
1286
  {
1287
  "epoch": 0.48068669527896996,
1288
- "grad_norm": 230977.109375,
1289
  "learning_rate": 9.999530597968428e-05,
1290
- "loss": 256.7187,
1291
  "step": 182
1292
  },
1293
  {
1294
  "epoch": 0.48332783096731596,
1295
- "grad_norm": 18771.59375,
1296
  "learning_rate": 9.999519079513107e-05,
1297
- "loss": 66.906,
1298
  "step": 183
1299
  },
1300
  {
1301
  "epoch": 0.4859689666556619,
1302
- "grad_norm": 24637.7421875,
1303
  "learning_rate": 9.999507421449151e-05,
1304
- "loss": 72.0359,
1305
  "step": 184
1306
  },
1307
  {
1308
  "epoch": 0.4886101023440079,
1309
- "grad_norm": 31083.162109375,
1310
  "learning_rate": 9.999495623776886e-05,
1311
- "loss": 68.0093,
1312
  "step": 185
1313
  },
1314
  {
1315
  "epoch": 0.49125123803235393,
1316
- "grad_norm": 30000.046875,
1317
  "learning_rate": 9.999483686496645e-05,
1318
- "loss": 66.7047,
1319
  "step": 186
1320
  },
1321
  {
1322
  "epoch": 0.4938923737206999,
1323
- "grad_norm": 18476.466796875,
1324
  "learning_rate": 9.999471609608757e-05,
1325
- "loss": 62.4154,
1326
  "step": 187
1327
  },
1328
  {
1329
  "epoch": 0.4965335094090459,
1330
- "grad_norm": 19716.6640625,
1331
  "learning_rate": 9.999459393113561e-05,
1332
- "loss": 63.3046,
1333
  "step": 188
1334
  },
1335
  {
1336
  "epoch": 0.4991746450973919,
1337
- "grad_norm": 20434.39453125,
1338
  "learning_rate": 9.9994470370114e-05,
1339
- "loss": 60.0479,
1340
  "step": 189
1341
  },
1342
  {
1343
  "epoch": 0.5018157807857379,
1344
- "grad_norm": 36585.4140625,
1345
  "learning_rate": 9.999434541302616e-05,
1346
- "loss": 59.0921,
1347
  "step": 190
1348
  },
1349
  {
1350
  "epoch": 0.5044569164740839,
1351
- "grad_norm": 9018.53125,
1352
  "learning_rate": 9.99942190598756e-05,
1353
- "loss": 58.4702,
1354
  "step": 191
1355
  },
1356
  {
1357
  "epoch": 0.5070980521624299,
1358
- "grad_norm": 12309.4326171875,
1359
  "learning_rate": 9.999409131066583e-05,
1360
- "loss": 59.0886,
1361
  "step": 192
1362
  },
1363
  {
1364
  "epoch": 0.5097391878507759,
1365
- "grad_norm": 12166.2783203125,
1366
  "learning_rate": 9.999396216540044e-05,
1367
- "loss": 59.2468,
1368
  "step": 193
1369
  },
1370
  {
1371
  "epoch": 0.5123803235391218,
1372
- "grad_norm": 14935.9521484375,
1373
  "learning_rate": 9.999383162408304e-05,
1374
- "loss": 60.0897,
1375
  "step": 194
1376
  },
1377
  {
1378
  "epoch": 0.5150214592274678,
1379
- "grad_norm": 8938.1298828125,
1380
  "learning_rate": 9.999369968671723e-05,
1381
- "loss": 60.3347,
1382
  "step": 195
1383
  },
1384
  {
1385
  "epoch": 0.5176625949158138,
1386
- "grad_norm": 17657.203125,
1387
  "learning_rate": 9.999356635330674e-05,
1388
- "loss": 58.8355,
1389
  "step": 196
1390
  },
1391
  {
1392
  "epoch": 0.5203037306041598,
1393
- "grad_norm": 11906.60546875,
1394
  "learning_rate": 9.999343162385529e-05,
1395
- "loss": 62.4666,
1396
  "step": 197
1397
  },
1398
  {
1399
  "epoch": 0.5229448662925058,
1400
- "grad_norm": 16041.3076171875,
1401
  "learning_rate": 9.99932954983666e-05,
1402
- "loss": 62.736,
1403
  "step": 198
1404
  },
1405
  {
1406
  "epoch": 0.5255860019808518,
1407
- "grad_norm": 17839.142578125,
1408
  "learning_rate": 9.999315797684451e-05,
1409
- "loss": 66.5862,
1410
  "step": 199
1411
  },
1412
  {
1413
  "epoch": 0.5282271376691977,
1414
- "grad_norm": 38838.1328125,
1415
  "learning_rate": 9.999301905929286e-05,
1416
- "loss": 88.0897,
1417
  "step": 200
1418
  },
1419
  {
1420
  "epoch": 0.5282271376691977,
1421
- "eval_loss": 10.59700870513916,
1422
- "eval_runtime": 2.1454,
1423
- "eval_samples_per_second": 230.732,
1424
- "eval_steps_per_second": 57.799,
1425
  "step": 200
1426
  },
1427
  {
1428
  "epoch": 0.5308682733575437,
1429
- "grad_norm": 3694.95458984375,
1430
  "learning_rate": 9.999287874571552e-05,
1431
- "loss": 70.7912,
1432
  "step": 201
1433
  },
1434
  {
1435
  "epoch": 0.5335094090458897,
1436
- "grad_norm": 7569.90283203125,
1437
  "learning_rate": 9.99927370361164e-05,
1438
- "loss": 70.1254,
1439
  "step": 202
1440
  },
1441
  {
1442
  "epoch": 0.5361505447342357,
1443
- "grad_norm": 11925.9208984375,
1444
  "learning_rate": 9.999259393049947e-05,
1445
- "loss": 67.0008,
1446
  "step": 203
1447
  },
1448
  {
1449
  "epoch": 0.5387916804225817,
1450
- "grad_norm": 11617.2470703125,
1451
  "learning_rate": 9.999244942886871e-05,
1452
- "loss": 68.452,
1453
  "step": 204
1454
  },
1455
  {
1456
  "epoch": 0.5414328161109278,
1457
- "grad_norm": 10854.8876953125,
1458
  "learning_rate": 9.999230353122819e-05,
1459
- "loss": 73.5345,
1460
  "step": 205
1461
  },
1462
  {
1463
  "epoch": 0.5440739517992736,
1464
- "grad_norm": 10908.919921875,
1465
  "learning_rate": 9.999215623758194e-05,
1466
- "loss": 73.0862,
1467
  "step": 206
1468
  },
1469
  {
1470
  "epoch": 0.5467150874876197,
1471
- "grad_norm": 11733.8857421875,
1472
  "learning_rate": 9.99920075479341e-05,
1473
- "loss": 67.9017,
1474
  "step": 207
1475
  },
1476
  {
1477
  "epoch": 0.5493562231759657,
1478
- "grad_norm": 11104.685546875,
1479
  "learning_rate": 9.999185746228882e-05,
1480
- "loss": 70.7617,
1481
  "step": 208
1482
  },
1483
  {
1484
  "epoch": 0.5519973588643117,
1485
- "grad_norm": 11203.22265625,
1486
  "learning_rate": 9.999170598065028e-05,
1487
- "loss": 70.7774,
1488
  "step": 209
1489
  },
1490
  {
1491
  "epoch": 0.5546384945526577,
1492
- "grad_norm": 11004.7236328125,
1493
  "learning_rate": 9.999155310302273e-05,
1494
- "loss": 71.0292,
1495
  "step": 210
1496
  },
1497
  {
1498
  "epoch": 0.5572796302410036,
1499
- "grad_norm": 19437.9375,
1500
  "learning_rate": 9.999139882941043e-05,
1501
- "loss": 67.4517,
1502
  "step": 211
1503
  },
1504
  {
1505
  "epoch": 0.5599207659293496,
1506
- "grad_norm": 10864.544921875,
1507
  "learning_rate": 9.999124315981766e-05,
1508
- "loss": 66.7299,
1509
  "step": 212
1510
  },
1511
  {
1512
  "epoch": 0.5625619016176956,
1513
- "grad_norm": 9301.4072265625,
1514
  "learning_rate": 9.999108609424881e-05,
1515
- "loss": 66.4239,
1516
  "step": 213
1517
  },
1518
  {
1519
  "epoch": 0.5652030373060416,
1520
- "grad_norm": 10588.080078125,
1521
  "learning_rate": 9.999092763270823e-05,
1522
- "loss": 66.0725,
1523
  "step": 214
1524
  },
1525
  {
1526
  "epoch": 0.5678441729943876,
1527
- "grad_norm": 8979.6298828125,
1528
  "learning_rate": 9.999076777520037e-05,
1529
- "loss": 65.9684,
1530
  "step": 215
1531
  },
1532
  {
1533
  "epoch": 0.5704853086827336,
1534
- "grad_norm": 11648.29296875,
1535
  "learning_rate": 9.99906065217297e-05,
1536
- "loss": 64.9926,
1537
  "step": 216
1538
  },
1539
  {
1540
  "epoch": 0.5731264443710795,
1541
- "grad_norm": 8562.33984375,
1542
  "learning_rate": 9.99904438723007e-05,
1543
- "loss": 63.8293,
1544
  "step": 217
1545
  },
1546
  {
1547
  "epoch": 0.5757675800594255,
1548
- "grad_norm": 9085.8671875,
1549
  "learning_rate": 9.999027982691793e-05,
1550
- "loss": 64.2146,
1551
  "step": 218
1552
  },
1553
  {
1554
  "epoch": 0.5784087157477715,
1555
- "grad_norm": 11899.927734375,
1556
  "learning_rate": 9.999011438558595e-05,
1557
- "loss": 66.3891,
1558
  "step": 219
1559
  },
1560
  {
1561
  "epoch": 0.5810498514361175,
1562
- "grad_norm": 9598.9765625,
1563
  "learning_rate": 9.99899475483094e-05,
1564
- "loss": 64.4793,
1565
  "step": 220
1566
  },
1567
  {
1568
  "epoch": 0.5836909871244635,
1569
- "grad_norm": 25579.998046875,
1570
  "learning_rate": 9.998977931509291e-05,
1571
- "loss": 81.3959,
1572
  "step": 221
1573
  },
1574
  {
1575
  "epoch": 0.5863321228128096,
1576
- "grad_norm": 245254.65625,
1577
  "learning_rate": 9.998960968594121e-05,
1578
- "loss": 284.9948,
1579
  "step": 222
1580
  },
1581
  {
1582
  "epoch": 0.5889732585011554,
1583
- "grad_norm": 461914.90625,
1584
  "learning_rate": 9.998943866085903e-05,
1585
- "loss": 593.3946,
1586
  "step": 223
1587
  },
1588
  {
1589
  "epoch": 0.5916143941895015,
1590
- "grad_norm": 585288.625,
1591
  "learning_rate": 9.998926623985114e-05,
1592
- "loss": 503.4478,
1593
  "step": 224
1594
  },
1595
  {
1596
  "epoch": 0.5942555298778475,
1597
- "grad_norm": 269043.96875,
1598
  "learning_rate": 9.998909242292235e-05,
1599
- "loss": 707.6039,
1600
  "step": 225
1601
  },
1602
  {
1603
  "epoch": 0.5968966655661935,
1604
- "grad_norm": 381116.90625,
1605
  "learning_rate": 9.998891721007752e-05,
1606
- "loss": 658.0789,
1607
  "step": 226
1608
  },
1609
  {
1610
  "epoch": 0.5995378012545395,
1611
- "grad_norm": 337170.28125,
1612
  "learning_rate": 9.998874060132155e-05,
1613
- "loss": 471.6391,
1614
  "step": 227
1615
  },
1616
  {
1617
  "epoch": 0.6021789369428855,
1618
- "grad_norm": 825105.6875,
1619
  "learning_rate": 9.998856259665936e-05,
1620
- "loss": 563.7874,
1621
  "step": 228
1622
  },
1623
  {
1624
  "epoch": 0.6048200726312314,
1625
- "grad_norm": 796742.5625,
1626
  "learning_rate": 9.998838319609591e-05,
1627
- "loss": 596.4635,
1628
  "step": 229
1629
  },
1630
  {
1631
  "epoch": 0.6074612083195774,
1632
- "grad_norm": 948548.125,
1633
  "learning_rate": 9.998820239963624e-05,
1634
- "loss": 609.4591,
1635
  "step": 230
1636
  },
1637
  {
1638
  "epoch": 0.6101023440079234,
1639
- "grad_norm": 808582.0,
1640
  "learning_rate": 9.998802020728537e-05,
1641
- "loss": 401.162,
1642
  "step": 231
1643
  },
1644
  {
1645
  "epoch": 0.6127434796962694,
1646
- "grad_norm": 13525.0390625,
1647
  "learning_rate": 9.998783661904843e-05,
1648
- "loss": 69.3578,
1649
  "step": 232
1650
  },
1651
  {
1652
  "epoch": 0.6153846153846154,
1653
- "grad_norm": 20728.669921875,
1654
  "learning_rate": 9.99876516349305e-05,
1655
- "loss": 71.1247,
1656
  "step": 233
1657
  },
1658
  {
1659
  "epoch": 0.6180257510729614,
1660
- "grad_norm": 15068.5966796875,
1661
  "learning_rate": 9.998746525493674e-05,
1662
- "loss": 65.7083,
1663
  "step": 234
1664
  },
1665
  {
1666
  "epoch": 0.6206668867613073,
1667
- "grad_norm": 14315.1669921875,
1668
  "learning_rate": 9.99872774790724e-05,
1669
- "loss": 65.8457,
1670
  "step": 235
1671
  },
1672
  {
1673
  "epoch": 0.6233080224496533,
1674
- "grad_norm": 19398.673828125,
1675
  "learning_rate": 9.99870883073427e-05,
1676
- "loss": 64.2142,
1677
  "step": 236
1678
  },
1679
  {
1680
  "epoch": 0.6259491581379993,
1681
- "grad_norm": 13164.232421875,
1682
  "learning_rate": 9.998689773975291e-05,
1683
- "loss": 62.7163,
1684
  "step": 237
1685
  },
1686
  {
1687
  "epoch": 0.6285902938263453,
1688
- "grad_norm": 12071.3203125,
1689
  "learning_rate": 9.998670577630838e-05,
1690
- "loss": 63.4349,
1691
  "step": 238
1692
  },
1693
  {
1694
  "epoch": 0.6312314295146914,
1695
- "grad_norm": 14491.6279296875,
1696
  "learning_rate": 9.998651241701445e-05,
1697
- "loss": 63.4186,
1698
  "step": 239
1699
  },
1700
  {
1701
  "epoch": 0.6338725652030373,
1702
- "grad_norm": 14329.0244140625,
1703
  "learning_rate": 9.998631766187651e-05,
1704
- "loss": 63.8988,
1705
  "step": 240
1706
  },
1707
  {
1708
  "epoch": 0.6365137008913833,
1709
- "grad_norm": 13378.42578125,
1710
  "learning_rate": 9.998612151090003e-05,
1711
- "loss": 64.8519,
1712
  "step": 241
1713
  },
1714
  {
1715
  "epoch": 0.6391548365797293,
1716
- "grad_norm": 17472.20703125,
1717
  "learning_rate": 9.998592396409047e-05,
1718
- "loss": 65.1649,
1719
  "step": 242
1720
  },
1721
  {
1722
  "epoch": 0.6417959722680753,
1723
- "grad_norm": 14112.044921875,
1724
  "learning_rate": 9.998572502145334e-05,
1725
- "loss": 63.5169,
1726
  "step": 243
1727
  },
1728
  {
1729
  "epoch": 0.6444371079564213,
1730
- "grad_norm": 12317.0458984375,
1731
  "learning_rate": 9.998552468299421e-05,
1732
- "loss": 64.896,
1733
  "step": 244
1734
  },
1735
  {
1736
  "epoch": 0.6470782436447673,
1737
- "grad_norm": 14267.3935546875,
1738
  "learning_rate": 9.998532294871866e-05,
1739
- "loss": 63.8178,
1740
  "step": 245
1741
  },
1742
  {
1743
  "epoch": 0.6497193793331132,
1744
- "grad_norm": 11890.05859375,
1745
  "learning_rate": 9.998511981863232e-05,
1746
- "loss": 61.3388,
1747
  "step": 246
1748
  },
1749
  {
1750
  "epoch": 0.6523605150214592,
1751
- "grad_norm": 16452.38671875,
1752
  "learning_rate": 9.998491529274089e-05,
1753
- "loss": 62.4636,
1754
  "step": 247
1755
  },
1756
  {
1757
  "epoch": 0.6550016507098052,
1758
- "grad_norm": 18463.275390625,
1759
  "learning_rate": 9.998470937105006e-05,
1760
- "loss": 64.5606,
1761
  "step": 248
1762
  },
1763
  {
1764
  "epoch": 0.6576427863981512,
1765
- "grad_norm": 16050.0419921875,
1766
  "learning_rate": 9.998450205356557e-05,
1767
- "loss": 67.6247,
1768
  "step": 249
1769
  },
1770
  {
1771
  "epoch": 0.6602839220864972,
1772
- "grad_norm": 43525.80859375,
1773
  "learning_rate": 9.998429334029323e-05,
1774
- "loss": 74.6141,
1775
  "step": 250
1776
  },
1777
  {
1778
  "epoch": 0.6629250577748432,
1779
- "grad_norm": 3725.85693359375,
1780
  "learning_rate": 9.998408323123887e-05,
1781
- "loss": 68.5792,
1782
  "step": 251
1783
  },
1784
  {
1785
  "epoch": 0.6655661934631891,
1786
- "grad_norm": 9637.2275390625,
1787
  "learning_rate": 9.998387172640834e-05,
1788
- "loss": 68.4554,
1789
  "step": 252
1790
  },
1791
  {
1792
  "epoch": 0.6682073291515351,
1793
- "grad_norm": 14481.005859375,
1794
  "learning_rate": 9.998365882580756e-05,
1795
- "loss": 68.1712,
1796
  "step": 253
1797
  },
1798
  {
1799
  "epoch": 0.6708484648398811,
1800
- "grad_norm": 11797.9658203125,
1801
  "learning_rate": 9.998344452944247e-05,
1802
- "loss": 65.2306,
1803
  "step": 254
1804
  },
1805
  {
1806
  "epoch": 0.6734896005282272,
1807
- "grad_norm": 10886.556640625,
1808
  "learning_rate": 9.998322883731903e-05,
1809
- "loss": 66.0697,
1810
  "step": 255
1811
  },
1812
  {
1813
  "epoch": 0.6761307362165732,
1814
- "grad_norm": 9600.83203125,
1815
  "learning_rate": 9.998301174944332e-05,
1816
- "loss": 65.1392,
1817
  "step": 256
1818
  },
1819
  {
1820
  "epoch": 0.6787718719049192,
1821
- "grad_norm": 9407.5732421875,
1822
  "learning_rate": 9.998279326582134e-05,
1823
- "loss": 66.4946,
1824
  "step": 257
1825
  },
1826
  {
1827
  "epoch": 0.6814130075932651,
1828
- "grad_norm": 9613.078125,
1829
  "learning_rate": 9.998257338645924e-05,
1830
- "loss": 66.5279,
1831
  "step": 258
1832
  },
1833
  {
1834
  "epoch": 0.6840541432816111,
1835
- "grad_norm": 11849.9658203125,
1836
  "learning_rate": 9.998235211136312e-05,
1837
- "loss": 62.42,
1838
  "step": 259
1839
  },
1840
  {
1841
  "epoch": 0.6866952789699571,
1842
- "grad_norm": 7529.81298828125,
1843
  "learning_rate": 9.99821294405392e-05,
1844
- "loss": 63.2745,
1845
  "step": 260
1846
  },
1847
  {
1848
  "epoch": 0.6893364146583031,
1849
- "grad_norm": 8242.2734375,
1850
  "learning_rate": 9.998190537399366e-05,
1851
- "loss": 62.0032,
1852
  "step": 261
1853
  },
1854
  {
1855
  "epoch": 0.6919775503466491,
1856
- "grad_norm": 8303.287109375,
1857
  "learning_rate": 9.998167991173277e-05,
1858
- "loss": 61.1192,
1859
  "step": 262
1860
  },
1861
  {
1862
  "epoch": 0.6946186860349951,
1863
- "grad_norm": 8776.1552734375,
1864
  "learning_rate": 9.998145305376286e-05,
1865
- "loss": 61.8228,
1866
  "step": 263
1867
  },
1868
  {
1869
  "epoch": 0.697259821723341,
1870
- "grad_norm": 8703.177734375,
1871
  "learning_rate": 9.99812248000902e-05,
1872
- "loss": 58.9814,
1873
  "step": 264
1874
  },
1875
  {
1876
  "epoch": 0.699900957411687,
1877
- "grad_norm": 5833.73291015625,
1878
  "learning_rate": 9.998099515072122e-05,
1879
- "loss": 59.393,
1880
  "step": 265
1881
  },
1882
  {
1883
  "epoch": 0.702542093100033,
1884
- "grad_norm": 7563.53955078125,
1885
  "learning_rate": 9.998076410566229e-05,
1886
- "loss": 59.9513,
1887
  "step": 266
1888
  },
1889
  {
1890
  "epoch": 0.705183228788379,
1891
- "grad_norm": 6206.10302734375,
1892
  "learning_rate": 9.99805316649199e-05,
1893
- "loss": 56.7019,
1894
  "step": 267
1895
  },
1896
  {
1897
  "epoch": 0.707824364476725,
1898
- "grad_norm": 10631.1572265625,
1899
  "learning_rate": 9.998029782850051e-05,
1900
- "loss": 57.3627,
1901
  "step": 268
1902
  },
1903
  {
1904
  "epoch": 0.7104655001650709,
1905
- "grad_norm": 9288.6162109375,
1906
  "learning_rate": 9.998006259641068e-05,
1907
- "loss": 57.3989,
1908
  "step": 269
1909
  },
1910
  {
1911
  "epoch": 0.7131066358534169,
1912
- "grad_norm": 6667.482421875,
1913
  "learning_rate": 9.997982596865695e-05,
1914
- "loss": 57.6789,
1915
  "step": 270
1916
  },
1917
  {
1918
  "epoch": 0.715747771541763,
1919
- "grad_norm": 17684.41796875,
1920
  "learning_rate": 9.997958794524594e-05,
1921
- "loss": 62.7689,
1922
  "step": 271
1923
  },
1924
  {
1925
  "epoch": 0.718388907230109,
1926
- "grad_norm": 252370.53125,
1927
  "learning_rate": 9.99793485261843e-05,
1928
- "loss": 305.7854,
1929
  "step": 272
1930
  },
1931
  {
1932
  "epoch": 0.721030042918455,
1933
- "grad_norm": 664039.5,
1934
  "learning_rate": 9.997910771147872e-05,
1935
- "loss": 555.3836,
1936
  "step": 273
1937
  },
1938
  {
1939
  "epoch": 0.723671178606801,
1940
- "grad_norm": 580103.3125,
1941
  "learning_rate": 9.99788655011359e-05,
1942
- "loss": 510.4261,
1943
  "step": 274
1944
  },
1945
  {
1946
  "epoch": 0.7263123142951469,
1947
- "grad_norm": 580392.125,
1948
  "learning_rate": 9.997862189516263e-05,
1949
- "loss": 611.9569,
1950
  "step": 275
1951
  },
1952
  {
1953
  "epoch": 0.7289534499834929,
1954
- "grad_norm": 400392.0,
1955
  "learning_rate": 9.99783768935657e-05,
1956
- "loss": 430.9034,
1957
  "step": 276
1958
  },
1959
  {
1960
  "epoch": 0.7315945856718389,
1961
- "grad_norm": 478446.125,
1962
  "learning_rate": 9.997813049635195e-05,
1963
- "loss": 414.0167,
1964
  "step": 277
1965
  },
1966
  {
1967
  "epoch": 0.7342357213601849,
1968
- "grad_norm": 808013.3125,
1969
  "learning_rate": 9.997788270352827e-05,
1970
- "loss": 480.0751,
1971
  "step": 278
1972
  },
1973
  {
1974
  "epoch": 0.7368768570485309,
1975
- "grad_norm": 705417.6875,
1976
  "learning_rate": 9.997763351510157e-05,
1977
- "loss": 438.6093,
1978
  "step": 279
1979
  },
1980
  {
1981
  "epoch": 0.7395179927368769,
1982
- "grad_norm": 465065.15625,
1983
  "learning_rate": 9.997738293107881e-05,
1984
- "loss": 402.4464,
1985
  "step": 280
1986
  },
1987
  {
1988
  "epoch": 0.7421591284252228,
1989
- "grad_norm": 1029938.5,
1990
  "learning_rate": 9.9977130951467e-05,
1991
- "loss": 325.2719,
1992
  "step": 281
1993
  },
1994
  {
1995
  "epoch": 0.7448002641135688,
1996
- "grad_norm": 438944.9375,
1997
  "learning_rate": 9.997687757627316e-05,
1998
- "loss": 133.4326,
1999
  "step": 282
2000
  },
2001
  {
2002
  "epoch": 0.7474413998019148,
2003
- "grad_norm": 11383.103515625,
2004
  "learning_rate": 9.997662280550437e-05,
2005
- "loss": 59.436,
2006
  "step": 283
2007
  },
2008
  {
2009
  "epoch": 0.7500825354902608,
2010
- "grad_norm": 12389.4365234375,
2011
  "learning_rate": 9.997636663916776e-05,
2012
- "loss": 59.0242,
2013
  "step": 284
2014
  },
2015
  {
2016
  "epoch": 0.7527236711786068,
2017
- "grad_norm": 17077.525390625,
2018
  "learning_rate": 9.997610907727046e-05,
2019
- "loss": 62.5651,
2020
  "step": 285
2021
  },
2022
  {
2023
  "epoch": 0.7553648068669528,
2024
- "grad_norm": 13895.0234375,
2025
  "learning_rate": 9.997585011981966e-05,
2026
- "loss": 59.2132,
2027
  "step": 286
2028
  },
2029
  {
2030
  "epoch": 0.7580059425552987,
2031
- "grad_norm": 15847.7353515625,
2032
  "learning_rate": 9.997558976682262e-05,
2033
- "loss": 57.9522,
2034
  "step": 287
2035
  },
2036
  {
2037
  "epoch": 0.7606470782436447,
2038
- "grad_norm": 11588.943359375,
2039
  "learning_rate": 9.997532801828658e-05,
2040
- "loss": 57.9217,
2041
  "step": 288
2042
  },
2043
  {
2044
  "epoch": 0.7632882139319908,
2045
- "grad_norm": 16579.208984375,
2046
  "learning_rate": 9.997506487421888e-05,
2047
- "loss": 60.2787,
2048
  "step": 289
2049
  },
2050
  {
2051
  "epoch": 0.7659293496203368,
2052
- "grad_norm": 14254.7177734375,
2053
  "learning_rate": 9.997480033462683e-05,
2054
- "loss": 59.4413,
2055
  "step": 290
2056
  },
2057
  {
2058
  "epoch": 0.7685704853086828,
2059
- "grad_norm": 17540.08984375,
2060
  "learning_rate": 9.997453439951784e-05,
2061
- "loss": 59.8224,
2062
  "step": 291
2063
  },
2064
  {
2065
  "epoch": 0.7712116209970287,
2066
- "grad_norm": 9918.6962890625,
2067
  "learning_rate": 9.997426706889935e-05,
2068
- "loss": 59.8017,
2069
  "step": 292
2070
  },
2071
  {
2072
  "epoch": 0.7738527566853747,
2073
- "grad_norm": 13995.48828125,
2074
  "learning_rate": 9.997399834277878e-05,
2075
- "loss": 60.269,
2076
  "step": 293
2077
  },
2078
  {
2079
  "epoch": 0.7764938923737207,
2080
- "grad_norm": 8614.5107421875,
2081
  "learning_rate": 9.997372822116368e-05,
2082
- "loss": 59.3231,
2083
  "step": 294
2084
  },
2085
  {
2086
  "epoch": 0.7791350280620667,
2087
- "grad_norm": 13204.427734375,
2088
  "learning_rate": 9.99734567040616e-05,
2089
- "loss": 56.924,
2090
  "step": 295
2091
  },
2092
  {
2093
  "epoch": 0.7817761637504127,
2094
- "grad_norm": 16248.548828125,
2095
  "learning_rate": 9.997318379148007e-05,
2096
- "loss": 61.0552,
2097
  "step": 296
2098
  },
2099
  {
2100
  "epoch": 0.7844172994387587,
2101
- "grad_norm": 14133.9208984375,
2102
  "learning_rate": 9.997290948342673e-05,
2103
- "loss": 59.6736,
2104
  "step": 297
2105
  },
2106
  {
2107
  "epoch": 0.7870584351271046,
2108
- "grad_norm": 15652.419921875,
2109
  "learning_rate": 9.997263377990926e-05,
2110
- "loss": 60.1692,
2111
  "step": 298
2112
  },
2113
  {
2114
  "epoch": 0.7896995708154506,
2115
- "grad_norm": 18741.619140625,
2116
  "learning_rate": 9.997235668093535e-05,
2117
- "loss": 66.0071,
2118
  "step": 299
2119
  },
2120
  {
2121
  "epoch": 0.7923407065037966,
2122
- "grad_norm": 21825.05078125,
2123
  "learning_rate": 9.997207818651274e-05,
2124
- "loss": 67.6062,
2125
  "step": 300
2126
  },
2127
  {
2128
  "epoch": 0.7949818421921426,
2129
- "grad_norm": 2637.73046875,
2130
  "learning_rate": 9.997179829664918e-05,
2131
- "loss": 64.2331,
2132
  "step": 301
2133
  },
2134
  {
2135
  "epoch": 0.7976229778804886,
2136
- "grad_norm": 8206.5966796875,
2137
  "learning_rate": 9.997151701135253e-05,
2138
- "loss": 63.1637,
2139
  "step": 302
2140
  },
2141
  {
2142
  "epoch": 0.8002641135688346,
2143
- "grad_norm": 9712.4833984375,
2144
  "learning_rate": 9.997123433063062e-05,
2145
- "loss": 61.4383,
2146
  "step": 303
2147
  },
2148
  {
2149
  "epoch": 0.8029052492571805,
2150
- "grad_norm": 8519.5078125,
2151
  "learning_rate": 9.997095025449134e-05,
2152
- "loss": 65.2138,
2153
  "step": 304
2154
  },
2155
  {
2156
  "epoch": 0.8055463849455266,
2157
- "grad_norm": 9691.27734375,
2158
  "learning_rate": 9.997066478294262e-05,
2159
- "loss": 66.0385,
2160
  "step": 305
2161
  },
2162
  {
2163
  "epoch": 0.8081875206338726,
2164
- "grad_norm": 7133.42822265625,
2165
  "learning_rate": 9.997037791599245e-05,
2166
- "loss": 63.5754,
2167
  "step": 306
2168
  },
2169
  {
2170
  "epoch": 0.8108286563222186,
2171
- "grad_norm": 11070.7421875,
2172
  "learning_rate": 9.997008965364884e-05,
2173
- "loss": 62.8546,
2174
  "step": 307
2175
  },
2176
  {
2177
  "epoch": 0.8134697920105646,
2178
- "grad_norm": 17131.470703125,
2179
  "learning_rate": 9.996979999591983e-05,
2180
- "loss": 64.5126,
2181
  "step": 308
2182
  },
2183
  {
2184
  "epoch": 0.8161109276989106,
2185
- "grad_norm": 8100.720703125,
2186
  "learning_rate": 9.996950894281349e-05,
2187
- "loss": 62.2825,
2188
  "step": 309
2189
  },
2190
  {
2191
  "epoch": 0.8187520633872565,
2192
- "grad_norm": 8409.66015625,
2193
  "learning_rate": 9.996921649433796e-05,
2194
- "loss": 63.2592,
2195
  "step": 310
2196
  },
2197
  {
2198
  "epoch": 0.8213931990756025,
2199
- "grad_norm": 13785.3310546875,
2200
  "learning_rate": 9.996892265050144e-05,
2201
- "loss": 62.3268,
2202
  "step": 311
2203
  },
2204
  {
2205
  "epoch": 0.8240343347639485,
2206
- "grad_norm": 8336.591796875,
2207
  "learning_rate": 9.99686274113121e-05,
2208
- "loss": 62.088,
2209
  "step": 312
2210
  },
2211
  {
2212
  "epoch": 0.8266754704522945,
2213
- "grad_norm": 10653.974609375,
2214
  "learning_rate": 9.996833077677819e-05,
2215
- "loss": 63.5181,
2216
  "step": 313
2217
  },
2218
  {
2219
  "epoch": 0.8293166061406405,
2220
- "grad_norm": 12778.6669921875,
2221
  "learning_rate": 9.9968032746908e-05,
2222
- "loss": 59.519,
2223
  "step": 314
2224
  },
2225
  {
2226
  "epoch": 0.8319577418289865,
2227
- "grad_norm": 7400.33642578125,
2228
  "learning_rate": 9.996773332170983e-05,
2229
- "loss": 58.8288,
2230
  "step": 315
2231
  },
2232
  {
2233
  "epoch": 0.8345988775173324,
2234
- "grad_norm": 9944.3662109375,
2235
  "learning_rate": 9.996743250119209e-05,
2236
- "loss": 59.9352,
2237
  "step": 316
2238
  },
2239
  {
2240
  "epoch": 0.8372400132056784,
2241
- "grad_norm": 13972.748046875,
2242
  "learning_rate": 9.996713028536313e-05,
2243
- "loss": 59.4545,
2244
  "step": 317
2245
  },
2246
  {
2247
  "epoch": 0.8398811488940244,
2248
- "grad_norm": 18110.81640625,
2249
  "learning_rate": 9.99668266742314e-05,
2250
- "loss": 59.069,
2251
  "step": 318
2252
  },
2253
  {
2254
  "epoch": 0.8425222845823704,
2255
- "grad_norm": 15435.197265625,
2256
  "learning_rate": 9.99665216678054e-05,
2257
- "loss": 58.4697,
2258
  "step": 319
2259
  },
2260
  {
2261
  "epoch": 0.8451634202707164,
2262
- "grad_norm": 13905.0234375,
2263
  "learning_rate": 9.996621526609364e-05,
2264
- "loss": 58.9704,
2265
  "step": 320
2266
  },
2267
  {
2268
  "epoch": 0.8478045559590623,
2269
- "grad_norm": 8557.2861328125,
2270
  "learning_rate": 9.996590746910467e-05,
2271
- "loss": 58.9029,
2272
  "step": 321
2273
  },
2274
  {
2275
  "epoch": 0.8504456916474084,
2276
- "grad_norm": 148880.8125,
2277
  "learning_rate": 9.996559827684709e-05,
2278
- "loss": 99.4343,
2279
  "step": 322
2280
  },
2281
  {
2282
  "epoch": 0.8530868273357544,
2283
- "grad_norm": 393529.59375,
2284
  "learning_rate": 9.996528768932951e-05,
2285
- "loss": 350.0454,
2286
  "step": 323
2287
  },
2288
  {
2289
  "epoch": 0.8557279630241004,
2290
- "grad_norm": 1061575.0,
2291
  "learning_rate": 9.996497570656062e-05,
2292
- "loss": 439.9771,
2293
  "step": 324
2294
  },
2295
  {
2296
  "epoch": 0.8583690987124464,
2297
- "grad_norm": 552709.625,
2298
  "learning_rate": 9.996466232854915e-05,
2299
- "loss": 467.0348,
2300
  "step": 325
2301
  },
2302
  {
2303
  "epoch": 0.8610102344007924,
2304
- "grad_norm": 767159.625,
2305
  "learning_rate": 9.996434755530384e-05,
2306
- "loss": 426.5998,
2307
  "step": 326
2308
  },
2309
  {
2310
  "epoch": 0.8636513700891383,
2311
- "grad_norm": 1236165.875,
2312
  "learning_rate": 9.996403138683347e-05,
2313
- "loss": 499.8257,
2314
  "step": 327
2315
  },
2316
  {
2317
  "epoch": 0.8662925057774843,
2318
- "grad_norm": 541820.5,
2319
  "learning_rate": 9.996371382314686e-05,
2320
- "loss": 493.8622,
2321
  "step": 328
2322
  },
2323
  {
2324
  "epoch": 0.8689336414658303,
2325
- "grad_norm": 289596.21875,
2326
  "learning_rate": 9.996339486425291e-05,
2327
- "loss": 424.9042,
2328
  "step": 329
2329
  },
2330
  {
2331
  "epoch": 0.8715747771541763,
2332
- "grad_norm": 575659.1875,
2333
  "learning_rate": 9.99630745101605e-05,
2334
- "loss": 383.0528,
2335
  "step": 330
2336
  },
2337
  {
2338
  "epoch": 0.8742159128425223,
2339
- "grad_norm": 1122850.75,
2340
  "learning_rate": 9.996275276087859e-05,
2341
- "loss": 381.5712,
2342
  "step": 331
2343
  },
2344
  {
2345
  "epoch": 0.8768570485308683,
2346
- "grad_norm": 510877.03125,
2347
  "learning_rate": 9.996242961641615e-05,
2348
- "loss": 305.4172,
2349
  "step": 332
2350
  },
2351
  {
2352
  "epoch": 0.8794981842192142,
2353
- "grad_norm": 23915.0078125,
2354
  "learning_rate": 9.996210507678223e-05,
2355
- "loss": 66.8374,
2356
  "step": 333
2357
  },
2358
  {
2359
  "epoch": 0.8821393199075602,
2360
- "grad_norm": 15597.9814453125,
2361
  "learning_rate": 9.996177914198586e-05,
2362
- "loss": 62.5703,
2363
  "step": 334
2364
  },
2365
  {
2366
  "epoch": 0.8847804555959062,
2367
- "grad_norm": 16045.9091796875,
2368
  "learning_rate": 9.996145181203615e-05,
2369
- "loss": 59.8236,
2370
  "step": 335
2371
  },
2372
  {
2373
  "epoch": 0.8874215912842522,
2374
- "grad_norm": 11153.94921875,
2375
  "learning_rate": 9.996112308694225e-05,
2376
- "loss": 59.7523,
2377
  "step": 336
2378
  },
2379
  {
2380
  "epoch": 0.8900627269725983,
2381
- "grad_norm": 24600.546875,
2382
  "learning_rate": 9.996079296671334e-05,
2383
- "loss": 59.103,
2384
  "step": 337
2385
  },
2386
  {
2387
  "epoch": 0.8927038626609443,
2388
- "grad_norm": 15846.4345703125,
2389
  "learning_rate": 9.996046145135865e-05,
2390
- "loss": 57.7033,
2391
  "step": 338
2392
  },
2393
  {
2394
  "epoch": 0.8953449983492902,
2395
- "grad_norm": 14696.9482421875,
2396
  "learning_rate": 9.99601285408874e-05,
2397
- "loss": 58.9927,
2398
  "step": 339
2399
  },
2400
  {
2401
  "epoch": 0.8979861340376362,
2402
- "grad_norm": 18041.802734375,
2403
  "learning_rate": 9.995979423530892e-05,
2404
- "loss": 58.9629,
2405
  "step": 340
2406
  },
2407
  {
2408
  "epoch": 0.9006272697259822,
2409
- "grad_norm": 14364.46484375,
2410
  "learning_rate": 9.995945853463253e-05,
2411
- "loss": 58.5847,
2412
  "step": 341
2413
  },
2414
  {
2415
  "epoch": 0.9032684054143282,
2416
- "grad_norm": 9606.1572265625,
2417
  "learning_rate": 9.995912143886763e-05,
2418
- "loss": 57.8905,
2419
  "step": 342
2420
  },
2421
  {
2422
  "epoch": 0.9059095411026742,
2423
- "grad_norm": 18811.689453125,
2424
  "learning_rate": 9.995878294802357e-05,
2425
- "loss": 58.0623,
2426
  "step": 343
2427
  },
2428
  {
2429
  "epoch": 0.9085506767910201,
2430
- "grad_norm": 11506.2353515625,
2431
  "learning_rate": 9.995844306210988e-05,
2432
- "loss": 58.0829,
2433
  "step": 344
2434
  },
2435
  {
2436
  "epoch": 0.9111918124793661,
2437
- "grad_norm": 14010.29296875,
2438
  "learning_rate": 9.995810178113599e-05,
2439
- "loss": 59.2473,
2440
  "step": 345
2441
  },
2442
  {
2443
  "epoch": 0.9138329481677121,
2444
- "grad_norm": 14964.6474609375,
2445
  "learning_rate": 9.995775910511147e-05,
2446
- "loss": 61.7464,
2447
  "step": 346
2448
  },
2449
  {
2450
  "epoch": 0.9164740838560581,
2451
- "grad_norm": 19705.8671875,
2452
  "learning_rate": 9.995741503404587e-05,
2453
- "loss": 59.3706,
2454
  "step": 347
2455
  },
2456
  {
2457
  "epoch": 0.9191152195444041,
2458
- "grad_norm": 52520.546875,
2459
  "learning_rate": 9.995706956794879e-05,
2460
- "loss": 64.7237,
2461
  "step": 348
2462
  },
2463
  {
2464
  "epoch": 0.9217563552327501,
2465
- "grad_norm": 22547.396484375,
2466
  "learning_rate": 9.99567227068299e-05,
2467
- "loss": 66.6127,
2468
  "step": 349
2469
  },
2470
  {
2471
  "epoch": 0.924397490921096,
2472
- "grad_norm": 46974.98046875,
2473
  "learning_rate": 9.995637445069887e-05,
2474
- "loss": 73.9882,
2475
  "step": 350
2476
  },
2477
  {
2478
  "epoch": 0.927038626609442,
2479
- "grad_norm": 6136.3505859375,
2480
  "learning_rate": 9.995602479956545e-05,
2481
- "loss": 62.4233,
2482
  "step": 351
2483
  },
2484
  {
2485
  "epoch": 0.929679762297788,
2486
- "grad_norm": 8587.3564453125,
2487
  "learning_rate": 9.995567375343937e-05,
2488
- "loss": 61.321,
2489
  "step": 352
2490
  },
2491
  {
2492
  "epoch": 0.932320897986134,
2493
- "grad_norm": 10655.9970703125,
2494
  "learning_rate": 9.995532131233044e-05,
2495
- "loss": 66.1859,
2496
  "step": 353
2497
  },
2498
  {
2499
  "epoch": 0.93496203367448,
2500
- "grad_norm": 8629.3466796875,
2501
  "learning_rate": 9.99549674762485e-05,
2502
- "loss": 65.6821,
2503
  "step": 354
2504
  },
2505
  {
2506
  "epoch": 0.9376031693628261,
2507
- "grad_norm": 12914.33984375,
2508
  "learning_rate": 9.995461224520345e-05,
2509
- "loss": 63.7299,
2510
  "step": 355
2511
  },
2512
  {
2513
  "epoch": 0.940244305051172,
2514
- "grad_norm": 8047.43408203125,
2515
  "learning_rate": 9.995425561920519e-05,
2516
- "loss": 63.5199,
2517
  "step": 356
2518
  },
2519
  {
2520
  "epoch": 0.942885440739518,
2521
- "grad_norm": 10055.4541015625,
2522
  "learning_rate": 9.99538975982637e-05,
2523
- "loss": 62.5686,
2524
  "step": 357
2525
  },
2526
  {
2527
  "epoch": 0.945526576427864,
2528
- "grad_norm": 7842.205078125,
2529
  "learning_rate": 9.995353818238895e-05,
2530
- "loss": 60.4935,
2531
  "step": 358
2532
  },
2533
  {
2534
  "epoch": 0.94816771211621,
2535
- "grad_norm": 11816.7744140625,
2536
  "learning_rate": 9.9953177371591e-05,
2537
- "loss": 60.5945,
2538
  "step": 359
2539
  },
2540
  {
2541
  "epoch": 0.950808847804556,
2542
- "grad_norm": 9103.77734375,
2543
  "learning_rate": 9.995281516587991e-05,
2544
- "loss": 58.469,
2545
  "step": 360
2546
  },
2547
  {
2548
  "epoch": 0.953449983492902,
2549
- "grad_norm": 9220.1435546875,
2550
  "learning_rate": 9.99524515652658e-05,
2551
- "loss": 60.1234,
2552
  "step": 361
2553
  },
2554
  {
2555
  "epoch": 0.9560911191812479,
2556
- "grad_norm": 12009.244140625,
2557
  "learning_rate": 9.995208656975884e-05,
2558
- "loss": 62.1523,
2559
  "step": 362
2560
  },
2561
  {
2562
  "epoch": 0.9587322548695939,
2563
- "grad_norm": 229421.28125,
2564
  "learning_rate": 9.995172017936919e-05,
2565
- "loss": 300.6131,
2566
  "step": 363
2567
  },
2568
  {
2569
  "epoch": 0.9613733905579399,
2570
- "grad_norm": 245295.8125,
2571
  "learning_rate": 9.99513523941071e-05,
2572
- "loss": 419.0695,
2573
  "step": 364
2574
  },
2575
  {
2576
  "epoch": 0.9640145262462859,
2577
- "grad_norm": 410617.90625,
2578
  "learning_rate": 9.995098321398284e-05,
2579
- "loss": 378.4255,
2580
  "step": 365
2581
  },
2582
  {
2583
  "epoch": 0.9666556619346319,
2584
- "grad_norm": 421334.40625,
2585
  "learning_rate": 9.995061263900671e-05,
2586
- "loss": 336.8766,
2587
  "step": 366
2588
  },
2589
  {
2590
  "epoch": 0.9692967976229779,
2591
- "grad_norm": 493650.0625,
2592
  "learning_rate": 9.995024066918908e-05,
2593
- "loss": 285.4703,
2594
  "step": 367
2595
  },
2596
  {
2597
  "epoch": 0.9719379333113238,
2598
- "grad_norm": 341138.4375,
2599
  "learning_rate": 9.994986730454031e-05,
2600
- "loss": 327.6716,
2601
  "step": 368
2602
  },
2603
  {
2604
  "epoch": 0.9745790689996698,
2605
- "grad_norm": 29870.67578125,
2606
  "learning_rate": 9.994949254507084e-05,
2607
- "loss": 68.203,
2608
  "step": 369
2609
  },
2610
  {
2611
  "epoch": 0.9772202046880158,
2612
- "grad_norm": 11504.2099609375,
2613
  "learning_rate": 9.994911639079112e-05,
2614
- "loss": 59.4822,
2615
  "step": 370
2616
  },
2617
  {
2618
  "epoch": 0.9798613403763619,
2619
- "grad_norm": 13730.798828125,
2620
  "learning_rate": 9.994873884171167e-05,
2621
- "loss": 61.4645,
2622
  "step": 371
2623
  },
2624
  {
2625
  "epoch": 0.9825024760647079,
2626
- "grad_norm": 13915.8955078125,
2627
  "learning_rate": 9.994835989784305e-05,
2628
- "loss": 61.7707,
2629
  "step": 372
2630
  },
2631
  {
2632
  "epoch": 0.9851436117530538,
2633
- "grad_norm": 11192.7958984375,
2634
  "learning_rate": 9.994797955919581e-05,
2635
- "loss": 60.4613,
2636
  "step": 373
2637
  },
2638
  {
2639
  "epoch": 0.9877847474413998,
2640
- "grad_norm": 16333.625,
2641
  "learning_rate": 9.994759782578058e-05,
2642
- "loss": 61.5988,
2643
  "step": 374
2644
  },
2645
  {
2646
  "epoch": 0.9904258831297458,
2647
- "grad_norm": 7811.99169921875,
2648
  "learning_rate": 9.994721469760801e-05,
2649
- "loss": 61.6461,
2650
  "step": 375
2651
  },
2652
  {
2653
  "epoch": 0.9930670188180918,
2654
- "grad_norm": 20771.388671875,
2655
  "learning_rate": 9.994683017468883e-05,
2656
- "loss": 60.9048,
2657
  "step": 376
2658
  },
2659
  {
2660
  "epoch": 0.9957081545064378,
2661
- "grad_norm": 40759.0625,
2662
  "learning_rate": 9.994644425703374e-05,
2663
- "loss": 63.0121,
2664
  "step": 377
2665
  },
2666
  {
2667
  "epoch": 0.9983492901947838,
2668
- "grad_norm": 18481.259765625,
2669
  "learning_rate": 9.994605694465355e-05,
2670
- "loss": 66.464,
2671
  "step": 378
2672
  },
2673
  {
2674
- "epoch": 1.0014856388246947,
2675
- "grad_norm": 15701.447265625,
2676
  "learning_rate": 9.994566823755907e-05,
2677
- "loss": 70.2724,
2678
  "step": 379
2679
  },
2680
  {
2681
- "epoch": 1.0041267745130407,
2682
- "grad_norm": 5057.27294921875,
2683
  "learning_rate": 9.99452781357611e-05,
2684
- "loss": 63.1513,
2685
  "step": 380
2686
  },
2687
  {
2688
- "epoch": 1.0067679102013867,
2689
- "grad_norm": 9864.3193359375,
2690
  "learning_rate": 9.994488663927062e-05,
2691
- "loss": 61.9906,
2692
  "step": 381
2693
  },
2694
  {
2695
- "epoch": 1.0094090458897327,
2696
- "grad_norm": 6700.53125,
2697
  "learning_rate": 9.994449374809851e-05,
2698
- "loss": 62.4899,
2699
  "step": 382
2700
  },
2701
  {
2702
- "epoch": 1.0120501815780785,
2703
- "grad_norm": 6837.01904296875,
2704
  "learning_rate": 9.994409946225574e-05,
2705
- "loss": 65.2269,
2706
  "step": 383
2707
  },
2708
  {
2709
- "epoch": 1.0146913172664245,
2710
- "grad_norm": 13164.2265625,
2711
  "learning_rate": 9.994370378175332e-05,
2712
- "loss": 64.4564,
2713
  "step": 384
2714
  },
2715
  {
2716
- "epoch": 1.0173324529547705,
2717
- "grad_norm": 6737.4931640625,
2718
  "learning_rate": 9.994330670660235e-05,
2719
- "loss": 66.8069,
2720
  "step": 385
2721
  },
2722
  {
2723
- "epoch": 1.0199735886431165,
2724
- "grad_norm": 7182.29052734375,
2725
  "learning_rate": 9.994290823681385e-05,
2726
- "loss": 63.7669,
2727
  "step": 386
2728
  },
2729
  {
2730
- "epoch": 1.0226147243314625,
2731
- "grad_norm": 6725.6865234375,
2732
  "learning_rate": 9.994250837239897e-05,
2733
- "loss": 64.2153,
2734
  "step": 387
2735
  },
2736
  {
2737
- "epoch": 1.0252558600198085,
2738
- "grad_norm": 8369.416015625,
2739
  "learning_rate": 9.994210711336891e-05,
2740
- "loss": 64.0927,
2741
  "step": 388
2742
  },
2743
  {
2744
- "epoch": 1.0278969957081545,
2745
- "grad_norm": 9409.310546875,
2746
  "learning_rate": 9.994170445973483e-05,
2747
- "loss": 64.6209,
2748
  "step": 389
2749
  },
2750
  {
2751
- "epoch": 1.0305381313965005,
2752
- "grad_norm": 9622.2392578125,
2753
  "learning_rate": 9.994130041150798e-05,
2754
- "loss": 63.6005,
2755
  "step": 390
2756
  },
2757
  {
2758
- "epoch": 1.0331792670848465,
2759
- "grad_norm": 7618.46533203125,
2760
  "learning_rate": 9.994089496869968e-05,
2761
- "loss": 63.3025,
2762
  "step": 391
2763
  },
2764
  {
2765
- "epoch": 1.0358204027731925,
2766
- "grad_norm": 12586.142578125,
2767
  "learning_rate": 9.994048813132119e-05,
2768
- "loss": 61.8642,
2769
  "step": 392
2770
  },
2771
  {
2772
- "epoch": 1.0384615384615385,
2773
- "grad_norm": 9250.41796875,
2774
  "learning_rate": 9.994007989938392e-05,
2775
- "loss": 61.8174,
2776
  "step": 393
2777
  },
2778
  {
2779
- "epoch": 1.0411026741498846,
2780
- "grad_norm": 8754.533203125,
2781
  "learning_rate": 9.993967027289927e-05,
2782
- "loss": 62.1932,
2783
  "step": 394
2784
  },
2785
  {
2786
- "epoch": 1.0437438098382303,
2787
- "grad_norm": 8648.9921875,
2788
  "learning_rate": 9.993925925187865e-05,
2789
- "loss": 60.8662,
2790
  "step": 395
2791
  },
2792
  {
2793
- "epoch": 1.0463849455265763,
2794
- "grad_norm": 9864.1015625,
2795
  "learning_rate": 9.993884683633354e-05,
2796
- "loss": 61.4257,
2797
  "step": 396
2798
  },
2799
  {
2800
- "epoch": 1.0490260812149224,
2801
- "grad_norm": 9742.0888671875,
2802
  "learning_rate": 9.993843302627549e-05,
2803
- "loss": 60.0106,
2804
  "step": 397
2805
  },
2806
  {
2807
- "epoch": 1.0516672169032684,
2808
- "grad_norm": 9846.623046875,
2809
  "learning_rate": 9.993801782171603e-05,
2810
- "loss": 60.2374,
2811
  "step": 398
2812
  },
2813
  {
2814
- "epoch": 1.0543083525916144,
2815
- "grad_norm": 6269.61376953125,
2816
  "learning_rate": 9.993760122266676e-05,
2817
- "loss": 60.5703,
2818
  "step": 399
2819
  },
2820
  {
2821
- "epoch": 1.0569494882799604,
2822
- "grad_norm": 35210.05078125,
2823
  "learning_rate": 9.99371832291393e-05,
2824
- "loss": 87.2034,
2825
  "step": 400
2826
  },
2827
  {
2828
- "epoch": 1.0569494882799604,
2829
- "eval_loss": 8.187799453735352,
2830
- "eval_runtime": 2.2268,
2831
- "eval_samples_per_second": 222.293,
2832
- "eval_steps_per_second": 55.686,
2833
  "step": 400
2834
  }
2835
  ],
@@ -2850,8 +2850,8 @@
2850
  "attributes": {}
2851
  }
2852
  },
2853
- "total_flos": 1043707151253504.0,
2854
- "train_batch_size": 4,
2855
  "trial_name": null,
2856
  "trial_params": null
2857
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0564542753383954,
5
  "eval_steps": 200,
6
  "global_step": 400,
7
  "is_hyper_param_search": false,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.002641135688345989,
13
+ "grad_norm": 38990.80078125,
14
  "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 39.1249,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.002641135688345989,
20
+ "eval_loss": 10.096823692321777,
21
+ "eval_runtime": 2.1873,
22
+ "eval_samples_per_second": 226.308,
23
+ "eval_steps_per_second": 28.346,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.005282271376691978,
28
+ "grad_norm": 22047.08203125,
29
  "learning_rate": 2.0000000000000003e-06,
30
+ "loss": 36.867,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.007923407065037967,
35
+ "grad_norm": 81682.6796875,
36
  "learning_rate": 3e-06,
37
+ "loss": 39.9853,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.010564542753383956,
42
+ "grad_norm": 26123.87109375,
43
  "learning_rate": 4.000000000000001e-06,
44
+ "loss": 38.4879,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.013205678441729944,
49
+ "grad_norm": 29102.25390625,
50
  "learning_rate": 5e-06,
51
+ "loss": 37.9034,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.015846814130075933,
56
+ "grad_norm": 24646.03125,
57
  "learning_rate": 6e-06,
58
+ "loss": 37.7639,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.01848794981842192,
63
+ "grad_norm": 56427.59375,
64
  "learning_rate": 7.000000000000001e-06,
65
+ "loss": 37.4074,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.02112908550676791,
70
+ "grad_norm": 28639.47265625,
71
  "learning_rate": 8.000000000000001e-06,
72
+ "loss": 37.1164,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.0237702211951139,
77
+ "grad_norm": 18426.78515625,
78
  "learning_rate": 9e-06,
79
+ "loss": 37.2912,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.02641135688345989,
84
+ "grad_norm": 29681.62109375,
85
  "learning_rate": 1e-05,
86
+ "loss": 37.1851,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.029052492571805876,
91
+ "grad_norm": 61535.3671875,
92
  "learning_rate": 1.1000000000000001e-05,
93
+ "loss": 38.4262,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.03169362826015187,
98
+ "grad_norm": 28930.455078125,
99
  "learning_rate": 1.2e-05,
100
+ "loss": 37.3699,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.034334763948497854,
105
+ "grad_norm": 31548.685546875,
106
  "learning_rate": 1.3000000000000001e-05,
107
+ "loss": 36.4952,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.03697589963684384,
112
+ "grad_norm": 18435.33203125,
113
  "learning_rate": 1.4000000000000001e-05,
114
+ "loss": 36.2176,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.03961703532518983,
119
+ "grad_norm": 28708.28515625,
120
  "learning_rate": 1.5e-05,
121
+ "loss": 36.6194,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.04225817101353582,
126
+ "grad_norm": 20564.423828125,
127
  "learning_rate": 1.6000000000000003e-05,
128
+ "loss": 36.3482,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.04489930670188181,
133
+ "grad_norm": 17695.943359375,
134
  "learning_rate": 1.7000000000000003e-05,
135
+ "loss": 35.715,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.0475404423902278,
140
+ "grad_norm": 15335.0712890625,
141
  "learning_rate": 1.8e-05,
142
+ "loss": 34.6268,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.050181578078573784,
147
+ "grad_norm": 13583.33203125,
148
  "learning_rate": 1.9e-05,
149
+ "loss": 35.5606,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.05282271376691978,
154
+ "grad_norm": 26019.890625,
155
  "learning_rate": 2e-05,
156
+ "loss": 35.5296,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.055463849455265765,
161
+ "grad_norm": 14379.9052734375,
162
  "learning_rate": 2.1e-05,
163
+ "loss": 34.4806,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.05810498514361175,
168
+ "grad_norm": 27736.314453125,
169
  "learning_rate": 2.2000000000000003e-05,
170
+ "loss": 36.2501,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.06074612083195774,
175
+ "grad_norm": 260855.0,
176
  "learning_rate": 2.3000000000000003e-05,
177
+ "loss": 150.3672,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.06338725652030373,
182
+ "grad_norm": 422515.90625,
183
  "learning_rate": 2.4e-05,
184
+ "loss": 292.9531,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.06602839220864971,
189
+ "grad_norm": 1018015.1875,
190
  "learning_rate": 2.5e-05,
191
+ "loss": 355.2773,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.06866952789699571,
196
+ "grad_norm": 817252.75,
197
  "learning_rate": 2.6000000000000002e-05,
198
+ "loss": 268.3635,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.0713106635853417,
203
+ "grad_norm": 650477.3125,
204
  "learning_rate": 2.7000000000000002e-05,
205
+ "loss": 304.9252,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.07395179927368768,
210
+ "grad_norm": 405537.28125,
211
  "learning_rate": 2.8000000000000003e-05,
212
+ "loss": 298.1852,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.07659293496203368,
217
+ "grad_norm": 1309428.0,
218
  "learning_rate": 2.9e-05,
219
+ "loss": 405.8535,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.07923407065037966,
224
+ "grad_norm": 574544.0625,
225
  "learning_rate": 3e-05,
226
+ "loss": 341.7129,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.08187520633872565,
231
+ "grad_norm": 425538.625,
232
  "learning_rate": 3.1e-05,
233
+ "loss": 260.0619,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.08451634202707164,
238
+ "grad_norm": 566855.6875,
239
  "learning_rate": 3.2000000000000005e-05,
240
+ "loss": 272.2906,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.08715747771541763,
245
+ "grad_norm": 300651.28125,
246
  "learning_rate": 3.3e-05,
247
+ "loss": 79.3429,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.08979861340376362,
252
+ "grad_norm": 14083.6494140625,
253
  "learning_rate": 3.4000000000000007e-05,
254
+ "loss": 35.4547,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.09243974909210961,
259
+ "grad_norm": 22081.201171875,
260
  "learning_rate": 3.5e-05,
261
+ "loss": 34.5699,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.0950808847804556,
266
+ "grad_norm": 11022.6884765625,
267
  "learning_rate": 3.6e-05,
268
+ "loss": 34.3495,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.09772202046880159,
273
+ "grad_norm": 11900.990234375,
274
  "learning_rate": 3.7e-05,
275
+ "loss": 35.6864,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.10036315615714757,
280
+ "grad_norm": 13156.771484375,
281
  "learning_rate": 3.8e-05,
282
+ "loss": 34.4444,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.10300429184549356,
287
+ "grad_norm": 11813.6083984375,
288
  "learning_rate": 3.9000000000000006e-05,
289
+ "loss": 34.9737,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.10564542753383956,
294
+ "grad_norm": 15030.2021484375,
295
  "learning_rate": 4e-05,
296
+ "loss": 34.0348,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.10828656322218554,
301
+ "grad_norm": 11196.2529296875,
302
  "learning_rate": 4.1e-05,
303
+ "loss": 34.3456,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.11092769891053153,
308
+ "grad_norm": 11016.130859375,
309
  "learning_rate": 4.2e-05,
310
+ "loss": 34.3275,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.11356883459887751,
315
+ "grad_norm": 14342.5283203125,
316
  "learning_rate": 4.3e-05,
317
+ "loss": 33.6461,
318
  "step": 43
319
  },
320
  {
321
  "epoch": 0.1162099702872235,
322
+ "grad_norm": 13592.828125,
323
  "learning_rate": 4.4000000000000006e-05,
324
+ "loss": 35.0608,
325
  "step": 44
326
  },
327
  {
328
  "epoch": 0.1188511059755695,
329
+ "grad_norm": 14278.0205078125,
330
  "learning_rate": 4.5e-05,
331
+ "loss": 33.6034,
332
  "step": 45
333
  },
334
  {
335
  "epoch": 0.12149224166391548,
336
+ "grad_norm": 15676.8076171875,
337
  "learning_rate": 4.600000000000001e-05,
338
+ "loss": 34.7689,
339
  "step": 46
340
  },
341
  {
342
  "epoch": 0.12413337735226147,
343
+ "grad_norm": 16533.037109375,
344
  "learning_rate": 4.7e-05,
345
+ "loss": 34.1397,
346
  "step": 47
347
  },
348
  {
349
  "epoch": 0.12677451304060747,
350
+ "grad_norm": 17516.21484375,
351
  "learning_rate": 4.8e-05,
352
+ "loss": 35.4853,
353
  "step": 48
354
  },
355
  {
356
  "epoch": 0.12941564872895345,
357
+ "grad_norm": 22093.806640625,
358
  "learning_rate": 4.9e-05,
359
+ "loss": 36.8646,
360
  "step": 49
361
  },
362
  {
363
  "epoch": 0.13205678441729943,
364
+ "grad_norm": 34389.921875,
365
  "learning_rate": 5e-05,
366
+ "loss": 39.5266,
367
  "step": 50
368
  },
369
  {
370
  "epoch": 0.13469792010564544,
371
+ "grad_norm": 7149.9775390625,
372
  "learning_rate": 5.1000000000000006e-05,
373
+ "loss": 35.8088,
374
  "step": 51
375
  },
376
  {
377
  "epoch": 0.13733905579399142,
378
+ "grad_norm": 6511.89306640625,
379
  "learning_rate": 5.2000000000000004e-05,
380
+ "loss": 35.0745,
381
  "step": 52
382
  },
383
  {
384
  "epoch": 0.1399801914823374,
385
+ "grad_norm": 11293.515625,
386
  "learning_rate": 5.300000000000001e-05,
387
+ "loss": 33.7064,
388
  "step": 53
389
  },
390
  {
391
  "epoch": 0.1426213271706834,
392
+ "grad_norm": 7394.4853515625,
393
  "learning_rate": 5.4000000000000005e-05,
394
+ "loss": 34.612,
395
  "step": 54
396
  },
397
  {
398
  "epoch": 0.14526246285902938,
399
+ "grad_norm": 7513.56982421875,
400
  "learning_rate": 5.500000000000001e-05,
401
+ "loss": 34.2795,
402
  "step": 55
403
  },
404
  {
405
  "epoch": 0.14790359854737536,
406
+ "grad_norm": 12561.0849609375,
407
  "learning_rate": 5.6000000000000006e-05,
408
+ "loss": 34.1079,
409
  "step": 56
410
  },
411
  {
412
  "epoch": 0.15054473423572137,
413
+ "grad_norm": 7255.42724609375,
414
  "learning_rate": 5.6999999999999996e-05,
415
+ "loss": 33.7773,
416
  "step": 57
417
  },
418
  {
419
  "epoch": 0.15318586992406735,
420
+ "grad_norm": 8305.197265625,
421
  "learning_rate": 5.8e-05,
422
+ "loss": 33.5425,
423
  "step": 58
424
  },
425
  {
426
  "epoch": 0.15582700561241333,
427
+ "grad_norm": 7724.32666015625,
428
  "learning_rate": 5.9e-05,
429
+ "loss": 34.3069,
430
  "step": 59
431
  },
432
  {
433
  "epoch": 0.1584681413007593,
434
+ "grad_norm": 6973.86669921875,
435
  "learning_rate": 6e-05,
436
+ "loss": 31.8323,
437
  "step": 60
438
  },
439
  {
440
  "epoch": 0.16110927698910532,
441
+ "grad_norm": 8178.408203125,
442
  "learning_rate": 6.1e-05,
443
+ "loss": 33.4728,
444
  "step": 61
445
  },
446
  {
447
  "epoch": 0.1637504126774513,
448
+ "grad_norm": 7446.3310546875,
449
  "learning_rate": 6.2e-05,
450
+ "loss": 32.2049,
451
  "step": 62
452
  },
453
  {
454
  "epoch": 0.16639154836579728,
455
+ "grad_norm": 7538.81494140625,
456
  "learning_rate": 6.3e-05,
457
+ "loss": 31.9451,
458
  "step": 63
459
  },
460
  {
461
  "epoch": 0.1690326840541433,
462
+ "grad_norm": 7067.33154296875,
463
  "learning_rate": 6.400000000000001e-05,
464
+ "loss": 32.0696,
465
  "step": 64
466
  },
467
  {
468
  "epoch": 0.17167381974248927,
469
+ "grad_norm": 7199.02294921875,
470
  "learning_rate": 6.500000000000001e-05,
471
+ "loss": 31.7234,
472
  "step": 65
473
  },
474
  {
475
  "epoch": 0.17431495543083525,
476
+ "grad_norm": 6351.2900390625,
477
  "learning_rate": 6.6e-05,
478
+ "loss": 31.4103,
479
  "step": 66
480
  },
481
  {
482
  "epoch": 0.17695609111918126,
483
+ "grad_norm": 9954.1572265625,
484
  "learning_rate": 6.7e-05,
485
+ "loss": 31.2581,
486
  "step": 67
487
  },
488
  {
489
  "epoch": 0.17959722680752724,
490
+ "grad_norm": 6812.11083984375,
491
  "learning_rate": 6.800000000000001e-05,
492
+ "loss": 31.0586,
493
  "step": 68
494
  },
495
  {
496
  "epoch": 0.18223836249587322,
497
+ "grad_norm": 6788.81787109375,
498
  "learning_rate": 6.9e-05,
499
+ "loss": 31.2012,
500
  "step": 69
501
  },
502
  {
503
  "epoch": 0.18487949818421923,
504
+ "grad_norm": 6330.77880859375,
505
  "learning_rate": 7e-05,
506
+ "loss": 31.758,
507
  "step": 70
508
  },
509
  {
510
  "epoch": 0.1875206338725652,
511
+ "grad_norm": 6925.2958984375,
512
  "learning_rate": 7.1e-05,
513
+ "loss": 31.7811,
514
  "step": 71
515
  },
516
  {
517
  "epoch": 0.1901617695609112,
518
+ "grad_norm": 15530.548828125,
519
  "learning_rate": 7.2e-05,
520
+ "loss": 38.1656,
521
  "step": 72
522
  },
523
  {
524
  "epoch": 0.19280290524925717,
525
+ "grad_norm": 248175.0,
526
  "learning_rate": 7.3e-05,
527
+ "loss": 314.6807,
528
  "step": 73
529
  },
530
  {
531
  "epoch": 0.19544404093760318,
532
+ "grad_norm": 348192.3125,
533
  "learning_rate": 7.4e-05,
534
+ "loss": 319.9785,
535
  "step": 74
536
  },
537
  {
538
  "epoch": 0.19808517662594916,
539
+ "grad_norm": 399153.90625,
540
  "learning_rate": 7.500000000000001e-05,
541
+ "loss": 245.481,
542
  "step": 75
543
  },
544
  {
545
  "epoch": 0.20072631231429514,
546
+ "grad_norm": 293389.5,
547
  "learning_rate": 7.6e-05,
548
+ "loss": 269.3301,
549
  "step": 76
550
  },
551
  {
552
  "epoch": 0.20336744800264114,
553
+ "grad_norm": 536027.375,
554
  "learning_rate": 7.7e-05,
555
+ "loss": 240.3848,
556
  "step": 77
557
  },
558
  {
559
  "epoch": 0.20600858369098712,
560
+ "grad_norm": 340636.96875,
561
  "learning_rate": 7.800000000000001e-05,
562
+ "loss": 257.0401,
563
  "step": 78
564
  },
565
  {
566
  "epoch": 0.2086497193793331,
567
+ "grad_norm": 343861.0625,
568
  "learning_rate": 7.900000000000001e-05,
569
+ "loss": 246.5806,
570
  "step": 79
571
  },
572
  {
573
  "epoch": 0.2112908550676791,
574
+ "grad_norm": 377362.75,
575
  "learning_rate": 8e-05,
576
+ "loss": 204.2622,
577
  "step": 80
578
  },
579
  {
580
  "epoch": 0.2139319907560251,
581
+ "grad_norm": 495172.15625,
582
  "learning_rate": 8.1e-05,
583
+ "loss": 152.9565,
584
  "step": 81
585
  },
586
  {
587
  "epoch": 0.21657312644437107,
588
+ "grad_norm": 410514.21875,
589
  "learning_rate": 8.2e-05,
590
+ "loss": 120.1336,
591
  "step": 82
592
  },
593
  {
594
  "epoch": 0.21921426213271708,
595
+ "grad_norm": 37318.89453125,
596
  "learning_rate": 8.3e-05,
597
+ "loss": 43.507,
598
  "step": 83
599
  },
600
  {
601
  "epoch": 0.22185539782106306,
602
+ "grad_norm": 46563.8515625,
603
  "learning_rate": 8.4e-05,
604
+ "loss": 42.6661,
605
  "step": 84
606
  },
607
  {
608
  "epoch": 0.22449653350940904,
609
+ "grad_norm": 25882.45703125,
610
  "learning_rate": 8.5e-05,
611
+ "loss": 41.1904,
612
  "step": 85
613
  },
614
  {
615
  "epoch": 0.22713766919775502,
616
+ "grad_norm": 21462.017578125,
617
  "learning_rate": 8.6e-05,
618
+ "loss": 35.6957,
619
  "step": 86
620
  },
621
  {
622
  "epoch": 0.22977880488610103,
623
+ "grad_norm": 11826.3798828125,
624
  "learning_rate": 8.7e-05,
625
+ "loss": 33.1654,
626
  "step": 87
627
  },
628
  {
629
  "epoch": 0.232419940574447,
630
+ "grad_norm": 10408.4365234375,
631
  "learning_rate": 8.800000000000001e-05,
632
+ "loss": 32.1405,
633
  "step": 88
634
  },
635
  {
636
  "epoch": 0.235061076262793,
637
+ "grad_norm": 9028.2587890625,
638
  "learning_rate": 8.900000000000001e-05,
639
+ "loss": 30.806,
640
  "step": 89
641
  },
642
  {
643
  "epoch": 0.237702211951139,
644
+ "grad_norm": 14064.7021484375,
645
  "learning_rate": 9e-05,
646
+ "loss": 32.0673,
647
  "step": 90
648
  },
649
  {
650
  "epoch": 0.24034334763948498,
651
+ "grad_norm": 10274.6611328125,
652
  "learning_rate": 9.1e-05,
653
+ "loss": 30.8923,
654
  "step": 91
655
  },
656
  {
657
  "epoch": 0.24298448332783096,
658
+ "grad_norm": 13376.0947265625,
659
  "learning_rate": 9.200000000000001e-05,
660
+ "loss": 32.0376,
661
  "step": 92
662
  },
663
  {
664
  "epoch": 0.24562561901617697,
665
+ "grad_norm": 13412.4970703125,
666
  "learning_rate": 9.300000000000001e-05,
667
+ "loss": 32.5937,
668
  "step": 93
669
  },
670
  {
671
  "epoch": 0.24826675470452295,
672
+ "grad_norm": 17289.099609375,
673
  "learning_rate": 9.4e-05,
674
+ "loss": 32.0219,
675
  "step": 94
676
  },
677
  {
678
  "epoch": 0.2509078903928689,
679
+ "grad_norm": 10165.4990234375,
680
  "learning_rate": 9.5e-05,
681
+ "loss": 32.7753,
682
  "step": 95
683
  },
684
  {
685
  "epoch": 0.25354902608121493,
686
+ "grad_norm": 16371.439453125,
687
  "learning_rate": 9.6e-05,
688
+ "loss": 31.7399,
689
  "step": 96
690
  },
691
  {
692
  "epoch": 0.2561901617695609,
693
+ "grad_norm": 28360.642578125,
694
  "learning_rate": 9.7e-05,
695
+ "loss": 32.4525,
696
  "step": 97
697
  },
698
  {
699
  "epoch": 0.2588312974579069,
700
+ "grad_norm": 19952.9296875,
701
  "learning_rate": 9.8e-05,
702
+ "loss": 33.4285,
703
  "step": 98
704
  },
705
  {
706
  "epoch": 0.2614724331462529,
707
+ "grad_norm": 20724.11328125,
708
  "learning_rate": 9.900000000000001e-05,
709
+ "loss": 34.1331,
710
  "step": 99
711
  },
712
  {
713
  "epoch": 0.26411356883459886,
714
+ "grad_norm": 38431.6328125,
715
  "learning_rate": 0.0001,
716
+ "loss": 40.0183,
717
  "step": 100
718
  },
719
  {
720
  "epoch": 0.26675470452294486,
721
+ "grad_norm": 9295.7626953125,
722
  "learning_rate": 9.99999993018897e-05,
723
+ "loss": 32.908,
724
  "step": 101
725
  },
726
  {
727
  "epoch": 0.26939584021129087,
728
+ "grad_norm": 9068.3134765625,
729
  "learning_rate": 9.999999720755877e-05,
730
+ "loss": 32.7796,
731
  "step": 102
732
  },
733
  {
734
  "epoch": 0.2720369758996368,
735
+ "grad_norm": 9507.033203125,
736
  "learning_rate": 9.99999937170073e-05,
737
+ "loss": 33.0735,
738
  "step": 103
739
  },
740
  {
741
  "epoch": 0.27467811158798283,
742
+ "grad_norm": 9898.73046875,
743
  "learning_rate": 9.999998883023537e-05,
744
+ "loss": 34.4524,
745
  "step": 104
746
  },
747
  {
748
  "epoch": 0.27731924727632884,
749
+ "grad_norm": 8197.7294921875,
750
  "learning_rate": 9.999998254724313e-05,
751
+ "loss": 34.3429,
752
  "step": 105
753
  },
754
  {
755
  "epoch": 0.2799603829646748,
756
+ "grad_norm": 7723.392578125,
757
  "learning_rate": 9.999997486803075e-05,
758
+ "loss": 33.3132,
759
  "step": 106
760
  },
761
  {
762
  "epoch": 0.2826015186530208,
763
+ "grad_norm": 9174.4091796875,
764
  "learning_rate": 9.999996579259843e-05,
765
+ "loss": 32.9465,
766
  "step": 107
767
  },
768
  {
769
  "epoch": 0.2852426543413668,
770
+ "grad_norm": 10098.0283203125,
771
  "learning_rate": 9.999995532094644e-05,
772
+ "loss": 34.1124,
773
  "step": 108
774
  },
775
  {
776
  "epoch": 0.28788379002971276,
777
+ "grad_norm": 7904.126953125,
778
  "learning_rate": 9.999994345307508e-05,
779
+ "loss": 32.68,
780
  "step": 109
781
  },
782
  {
783
  "epoch": 0.29052492571805877,
784
+ "grad_norm": 7395.32177734375,
785
  "learning_rate": 9.999993018898466e-05,
786
+ "loss": 32.1147,
787
  "step": 110
788
  },
789
  {
790
  "epoch": 0.2931660614064048,
791
+ "grad_norm": 15490.7314453125,
792
  "learning_rate": 9.999991552867558e-05,
793
+ "loss": 32.7157,
794
  "step": 111
795
  },
796
  {
797
  "epoch": 0.29580719709475073,
798
+ "grad_norm": 6962.9326171875,
799
  "learning_rate": 9.99998994721482e-05,
800
+ "loss": 32.5103,
801
  "step": 112
802
  },
803
  {
804
  "epoch": 0.29844833278309674,
805
+ "grad_norm": 6746.60546875,
806
  "learning_rate": 9.999988201940302e-05,
807
+ "loss": 31.9245,
808
  "step": 113
809
  },
810
  {
811
  "epoch": 0.30108946847144274,
812
+ "grad_norm": 6943.94140625,
813
  "learning_rate": 9.999986317044051e-05,
814
+ "loss": 30.399,
815
  "step": 114
816
  },
817
  {
818
  "epoch": 0.3037306041597887,
819
+ "grad_norm": 6095.4384765625,
820
  "learning_rate": 9.999984292526118e-05,
821
+ "loss": 29.941,
822
  "step": 115
823
  },
824
  {
825
  "epoch": 0.3063717398481347,
826
+ "grad_norm": 6518.970703125,
827
  "learning_rate": 9.999982128386562e-05,
828
+ "loss": 30.5093,
829
  "step": 116
830
  },
831
  {
832
  "epoch": 0.3090128755364807,
833
+ "grad_norm": 5806.0927734375,
834
  "learning_rate": 9.99997982462544e-05,
835
+ "loss": 29.6937,
836
  "step": 117
837
  },
838
  {
839
  "epoch": 0.31165401122482667,
840
+ "grad_norm": 6216.46435546875,
841
  "learning_rate": 9.999977381242821e-05,
842
+ "loss": 29.7115,
843
  "step": 118
844
  },
845
  {
846
  "epoch": 0.3142951469131727,
847
+ "grad_norm": 5445.48828125,
848
  "learning_rate": 9.999974798238769e-05,
849
+ "loss": 28.9644,
850
  "step": 119
851
  },
852
  {
853
  "epoch": 0.3169362826015186,
854
+ "grad_norm": 4930.64453125,
855
  "learning_rate": 9.99997207561336e-05,
856
+ "loss": 29.803,
857
  "step": 120
858
  },
859
  {
860
  "epoch": 0.31957741828986463,
861
+ "grad_norm": 5866.5478515625,
862
  "learning_rate": 9.999969213366667e-05,
863
+ "loss": 29.2732,
864
  "step": 121
865
  },
866
  {
867
  "epoch": 0.32221855397821064,
868
+ "grad_norm": 13160.4111328125,
869
  "learning_rate": 9.99996621149877e-05,
870
+ "loss": 31.3179,
871
  "step": 122
872
  },
873
  {
874
  "epoch": 0.3248596896665566,
875
+ "grad_norm": 577362.4375,
876
  "learning_rate": 9.999963070009755e-05,
877
+ "loss": 192.1116,
878
  "step": 123
879
  },
880
  {
881
  "epoch": 0.3275008253549026,
882
+ "grad_norm": 447577.625,
883
  "learning_rate": 9.999959788899706e-05,
884
+ "loss": 353.353,
885
  "step": 124
886
  },
887
  {
888
  "epoch": 0.3301419610432486,
889
+ "grad_norm": 422884.03125,
890
  "learning_rate": 9.999956368168719e-05,
891
+ "loss": 328.2871,
892
  "step": 125
893
  },
894
  {
895
  "epoch": 0.33278309673159456,
896
+ "grad_norm": 217372.875,
897
  "learning_rate": 9.999952807816888e-05,
898
+ "loss": 311.041,
899
  "step": 126
900
  },
901
  {
902
  "epoch": 0.33542423241994057,
903
+ "grad_norm": 197269.15625,
904
  "learning_rate": 9.99994910784431e-05,
905
+ "loss": 331.5454,
906
  "step": 127
907
  },
908
  {
909
  "epoch": 0.3380653681082866,
910
+ "grad_norm": 447190.15625,
911
  "learning_rate": 9.999945268251092e-05,
912
+ "loss": 292.5098,
913
  "step": 128
914
  },
915
  {
916
  "epoch": 0.34070650379663253,
917
+ "grad_norm": 156708.53125,
918
  "learning_rate": 9.999941289037338e-05,
919
+ "loss": 329.5899,
920
  "step": 129
921
  },
922
  {
923
  "epoch": 0.34334763948497854,
924
+ "grad_norm": 214527.265625,
925
  "learning_rate": 9.999937170203162e-05,
926
+ "loss": 295.6437,
927
  "step": 130
928
  },
929
  {
930
  "epoch": 0.34598877517332455,
931
+ "grad_norm": 144792.09375,
932
  "learning_rate": 9.999932911748678e-05,
933
+ "loss": 321.7724,
934
  "step": 131
935
  },
936
  {
937
  "epoch": 0.3486299108616705,
938
+ "grad_norm": 183092.328125,
939
  "learning_rate": 9.999928513674004e-05,
940
+ "loss": 138.0811,
941
  "step": 132
942
  },
943
  {
944
  "epoch": 0.3512710465500165,
945
+ "grad_norm": 6041.10107421875,
946
  "learning_rate": 9.999923975979262e-05,
947
+ "loss": 30.1601,
948
  "step": 133
949
  },
950
  {
951
  "epoch": 0.3539121822383625,
952
+ "grad_norm": 5054.18798828125,
953
  "learning_rate": 9.999919298664582e-05,
954
+ "loss": 29.4563,
955
  "step": 134
956
  },
957
  {
958
  "epoch": 0.35655331792670847,
959
+ "grad_norm": 9742.12890625,
960
  "learning_rate": 9.999914481730092e-05,
961
+ "loss": 29.5483,
962
  "step": 135
963
  },
964
  {
965
  "epoch": 0.3591944536150545,
966
+ "grad_norm": 13321.4970703125,
967
  "learning_rate": 9.999909525175927e-05,
968
+ "loss": 29.7589,
969
  "step": 136
970
  },
971
  {
972
  "epoch": 0.3618355893034005,
973
+ "grad_norm": 9211.091796875,
974
  "learning_rate": 9.999904429002225e-05,
975
+ "loss": 30.0795,
976
  "step": 137
977
  },
978
  {
979
  "epoch": 0.36447672499174644,
980
+ "grad_norm": 10673.2529296875,
981
  "learning_rate": 9.99989919320913e-05,
982
+ "loss": 29.8073,
983
  "step": 138
984
  },
985
  {
986
  "epoch": 0.36711786068009244,
987
+ "grad_norm": 9673.37109375,
988
  "learning_rate": 9.999893817796786e-05,
989
+ "loss": 30.8933,
990
  "step": 139
991
  },
992
  {
993
  "epoch": 0.36975899636843845,
994
+ "grad_norm": 10085.38671875,
995
  "learning_rate": 9.999888302765345e-05,
996
+ "loss": 29.8822,
997
  "step": 140
998
  },
999
  {
1000
  "epoch": 0.3724001320567844,
1001
+ "grad_norm": 10791.521484375,
1002
  "learning_rate": 9.99988264811496e-05,
1003
+ "loss": 30.1218,
1004
  "step": 141
1005
  },
1006
  {
1007
  "epoch": 0.3750412677451304,
1008
+ "grad_norm": 11358.93359375,
1009
  "learning_rate": 9.99987685384579e-05,
1010
+ "loss": 30.735,
1011
  "step": 142
1012
  },
1013
  {
1014
  "epoch": 0.3776824034334764,
1015
+ "grad_norm": 7013.380859375,
1016
  "learning_rate": 9.999870919957996e-05,
1017
+ "loss": 29.9077,
1018
  "step": 143
1019
  },
1020
  {
1021
  "epoch": 0.3803235391218224,
1022
+ "grad_norm": 7458.63525390625,
1023
  "learning_rate": 9.999864846451744e-05,
1024
+ "loss": 30.7425,
1025
  "step": 144
1026
  },
1027
  {
1028
  "epoch": 0.3829646748101684,
1029
+ "grad_norm": 8038.50732421875,
1030
  "learning_rate": 9.999858633327201e-05,
1031
+ "loss": 31.053,
1032
  "step": 145
1033
  },
1034
  {
1035
  "epoch": 0.38560581049851433,
1036
+ "grad_norm": 7841.15283203125,
1037
  "learning_rate": 9.999852280584544e-05,
1038
+ "loss": 30.7345,
1039
  "step": 146
1040
  },
1041
  {
1042
  "epoch": 0.38824694618686034,
1043
+ "grad_norm": 7719.5048828125,
1044
  "learning_rate": 9.999845788223949e-05,
1045
+ "loss": 30.9241,
1046
  "step": 147
1047
  },
1048
  {
1049
  "epoch": 0.39088808187520635,
1050
+ "grad_norm": 13179.359375,
1051
  "learning_rate": 9.999839156245598e-05,
1052
+ "loss": 31.945,
1053
  "step": 148
1054
  },
1055
  {
1056
  "epoch": 0.3935292175635523,
1057
+ "grad_norm": 11153.3046875,
1058
  "learning_rate": 9.999832384649674e-05,
1059
+ "loss": 34.644,
1060
  "step": 149
1061
  },
1062
  {
1063
  "epoch": 0.3961703532518983,
1064
+ "grad_norm": 47252.56640625,
1065
  "learning_rate": 9.999825473436369e-05,
1066
+ "loss": 39.1459,
1067
  "step": 150
1068
  },
1069
  {
1070
  "epoch": 0.3988114889402443,
1071
+ "grad_norm": 5307.408203125,
1072
  "learning_rate": 9.999818422605875e-05,
1073
+ "loss": 32.3124,
1074
  "step": 151
1075
  },
1076
  {
1077
  "epoch": 0.40145262462859027,
1078
+ "grad_norm": 8414.1484375,
1079
  "learning_rate": 9.999811232158389e-05,
1080
+ "loss": 31.5456,
1081
  "step": 152
1082
  },
1083
  {
1084
  "epoch": 0.4040937603169363,
1085
+ "grad_norm": 5779.16943359375,
1086
  "learning_rate": 9.999803902094109e-05,
1087
+ "loss": 32.0291,
1088
  "step": 153
1089
  },
1090
  {
1091
  "epoch": 0.4067348960052823,
1092
+ "grad_norm": 6989.2958984375,
1093
  "learning_rate": 9.999796432413244e-05,
1094
+ "loss": 32.4468,
1095
  "step": 154
1096
  },
1097
  {
1098
  "epoch": 0.40937603169362824,
1099
+ "grad_norm": 10169.005859375,
1100
  "learning_rate": 9.999788823116001e-05,
1101
+ "loss": 33.1476,
1102
  "step": 155
1103
  },
1104
  {
1105
  "epoch": 0.41201716738197425,
1106
+ "grad_norm": 6967.77197265625,
1107
  "learning_rate": 9.999781074202592e-05,
1108
+ "loss": 32.4884,
1109
  "step": 156
1110
  },
1111
  {
1112
  "epoch": 0.41465830307032026,
1113
+ "grad_norm": 8052.6611328125,
1114
  "learning_rate": 9.999773185673232e-05,
1115
+ "loss": 33.8162,
1116
  "step": 157
1117
  },
1118
  {
1119
  "epoch": 0.4172994387586662,
1120
+ "grad_norm": 13675.26953125,
1121
  "learning_rate": 9.999765157528145e-05,
1122
+ "loss": 33.4981,
1123
  "step": 158
1124
  },
1125
  {
1126
  "epoch": 0.4199405744470122,
1127
+ "grad_norm": 23900.8515625,
1128
  "learning_rate": 9.99975698976755e-05,
1129
+ "loss": 33.6758,
1130
  "step": 159
1131
  },
1132
  {
1133
  "epoch": 0.4225817101353582,
1134
+ "grad_norm": 8697.0146484375,
1135
  "learning_rate": 9.99974868239168e-05,
1136
+ "loss": 33.4007,
1137
  "step": 160
1138
  },
1139
  {
1140
  "epoch": 0.4252228458237042,
1141
+ "grad_norm": 7423.0234375,
1142
  "learning_rate": 9.999740235400765e-05,
1143
+ "loss": 32.8796,
1144
  "step": 161
1145
  },
1146
  {
1147
  "epoch": 0.4278639815120502,
1148
+ "grad_norm": 8968.0107421875,
1149
  "learning_rate": 9.999731648795041e-05,
1150
+ "loss": 35.1091,
1151
  "step": 162
1152
  },
1153
  {
1154
  "epoch": 0.4305051172003962,
1155
+ "grad_norm": 8960.2470703125,
1156
  "learning_rate": 9.999722922574749e-05,
1157
+ "loss": 34.6028,
1158
  "step": 163
1159
  },
1160
  {
1161
  "epoch": 0.43314625288874214,
1162
+ "grad_norm": 9324.4716796875,
1163
  "learning_rate": 9.999714056740129e-05,
1164
+ "loss": 35.0468,
1165
  "step": 164
1166
  },
1167
  {
1168
  "epoch": 0.43578738857708815,
1169
+ "grad_norm": 15031.443359375,
1170
  "learning_rate": 9.999705051291432e-05,
1171
+ "loss": 33.8078,
1172
  "step": 165
1173
  },
1174
  {
1175
  "epoch": 0.43842852426543416,
1176
+ "grad_norm": 10380.2470703125,
1177
  "learning_rate": 9.999695906228908e-05,
1178
+ "loss": 34.8672,
1179
  "step": 166
1180
  },
1181
  {
1182
  "epoch": 0.4410696599537801,
1183
+ "grad_norm": 18920.16796875,
1184
  "learning_rate": 9.999686621552813e-05,
1185
+ "loss": 34.697,
1186
  "step": 167
1187
  },
1188
  {
1189
  "epoch": 0.4437107956421261,
1190
+ "grad_norm": 17273.609375,
1191
  "learning_rate": 9.999677197263406e-05,
1192
+ "loss": 35.5471,
1193
  "step": 168
1194
  },
1195
  {
1196
  "epoch": 0.44635193133047213,
1197
+ "grad_norm": 10327.810546875,
1198
  "learning_rate": 9.999667633360952e-05,
1199
+ "loss": 33.3773,
1200
  "step": 169
1201
  },
1202
  {
1203
  "epoch": 0.4489930670188181,
1204
+ "grad_norm": 15529.2939453125,
1205
  "learning_rate": 9.999657929845714e-05,
1206
+ "loss": 35.3255,
1207
  "step": 170
1208
  },
1209
  {
1210
  "epoch": 0.4516342027071641,
1211
+ "grad_norm": 15885.65625,
1212
  "learning_rate": 9.999648086717966e-05,
1213
+ "loss": 35.0333,
1214
  "step": 171
1215
  },
1216
  {
1217
  "epoch": 0.45427533839551004,
1218
+ "grad_norm": 16440.353515625,
1219
  "learning_rate": 9.999638103977982e-05,
1220
+ "loss": 36.1782,
1221
  "step": 172
1222
  },
1223
  {
1224
  "epoch": 0.45691647408385605,
1225
+ "grad_norm": 623966.3125,
1226
  "learning_rate": 9.999627981626041e-05,
1227
+ "loss": 117.4766,
1228
  "step": 173
1229
  },
1230
  {
1231
  "epoch": 0.45955760977220206,
1232
+ "grad_norm": 443642.1875,
1233
  "learning_rate": 9.999617719662426e-05,
1234
+ "loss": 256.9298,
1235
  "step": 174
1236
  },
1237
  {
1238
  "epoch": 0.462198745460548,
1239
+ "grad_norm": 426303.78125,
1240
  "learning_rate": 9.999607318087423e-05,
1241
+ "loss": 213.1021,
1242
  "step": 175
1243
  },
1244
  {
1245
  "epoch": 0.464839881148894,
1246
+ "grad_norm": 753837.9375,
1247
  "learning_rate": 9.999596776901322e-05,
1248
+ "loss": 234.3458,
1249
  "step": 176
1250
  },
1251
  {
1252
  "epoch": 0.46748101683724,
1253
+ "grad_norm": 500841.875,
1254
  "learning_rate": 9.999586096104419e-05,
1255
+ "loss": 242.1502,
1256
  "step": 177
1257
  },
1258
  {
1259
  "epoch": 0.470122152525586,
1260
+ "grad_norm": 488348.28125,
1261
  "learning_rate": 9.99957527569701e-05,
1262
+ "loss": 259.3533,
1263
  "step": 178
1264
  },
1265
  {
1266
  "epoch": 0.472763288213932,
1267
+ "grad_norm": 599034.6875,
1268
  "learning_rate": 9.999564315679398e-05,
1269
+ "loss": 254.9457,
1270
  "step": 179
1271
  },
1272
  {
1273
  "epoch": 0.475404423902278,
1274
+ "grad_norm": 740236.3125,
1275
  "learning_rate": 9.99955321605189e-05,
1276
+ "loss": 200.2197,
1277
  "step": 180
1278
  },
1279
  {
1280
  "epoch": 0.47804555959062395,
1281
+ "grad_norm": 279145.40625,
1282
  "learning_rate": 9.999541976814796e-05,
1283
+ "loss": 211.6974,
1284
  "step": 181
1285
  },
1286
  {
1287
  "epoch": 0.48068669527896996,
1288
+ "grad_norm": 565175.5625,
1289
  "learning_rate": 9.999530597968428e-05,
1290
+ "loss": 152.1028,
1291
  "step": 182
1292
  },
1293
  {
1294
  "epoch": 0.48332783096731596,
1295
+ "grad_norm": 29268.025390625,
1296
  "learning_rate": 9.999519079513107e-05,
1297
+ "loss": 37.5746,
1298
  "step": 183
1299
  },
1300
  {
1301
  "epoch": 0.4859689666556619,
1302
+ "grad_norm": 19322.490234375,
1303
  "learning_rate": 9.999507421449151e-05,
1304
+ "loss": 38.4138,
1305
  "step": 184
1306
  },
1307
  {
1308
  "epoch": 0.4886101023440079,
1309
+ "grad_norm": 27010.8203125,
1310
  "learning_rate": 9.999495623776886e-05,
1311
+ "loss": 35.2608,
1312
  "step": 185
1313
  },
1314
  {
1315
  "epoch": 0.49125123803235393,
1316
+ "grad_norm": 25924.7890625,
1317
  "learning_rate": 9.999483686496645e-05,
1318
+ "loss": 38.389,
1319
  "step": 186
1320
  },
1321
  {
1322
  "epoch": 0.4938923737206999,
1323
+ "grad_norm": 33607.66015625,
1324
  "learning_rate": 9.999471609608757e-05,
1325
+ "loss": 35.7422,
1326
  "step": 187
1327
  },
1328
  {
1329
  "epoch": 0.4965335094090459,
1330
+ "grad_norm": 19824.349609375,
1331
  "learning_rate": 9.999459393113561e-05,
1332
+ "loss": 37.8325,
1333
  "step": 188
1334
  },
1335
  {
1336
  "epoch": 0.4991746450973919,
1337
+ "grad_norm": 16384.638671875,
1338
  "learning_rate": 9.9994470370114e-05,
1339
+ "loss": 37.3911,
1340
  "step": 189
1341
  },
1342
  {
1343
  "epoch": 0.5018157807857379,
1344
+ "grad_norm": 15732.8330078125,
1345
  "learning_rate": 9.999434541302616e-05,
1346
+ "loss": 35.9949,
1347
  "step": 190
1348
  },
1349
  {
1350
  "epoch": 0.5044569164740839,
1351
+ "grad_norm": 23623.61328125,
1352
  "learning_rate": 9.99942190598756e-05,
1353
+ "loss": 36.3237,
1354
  "step": 191
1355
  },
1356
  {
1357
  "epoch": 0.5070980521624299,
1358
+ "grad_norm": 32387.189453125,
1359
  "learning_rate": 9.999409131066583e-05,
1360
+ "loss": 36.7266,
1361
  "step": 192
1362
  },
1363
  {
1364
  "epoch": 0.5097391878507759,
1365
+ "grad_norm": 19656.185546875,
1366
  "learning_rate": 9.999396216540044e-05,
1367
+ "loss": 36.9575,
1368
  "step": 193
1369
  },
1370
  {
1371
  "epoch": 0.5123803235391218,
1372
+ "grad_norm": 20705.455078125,
1373
  "learning_rate": 9.999383162408304e-05,
1374
+ "loss": 37.2048,
1375
  "step": 194
1376
  },
1377
  {
1378
  "epoch": 0.5150214592274678,
1379
+ "grad_norm": 21470.52734375,
1380
  "learning_rate": 9.999369968671723e-05,
1381
+ "loss": 36.1668,
1382
  "step": 195
1383
  },
1384
  {
1385
  "epoch": 0.5176625949158138,
1386
+ "grad_norm": 19358.25,
1387
  "learning_rate": 9.999356635330674e-05,
1388
+ "loss": 33.7397,
1389
  "step": 196
1390
  },
1391
  {
1392
  "epoch": 0.5203037306041598,
1393
+ "grad_norm": 19253.916015625,
1394
  "learning_rate": 9.999343162385529e-05,
1395
+ "loss": 36.7927,
1396
  "step": 197
1397
  },
1398
  {
1399
  "epoch": 0.5229448662925058,
1400
+ "grad_norm": 41119.46875,
1401
  "learning_rate": 9.99932954983666e-05,
1402
+ "loss": 36.7557,
1403
  "step": 198
1404
  },
1405
  {
1406
  "epoch": 0.5255860019808518,
1407
+ "grad_norm": 23741.87109375,
1408
  "learning_rate": 9.999315797684451e-05,
1409
+ "loss": 38.4819,
1410
  "step": 199
1411
  },
1412
  {
1413
  "epoch": 0.5282271376691977,
1414
+ "grad_norm": 33874.09765625,
1415
  "learning_rate": 9.999301905929286e-05,
1416
+ "loss": 42.3858,
1417
  "step": 200
1418
  },
1419
  {
1420
  "epoch": 0.5282271376691977,
1421
+ "eval_loss": 7.334134578704834,
1422
+ "eval_runtime": 2.2174,
1423
+ "eval_samples_per_second": 223.237,
1424
+ "eval_steps_per_second": 27.961,
1425
  "step": 200
1426
  },
1427
  {
1428
  "epoch": 0.5308682733575437,
1429
+ "grad_norm": 15801.6083984375,
1430
  "learning_rate": 9.999287874571552e-05,
1431
+ "loss": 38.8128,
1432
  "step": 201
1433
  },
1434
  {
1435
  "epoch": 0.5335094090458897,
1436
+ "grad_norm": 12974.27734375,
1437
  "learning_rate": 9.99927370361164e-05,
1438
+ "loss": 38.6081,
1439
  "step": 202
1440
  },
1441
  {
1442
  "epoch": 0.5361505447342357,
1443
+ "grad_norm": 12007.9013671875,
1444
  "learning_rate": 9.999259393049947e-05,
1445
+ "loss": 37.1496,
1446
  "step": 203
1447
  },
1448
  {
1449
  "epoch": 0.5387916804225817,
1450
+ "grad_norm": 13070.220703125,
1451
  "learning_rate": 9.999244942886871e-05,
1452
+ "loss": 38.7187,
1453
  "step": 204
1454
  },
1455
  {
1456
  "epoch": 0.5414328161109278,
1457
+ "grad_norm": 16807.220703125,
1458
  "learning_rate": 9.999230353122819e-05,
1459
+ "loss": 41.07,
1460
  "step": 205
1461
  },
1462
  {
1463
  "epoch": 0.5440739517992736,
1464
+ "grad_norm": 14268.9052734375,
1465
  "learning_rate": 9.999215623758194e-05,
1466
+ "loss": 40.1817,
1467
  "step": 206
1468
  },
1469
  {
1470
  "epoch": 0.5467150874876197,
1471
+ "grad_norm": 13336.4287109375,
1472
  "learning_rate": 9.99920075479341e-05,
1473
+ "loss": 37.3859,
1474
  "step": 207
1475
  },
1476
  {
1477
  "epoch": 0.5493562231759657,
1478
+ "grad_norm": 15000.0390625,
1479
  "learning_rate": 9.999185746228882e-05,
1480
+ "loss": 37.9181,
1481
  "step": 208
1482
  },
1483
  {
1484
  "epoch": 0.5519973588643117,
1485
+ "grad_norm": 11059.775390625,
1486
  "learning_rate": 9.999170598065028e-05,
1487
+ "loss": 37.7867,
1488
  "step": 209
1489
  },
1490
  {
1491
  "epoch": 0.5546384945526577,
1492
+ "grad_norm": 12954.494140625,
1493
  "learning_rate": 9.999155310302273e-05,
1494
+ "loss": 38.3371,
1495
  "step": 210
1496
  },
1497
  {
1498
  "epoch": 0.5572796302410036,
1499
+ "grad_norm": 10920.3037109375,
1500
  "learning_rate": 9.999139882941043e-05,
1501
+ "loss": 35.1785,
1502
  "step": 211
1503
  },
1504
  {
1505
  "epoch": 0.5599207659293496,
1506
+ "grad_norm": 15022.30078125,
1507
  "learning_rate": 9.999124315981766e-05,
1508
+ "loss": 35.528,
1509
  "step": 212
1510
  },
1511
  {
1512
  "epoch": 0.5625619016176956,
1513
+ "grad_norm": 10339.8525390625,
1514
  "learning_rate": 9.999108609424881e-05,
1515
+ "loss": 34.5773,
1516
  "step": 213
1517
  },
1518
  {
1519
  "epoch": 0.5652030373060416,
1520
+ "grad_norm": 9615.1484375,
1521
  "learning_rate": 9.999092763270823e-05,
1522
+ "loss": 34.6027,
1523
  "step": 214
1524
  },
1525
  {
1526
  "epoch": 0.5678441729943876,
1527
+ "grad_norm": 13707.630859375,
1528
  "learning_rate": 9.999076777520037e-05,
1529
+ "loss": 34.8469,
1530
  "step": 215
1531
  },
1532
  {
1533
  "epoch": 0.5704853086827336,
1534
+ "grad_norm": 13718.404296875,
1535
  "learning_rate": 9.99906065217297e-05,
1536
+ "loss": 34.0409,
1537
  "step": 216
1538
  },
1539
  {
1540
  "epoch": 0.5731264443710795,
1541
+ "grad_norm": 12160.12109375,
1542
  "learning_rate": 9.99904438723007e-05,
1543
+ "loss": 32.9267,
1544
  "step": 217
1545
  },
1546
  {
1547
  "epoch": 0.5757675800594255,
1548
+ "grad_norm": 9693.056640625,
1549
  "learning_rate": 9.999027982691793e-05,
1550
+ "loss": 33.0474,
1551
  "step": 218
1552
  },
1553
  {
1554
  "epoch": 0.5784087157477715,
1555
+ "grad_norm": 14817.9755859375,
1556
  "learning_rate": 9.999011438558595e-05,
1557
+ "loss": 33.6275,
1558
  "step": 219
1559
  },
1560
  {
1561
  "epoch": 0.5810498514361175,
1562
+ "grad_norm": 12656.400390625,
1563
  "learning_rate": 9.99899475483094e-05,
1564
+ "loss": 33.9675,
1565
  "step": 220
1566
  },
1567
  {
1568
  "epoch": 0.5836909871244635,
1569
+ "grad_norm": 17197.283203125,
1570
  "learning_rate": 9.998977931509291e-05,
1571
+ "loss": 35.6857,
1572
  "step": 221
1573
  },
1574
  {
1575
  "epoch": 0.5863321228128096,
1576
+ "grad_norm": 215147.109375,
1577
  "learning_rate": 9.998960968594121e-05,
1578
+ "loss": 88.1464,
1579
  "step": 222
1580
  },
1581
  {
1582
  "epoch": 0.5889732585011554,
1583
+ "grad_norm": 625456.3125,
1584
  "learning_rate": 9.998943866085903e-05,
1585
+ "loss": 186.8345,
1586
  "step": 223
1587
  },
1588
  {
1589
  "epoch": 0.5916143941895015,
1590
+ "grad_norm": 491068.96875,
1591
  "learning_rate": 9.998926623985114e-05,
1592
+ "loss": 158.0338,
1593
  "step": 224
1594
  },
1595
  {
1596
  "epoch": 0.5942555298778475,
1597
+ "grad_norm": 626101.125,
1598
  "learning_rate": 9.998909242292235e-05,
1599
+ "loss": 218.7658,
1600
  "step": 225
1601
  },
1602
  {
1603
  "epoch": 0.5968966655661935,
1604
+ "grad_norm": 303837.34375,
1605
  "learning_rate": 9.998891721007752e-05,
1606
+ "loss": 186.0703,
1607
  "step": 226
1608
  },
1609
  {
1610
  "epoch": 0.5995378012545395,
1611
+ "grad_norm": 354231.84375,
1612
  "learning_rate": 9.998874060132155e-05,
1613
+ "loss": 162.2602,
1614
  "step": 227
1615
  },
1616
  {
1617
  "epoch": 0.6021789369428855,
1618
+ "grad_norm": 570096.0625,
1619
  "learning_rate": 9.998856259665936e-05,
1620
+ "loss": 165.2661,
1621
  "step": 228
1622
  },
1623
  {
1624
  "epoch": 0.6048200726312314,
1625
+ "grad_norm": 405688.65625,
1626
  "learning_rate": 9.998838319609591e-05,
1627
+ "loss": 159.5345,
1628
  "step": 229
1629
  },
1630
  {
1631
  "epoch": 0.6074612083195774,
1632
+ "grad_norm": 592211.125,
1633
  "learning_rate": 9.998820239963624e-05,
1634
+ "loss": 141.6046,
1635
  "step": 230
1636
  },
1637
  {
1638
  "epoch": 0.6101023440079234,
1639
+ "grad_norm": 678225.0625,
1640
  "learning_rate": 9.998802020728537e-05,
1641
+ "loss": 84.9725,
1642
  "step": 231
1643
  },
1644
  {
1645
  "epoch": 0.6127434796962694,
1646
+ "grad_norm": 22088.375,
1647
  "learning_rate": 9.998783661904843e-05,
1648
+ "loss": 38.1227,
1649
  "step": 232
1650
  },
1651
  {
1652
  "epoch": 0.6153846153846154,
1653
+ "grad_norm": 19927.962890625,
1654
  "learning_rate": 9.99876516349305e-05,
1655
+ "loss": 37.8816,
1656
  "step": 233
1657
  },
1658
  {
1659
  "epoch": 0.6180257510729614,
1660
+ "grad_norm": 33203.27734375,
1661
  "learning_rate": 9.998746525493674e-05,
1662
+ "loss": 34.0087,
1663
  "step": 234
1664
  },
1665
  {
1666
  "epoch": 0.6206668867613073,
1667
+ "grad_norm": 10135.03515625,
1668
  "learning_rate": 9.99872774790724e-05,
1669
+ "loss": 34.0175,
1670
  "step": 235
1671
  },
1672
  {
1673
  "epoch": 0.6233080224496533,
1674
+ "grad_norm": 11513.166015625,
1675
  "learning_rate": 9.99870883073427e-05,
1676
+ "loss": 32.6651,
1677
  "step": 236
1678
  },
1679
  {
1680
  "epoch": 0.6259491581379993,
1681
+ "grad_norm": 7397.00732421875,
1682
  "learning_rate": 9.998689773975291e-05,
1683
+ "loss": 32.2064,
1684
  "step": 237
1685
  },
1686
  {
1687
  "epoch": 0.6285902938263453,
1688
+ "grad_norm": 10573.4638671875,
1689
  "learning_rate": 9.998670577630838e-05,
1690
+ "loss": 32.1057,
1691
  "step": 238
1692
  },
1693
  {
1694
  "epoch": 0.6312314295146914,
1695
+ "grad_norm": 10578.8310546875,
1696
  "learning_rate": 9.998651241701445e-05,
1697
+ "loss": 32.1381,
1698
  "step": 239
1699
  },
1700
  {
1701
  "epoch": 0.6338725652030373,
1702
+ "grad_norm": 9302.189453125,
1703
  "learning_rate": 9.998631766187651e-05,
1704
+ "loss": 32.8179,
1705
  "step": 240
1706
  },
1707
  {
1708
  "epoch": 0.6365137008913833,
1709
+ "grad_norm": 8694.892578125,
1710
  "learning_rate": 9.998612151090003e-05,
1711
+ "loss": 32.7711,
1712
  "step": 241
1713
  },
1714
  {
1715
  "epoch": 0.6391548365797293,
1716
+ "grad_norm": 10467.7099609375,
1717
  "learning_rate": 9.998592396409047e-05,
1718
+ "loss": 33.1121,
1719
  "step": 242
1720
  },
1721
  {
1722
  "epoch": 0.6417959722680753,
1723
+ "grad_norm": 11832.251953125,
1724
  "learning_rate": 9.998572502145334e-05,
1725
+ "loss": 32.8568,
1726
  "step": 243
1727
  },
1728
  {
1729
  "epoch": 0.6444371079564213,
1730
+ "grad_norm": 14376.9228515625,
1731
  "learning_rate": 9.998552468299421e-05,
1732
+ "loss": 32.5907,
1733
  "step": 244
1734
  },
1735
  {
1736
  "epoch": 0.6470782436447673,
1737
+ "grad_norm": 13190.787109375,
1738
  "learning_rate": 9.998532294871866e-05,
1739
+ "loss": 32.6583,
1740
  "step": 245
1741
  },
1742
  {
1743
  "epoch": 0.6497193793331132,
1744
+ "grad_norm": 10301.1328125,
1745
  "learning_rate": 9.998511981863232e-05,
1746
+ "loss": 31.7794,
1747
  "step": 246
1748
  },
1749
  {
1750
  "epoch": 0.6523605150214592,
1751
+ "grad_norm": 18970.587890625,
1752
  "learning_rate": 9.998491529274089e-05,
1753
+ "loss": 32.5321,
1754
  "step": 247
1755
  },
1756
  {
1757
  "epoch": 0.6550016507098052,
1758
+ "grad_norm": 10323.8408203125,
1759
  "learning_rate": 9.998470937105006e-05,
1760
+ "loss": 32.6962,
1761
  "step": 248
1762
  },
1763
  {
1764
  "epoch": 0.6576427863981512,
1765
+ "grad_norm": 13553.1123046875,
1766
  "learning_rate": 9.998450205356557e-05,
1767
+ "loss": 34.1782,
1768
  "step": 249
1769
  },
1770
  {
1771
  "epoch": 0.6602839220864972,
1772
+ "grad_norm": 34080.28125,
1773
  "learning_rate": 9.998429334029323e-05,
1774
+ "loss": 37.3095,
1775
  "step": 250
1776
  },
1777
  {
1778
  "epoch": 0.6629250577748432,
1779
+ "grad_norm": 12205.15234375,
1780
  "learning_rate": 9.998408323123887e-05,
1781
+ "loss": 33.7182,
1782
  "step": 251
1783
  },
1784
  {
1785
  "epoch": 0.6655661934631891,
1786
+ "grad_norm": 11019.15234375,
1787
  "learning_rate": 9.998387172640834e-05,
1788
+ "loss": 34.2941,
1789
  "step": 252
1790
  },
1791
  {
1792
  "epoch": 0.6682073291515351,
1793
+ "grad_norm": 10185.3310546875,
1794
  "learning_rate": 9.998365882580756e-05,
1795
+ "loss": 34.5573,
1796
  "step": 253
1797
  },
1798
  {
1799
  "epoch": 0.6708484648398811,
1800
+ "grad_norm": 8710.2685546875,
1801
  "learning_rate": 9.998344452944247e-05,
1802
+ "loss": 33.6592,
1803
  "step": 254
1804
  },
1805
  {
1806
  "epoch": 0.6734896005282272,
1807
+ "grad_norm": 8050.28759765625,
1808
  "learning_rate": 9.998322883731903e-05,
1809
+ "loss": 33.1733,
1810
  "step": 255
1811
  },
1812
  {
1813
  "epoch": 0.6761307362165732,
1814
+ "grad_norm": 6891.90673828125,
1815
  "learning_rate": 9.998301174944332e-05,
1816
+ "loss": 32.2699,
1817
  "step": 256
1818
  },
1819
  {
1820
  "epoch": 0.6787718719049192,
1821
+ "grad_norm": 6904.37060546875,
1822
  "learning_rate": 9.998279326582134e-05,
1823
+ "loss": 33.2969,
1824
  "step": 257
1825
  },
1826
  {
1827
  "epoch": 0.6814130075932651,
1828
+ "grad_norm": 6681.41162109375,
1829
  "learning_rate": 9.998257338645924e-05,
1830
+ "loss": 32.5617,
1831
  "step": 258
1832
  },
1833
  {
1834
  "epoch": 0.6840541432816111,
1835
+ "grad_norm": 7499.51025390625,
1836
  "learning_rate": 9.998235211136312e-05,
1837
+ "loss": 31.2502,
1838
  "step": 259
1839
  },
1840
  {
1841
  "epoch": 0.6866952789699571,
1842
+ "grad_norm": 5850.79931640625,
1843
  "learning_rate": 9.99821294405392e-05,
1844
+ "loss": 31.384,
1845
  "step": 260
1846
  },
1847
  {
1848
  "epoch": 0.6893364146583031,
1849
+ "grad_norm": 5846.03271484375,
1850
  "learning_rate": 9.998190537399366e-05,
1851
+ "loss": 31.2545,
1852
  "step": 261
1853
  },
1854
  {
1855
  "epoch": 0.6919775503466491,
1856
+ "grad_norm": 7224.54833984375,
1857
  "learning_rate": 9.998167991173277e-05,
1858
+ "loss": 31.2568,
1859
  "step": 262
1860
  },
1861
  {
1862
  "epoch": 0.6946186860349951,
1863
+ "grad_norm": 6079.56982421875,
1864
  "learning_rate": 9.998145305376286e-05,
1865
+ "loss": 31.7204,
1866
  "step": 263
1867
  },
1868
  {
1869
  "epoch": 0.697259821723341,
1870
+ "grad_norm": 7802.859375,
1871
  "learning_rate": 9.99812248000902e-05,
1872
+ "loss": 30.3375,
1873
  "step": 264
1874
  },
1875
  {
1876
  "epoch": 0.699900957411687,
1877
+ "grad_norm": 7014.5146484375,
1878
  "learning_rate": 9.998099515072122e-05,
1879
+ "loss": 30.6416,
1880
  "step": 265
1881
  },
1882
  {
1883
  "epoch": 0.702542093100033,
1884
+ "grad_norm": 6766.64208984375,
1885
  "learning_rate": 9.998076410566229e-05,
1886
+ "loss": 30.4145,
1887
  "step": 266
1888
  },
1889
  {
1890
  "epoch": 0.705183228788379,
1891
+ "grad_norm": 6723.0986328125,
1892
  "learning_rate": 9.99805316649199e-05,
1893
+ "loss": 29.3229,
1894
  "step": 267
1895
  },
1896
  {
1897
  "epoch": 0.707824364476725,
1898
+ "grad_norm": 8847.9677734375,
1899
  "learning_rate": 9.998029782850051e-05,
1900
+ "loss": 29.2886,
1901
  "step": 268
1902
  },
1903
  {
1904
  "epoch": 0.7104655001650709,
1905
+ "grad_norm": 5896.45458984375,
1906
  "learning_rate": 9.998006259641068e-05,
1907
+ "loss": 29.5852,
1908
  "step": 269
1909
  },
1910
  {
1911
  "epoch": 0.7131066358534169,
1912
+ "grad_norm": 7112.9150390625,
1913
  "learning_rate": 9.997982596865695e-05,
1914
+ "loss": 29.5084,
1915
  "step": 270
1916
  },
1917
  {
1918
  "epoch": 0.715747771541763,
1919
+ "grad_norm": 8039.98876953125,
1920
  "learning_rate": 9.997958794524594e-05,
1921
+ "loss": 31.9893,
1922
  "step": 271
1923
  },
1924
  {
1925
  "epoch": 0.718388907230109,
1926
+ "grad_norm": 179267.265625,
1927
  "learning_rate": 9.99793485261843e-05,
1928
+ "loss": 140.9562,
1929
  "step": 272
1930
  },
1931
  {
1932
  "epoch": 0.721030042918455,
1933
+ "grad_norm": 578681.125,
1934
  "learning_rate": 9.997910771147872e-05,
1935
+ "loss": 262.198,
1936
  "step": 273
1937
  },
1938
  {
1939
  "epoch": 0.723671178606801,
1940
+ "grad_norm": 322541.34375,
1941
  "learning_rate": 9.99788655011359e-05,
1942
+ "loss": 237.3132,
1943
  "step": 274
1944
  },
1945
  {
1946
  "epoch": 0.7263123142951469,
1947
+ "grad_norm": 235946.640625,
1948
  "learning_rate": 9.997862189516263e-05,
1949
+ "loss": 300.6354,
1950
  "step": 275
1951
  },
1952
  {
1953
  "epoch": 0.7289534499834929,
1954
+ "grad_norm": 262057.515625,
1955
  "learning_rate": 9.99783768935657e-05,
1956
+ "loss": 209.6862,
1957
  "step": 276
1958
  },
1959
  {
1960
  "epoch": 0.7315945856718389,
1961
+ "grad_norm": 221274.765625,
1962
  "learning_rate": 9.997813049635195e-05,
1963
+ "loss": 208.7495,
1964
  "step": 277
1965
  },
1966
  {
1967
  "epoch": 0.7342357213601849,
1968
+ "grad_norm": 363778.46875,
1969
  "learning_rate": 9.997788270352827e-05,
1970
+ "loss": 234.0036,
1971
  "step": 278
1972
  },
1973
  {
1974
  "epoch": 0.7368768570485309,
1975
+ "grad_norm": 198016.546875,
1976
  "learning_rate": 9.997763351510157e-05,
1977
+ "loss": 221.2396,
1978
  "step": 279
1979
  },
1980
  {
1981
  "epoch": 0.7395179927368769,
1982
+ "grad_norm": 383717.4375,
1983
  "learning_rate": 9.997738293107881e-05,
1984
+ "loss": 166.7505,
1985
  "step": 280
1986
  },
1987
  {
1988
  "epoch": 0.7421591284252228,
1989
+ "grad_norm": 471310.09375,
1990
  "learning_rate": 9.9977130951467e-05,
1991
+ "loss": 155.5116,
1992
  "step": 281
1993
  },
1994
  {
1995
  "epoch": 0.7448002641135688,
1996
+ "grad_norm": 135402.15625,
1997
  "learning_rate": 9.997687757627316e-05,
1998
+ "loss": 71.9904,
1999
  "step": 282
2000
  },
2001
  {
2002
  "epoch": 0.7474413998019148,
2003
+ "grad_norm": 6735.1005859375,
2004
  "learning_rate": 9.997662280550437e-05,
2005
+ "loss": 30.8698,
2006
  "step": 283
2007
  },
2008
  {
2009
  "epoch": 0.7500825354902608,
2010
+ "grad_norm": 11189.4736328125,
2011
  "learning_rate": 9.997636663916776e-05,
2012
+ "loss": 30.6788,
2013
  "step": 284
2014
  },
2015
  {
2016
  "epoch": 0.7527236711786068,
2017
+ "grad_norm": 9472.00390625,
2018
  "learning_rate": 9.997610907727046e-05,
2019
+ "loss": 32.5548,
2020
  "step": 285
2021
  },
2022
  {
2023
  "epoch": 0.7553648068669528,
2024
+ "grad_norm": 10074.7333984375,
2025
  "learning_rate": 9.997585011981966e-05,
2026
+ "loss": 30.9945,
2027
  "step": 286
2028
  },
2029
  {
2030
  "epoch": 0.7580059425552987,
2031
+ "grad_norm": 11928.4619140625,
2032
  "learning_rate": 9.997558976682262e-05,
2033
+ "loss": 30.6684,
2034
  "step": 287
2035
  },
2036
  {
2037
  "epoch": 0.7606470782436447,
2038
+ "grad_norm": 13231.986328125,
2039
  "learning_rate": 9.997532801828658e-05,
2040
+ "loss": 30.9457,
2041
  "step": 288
2042
  },
2043
  {
2044
  "epoch": 0.7632882139319908,
2045
+ "grad_norm": 8904.8466796875,
2046
  "learning_rate": 9.997506487421888e-05,
2047
+ "loss": 31.3361,
2048
  "step": 289
2049
  },
2050
  {
2051
  "epoch": 0.7659293496203368,
2052
+ "grad_norm": 9125.240234375,
2053
  "learning_rate": 9.997480033462683e-05,
2054
+ "loss": 30.7196,
2055
  "step": 290
2056
  },
2057
  {
2058
  "epoch": 0.7685704853086828,
2059
+ "grad_norm": 9812.6181640625,
2060
  "learning_rate": 9.997453439951784e-05,
2061
+ "loss": 30.7277,
2062
  "step": 291
2063
  },
2064
  {
2065
  "epoch": 0.7712116209970287,
2066
+ "grad_norm": 7082.22607421875,
2067
  "learning_rate": 9.997426706889935e-05,
2068
+ "loss": 31.2053,
2069
  "step": 292
2070
  },
2071
  {
2072
  "epoch": 0.7738527566853747,
2073
+ "grad_norm": 9316.9384765625,
2074
  "learning_rate": 9.997399834277878e-05,
2075
+ "loss": 31.5169,
2076
  "step": 293
2077
  },
2078
  {
2079
  "epoch": 0.7764938923737207,
2080
+ "grad_norm": 19302.771484375,
2081
  "learning_rate": 9.997372822116368e-05,
2082
+ "loss": 31.651,
2083
  "step": 294
2084
  },
2085
  {
2086
  "epoch": 0.7791350280620667,
2087
+ "grad_norm": 10954.8271484375,
2088
  "learning_rate": 9.99734567040616e-05,
2089
+ "loss": 30.4,
2090
  "step": 295
2091
  },
2092
  {
2093
  "epoch": 0.7817761637504127,
2094
+ "grad_norm": 9081.9521484375,
2095
  "learning_rate": 9.997318379148007e-05,
2096
+ "loss": 30.8718,
2097
  "step": 296
2098
  },
2099
  {
2100
  "epoch": 0.7844172994387587,
2101
+ "grad_norm": 6827.958984375,
2102
  "learning_rate": 9.997290948342673e-05,
2103
+ "loss": 31.0843,
2104
  "step": 297
2105
  },
2106
  {
2107
  "epoch": 0.7870584351271046,
2108
+ "grad_norm": 10805.7939453125,
2109
  "learning_rate": 9.997263377990926e-05,
2110
+ "loss": 31.6845,
2111
  "step": 298
2112
  },
2113
  {
2114
  "epoch": 0.7896995708154506,
2115
+ "grad_norm": 11347.0078125,
2116
  "learning_rate": 9.997235668093535e-05,
2117
+ "loss": 33.4166,
2118
  "step": 299
2119
  },
2120
  {
2121
  "epoch": 0.7923407065037966,
2122
+ "grad_norm": 16983.841796875,
2123
  "learning_rate": 9.997207818651274e-05,
2124
+ "loss": 35.7603,
2125
  "step": 300
2126
  },
2127
  {
2128
  "epoch": 0.7949818421921426,
2129
+ "grad_norm": 3815.614990234375,
2130
  "learning_rate": 9.997179829664918e-05,
2131
+ "loss": 33.1237,
2132
  "step": 301
2133
  },
2134
  {
2135
  "epoch": 0.7976229778804886,
2136
+ "grad_norm": 4439.759765625,
2137
  "learning_rate": 9.997151701135253e-05,
2138
+ "loss": 32.6201,
2139
  "step": 302
2140
  },
2141
  {
2142
  "epoch": 0.8002641135688346,
2143
+ "grad_norm": 6584.0,
2144
  "learning_rate": 9.997123433063062e-05,
2145
+ "loss": 31.9738,
2146
  "step": 303
2147
  },
2148
  {
2149
  "epoch": 0.8029052492571805,
2150
+ "grad_norm": 8394.333984375,
2151
  "learning_rate": 9.997095025449134e-05,
2152
+ "loss": 34.1952,
2153
  "step": 304
2154
  },
2155
  {
2156
  "epoch": 0.8055463849455266,
2157
+ "grad_norm": 8264.888671875,
2158
  "learning_rate": 9.997066478294262e-05,
2159
+ "loss": 34.1646,
2160
  "step": 305
2161
  },
2162
  {
2163
  "epoch": 0.8081875206338726,
2164
+ "grad_norm": 6815.27587890625,
2165
  "learning_rate": 9.997037791599245e-05,
2166
+ "loss": 32.8399,
2167
  "step": 306
2168
  },
2169
  {
2170
  "epoch": 0.8108286563222186,
2171
+ "grad_norm": 6638.54296875,
2172
  "learning_rate": 9.997008965364884e-05,
2173
+ "loss": 32.737,
2174
  "step": 307
2175
  },
2176
  {
2177
  "epoch": 0.8134697920105646,
2178
+ "grad_norm": 6356.19287109375,
2179
  "learning_rate": 9.996979999591983e-05,
2180
+ "loss": 33.2864,
2181
  "step": 308
2182
  },
2183
  {
2184
  "epoch": 0.8161109276989106,
2185
+ "grad_norm": 10876.560546875,
2186
  "learning_rate": 9.996950894281349e-05,
2187
+ "loss": 32.8353,
2188
  "step": 309
2189
  },
2190
  {
2191
  "epoch": 0.8187520633872565,
2192
+ "grad_norm": 18334.380859375,
2193
  "learning_rate": 9.996921649433796e-05,
2194
+ "loss": 33.1125,
2195
  "step": 310
2196
  },
2197
  {
2198
  "epoch": 0.8213931990756025,
2199
+ "grad_norm": 5925.57080078125,
2200
  "learning_rate": 9.996892265050144e-05,
2201
+ "loss": 33.4775,
2202
  "step": 311
2203
  },
2204
  {
2205
  "epoch": 0.8240343347639485,
2206
+ "grad_norm": 5512.29541015625,
2207
  "learning_rate": 9.99686274113121e-05,
2208
+ "loss": 32.4073,
2209
  "step": 312
2210
  },
2211
  {
2212
  "epoch": 0.8266754704522945,
2213
+ "grad_norm": 6770.63232421875,
2214
  "learning_rate": 9.996833077677819e-05,
2215
+ "loss": 33.0255,
2216
  "step": 313
2217
  },
2218
  {
2219
  "epoch": 0.8293166061406405,
2220
+ "grad_norm": 9025.830078125,
2221
  "learning_rate": 9.9968032746908e-05,
2222
+ "loss": 31.7732,
2223
  "step": 314
2224
  },
2225
  {
2226
  "epoch": 0.8319577418289865,
2227
+ "grad_norm": 5815.4296875,
2228
  "learning_rate": 9.996773332170983e-05,
2229
+ "loss": 31.5946,
2230
  "step": 315
2231
  },
2232
  {
2233
  "epoch": 0.8345988775173324,
2234
+ "grad_norm": 7221.68603515625,
2235
  "learning_rate": 9.996743250119209e-05,
2236
+ "loss": 31.5973,
2237
  "step": 316
2238
  },
2239
  {
2240
  "epoch": 0.8372400132056784,
2241
+ "grad_norm": 7172.86962890625,
2242
  "learning_rate": 9.996713028536313e-05,
2243
+ "loss": 31.4948,
2244
  "step": 317
2245
  },
2246
  {
2247
  "epoch": 0.8398811488940244,
2248
+ "grad_norm": 11000.0458984375,
2249
  "learning_rate": 9.99668266742314e-05,
2250
+ "loss": 31.3127,
2251
  "step": 318
2252
  },
2253
  {
2254
  "epoch": 0.8425222845823704,
2255
+ "grad_norm": 8431.4716796875,
2256
  "learning_rate": 9.99665216678054e-05,
2257
+ "loss": 30.8608,
2258
  "step": 319
2259
  },
2260
  {
2261
  "epoch": 0.8451634202707164,
2262
+ "grad_norm": 7308.78466796875,
2263
  "learning_rate": 9.996621526609364e-05,
2264
+ "loss": 30.8716,
2265
  "step": 320
2266
  },
2267
  {
2268
  "epoch": 0.8478045559590623,
2269
+ "grad_norm": 8358.787109375,
2270
  "learning_rate": 9.996590746910467e-05,
2271
+ "loss": 31.0737,
2272
  "step": 321
2273
  },
2274
  {
2275
  "epoch": 0.8504456916474084,
2276
+ "grad_norm": 29319.46484375,
2277
  "learning_rate": 9.996559827684709e-05,
2278
+ "loss": 46.2402,
2279
  "step": 322
2280
  },
2281
  {
2282
  "epoch": 0.8530868273357544,
2283
+ "grad_norm": 903961.25,
2284
  "learning_rate": 9.996528768932951e-05,
2285
+ "loss": 161.367,
2286
  "step": 323
2287
  },
2288
  {
2289
  "epoch": 0.8557279630241004,
2290
+ "grad_norm": 436229.9375,
2291
  "learning_rate": 9.996497570656062e-05,
2292
+ "loss": 215.2534,
2293
  "step": 324
2294
  },
2295
  {
2296
  "epoch": 0.8583690987124464,
2297
+ "grad_norm": 310716.5,
2298
  "learning_rate": 9.996466232854915e-05,
2299
+ "loss": 218.9532,
2300
  "step": 325
2301
  },
2302
  {
2303
  "epoch": 0.8610102344007924,
2304
+ "grad_norm": 935038.75,
2305
  "learning_rate": 9.996434755530384e-05,
2306
+ "loss": 204.1668,
2307
  "step": 326
2308
  },
2309
  {
2310
  "epoch": 0.8636513700891383,
2311
+ "grad_norm": 577125.0625,
2312
  "learning_rate": 9.996403138683347e-05,
2313
+ "loss": 225.0228,
2314
  "step": 327
2315
  },
2316
  {
2317
  "epoch": 0.8662925057774843,
2318
+ "grad_norm": 429562.09375,
2319
  "learning_rate": 9.996371382314686e-05,
2320
+ "loss": 221.4529,
2321
  "step": 328
2322
  },
2323
  {
2324
  "epoch": 0.8689336414658303,
2325
+ "grad_norm": 469087.0625,
2326
  "learning_rate": 9.996339486425291e-05,
2327
+ "loss": 161.132,
2328
  "step": 329
2329
  },
2330
  {
2331
  "epoch": 0.8715747771541763,
2332
+ "grad_norm": 946113.1875,
2333
  "learning_rate": 9.99630745101605e-05,
2334
+ "loss": 169.4336,
2335
  "step": 330
2336
  },
2337
  {
2338
  "epoch": 0.8742159128425223,
2339
+ "grad_norm": 537740.1875,
2340
  "learning_rate": 9.996275276087859e-05,
2341
+ "loss": 166.9042,
2342
  "step": 331
2343
  },
2344
  {
2345
  "epoch": 0.8768570485308683,
2346
+ "grad_norm": 377986.5,
2347
  "learning_rate": 9.996242961641615e-05,
2348
+ "loss": 139.4483,
2349
  "step": 332
2350
  },
2351
  {
2352
  "epoch": 0.8794981842192142,
2353
+ "grad_norm": 6949.21044921875,
2354
  "learning_rate": 9.996210507678223e-05,
2355
+ "loss": 32.8323,
2356
  "step": 333
2357
  },
2358
  {
2359
  "epoch": 0.8821393199075602,
2360
+ "grad_norm": 6551.869140625,
2361
  "learning_rate": 9.996177914198586e-05,
2362
+ "loss": 31.1956,
2363
  "step": 334
2364
  },
2365
  {
2366
  "epoch": 0.8847804555959062,
2367
+ "grad_norm": 8210.8974609375,
2368
  "learning_rate": 9.996145181203615e-05,
2369
+ "loss": 30.2494,
2370
  "step": 335
2371
  },
2372
  {
2373
  "epoch": 0.8874215912842522,
2374
+ "grad_norm": 12632.7666015625,
2375
  "learning_rate": 9.996112308694225e-05,
2376
+ "loss": 30.7789,
2377
  "step": 336
2378
  },
2379
  {
2380
  "epoch": 0.8900627269725983,
2381
+ "grad_norm": 11905.80078125,
2382
  "learning_rate": 9.996079296671334e-05,
2383
+ "loss": 30.9992,
2384
  "step": 337
2385
  },
2386
  {
2387
  "epoch": 0.8927038626609443,
2388
+ "grad_norm": 11776.396484375,
2389
  "learning_rate": 9.996046145135865e-05,
2390
+ "loss": 30.6118,
2391
  "step": 338
2392
  },
2393
  {
2394
  "epoch": 0.8953449983492902,
2395
+ "grad_norm": 10494.625,
2396
  "learning_rate": 9.99601285408874e-05,
2397
+ "loss": 30.6983,
2398
  "step": 339
2399
  },
2400
  {
2401
  "epoch": 0.8979861340376362,
2402
+ "grad_norm": 8309.9296875,
2403
  "learning_rate": 9.995979423530892e-05,
2404
+ "loss": 30.6617,
2405
  "step": 340
2406
  },
2407
  {
2408
  "epoch": 0.9006272697259822,
2409
+ "grad_norm": 11482.9853515625,
2410
  "learning_rate": 9.995945853463253e-05,
2411
+ "loss": 30.5696,
2412
  "step": 341
2413
  },
2414
  {
2415
  "epoch": 0.9032684054143282,
2416
+ "grad_norm": 8950.994140625,
2417
  "learning_rate": 9.995912143886763e-05,
2418
+ "loss": 29.6077,
2419
  "step": 342
2420
  },
2421
  {
2422
  "epoch": 0.9059095411026742,
2423
+ "grad_norm": 8950.931640625,
2424
  "learning_rate": 9.995878294802357e-05,
2425
+ "loss": 30.4176,
2426
  "step": 343
2427
  },
2428
  {
2429
  "epoch": 0.9085506767910201,
2430
+ "grad_norm": 6688.57470703125,
2431
  "learning_rate": 9.995844306210988e-05,
2432
+ "loss": 29.8723,
2433
  "step": 344
2434
  },
2435
  {
2436
  "epoch": 0.9111918124793661,
2437
+ "grad_norm": 7882.67431640625,
2438
  "learning_rate": 9.995810178113599e-05,
2439
+ "loss": 30.049,
2440
  "step": 345
2441
  },
2442
  {
2443
  "epoch": 0.9138329481677121,
2444
+ "grad_norm": 9309.5625,
2445
  "learning_rate": 9.995775910511147e-05,
2446
+ "loss": 30.2998,
2447
  "step": 346
2448
  },
2449
  {
2450
  "epoch": 0.9164740838560581,
2451
+ "grad_norm": 9403.8974609375,
2452
  "learning_rate": 9.995741503404587e-05,
2453
+ "loss": 30.4171,
2454
  "step": 347
2455
  },
2456
  {
2457
  "epoch": 0.9191152195444041,
2458
+ "grad_norm": 10254.1376953125,
2459
  "learning_rate": 9.995706956794879e-05,
2460
+ "loss": 32.398,
2461
  "step": 348
2462
  },
2463
  {
2464
  "epoch": 0.9217563552327501,
2465
+ "grad_norm": 11519.509765625,
2466
  "learning_rate": 9.99567227068299e-05,
2467
+ "loss": 33.4377,
2468
  "step": 349
2469
  },
2470
  {
2471
  "epoch": 0.924397490921096,
2472
+ "grad_norm": 17227.236328125,
2473
  "learning_rate": 9.995637445069887e-05,
2474
+ "loss": 36.9788,
2475
  "step": 350
2476
  },
2477
  {
2478
  "epoch": 0.927038626609442,
2479
+ "grad_norm": 8033.53369140625,
2480
  "learning_rate": 9.995602479956545e-05,
2481
+ "loss": 32.1,
2482
  "step": 351
2483
  },
2484
  {
2485
  "epoch": 0.929679762297788,
2486
+ "grad_norm": 10333.927734375,
2487
  "learning_rate": 9.995567375343937e-05,
2488
+ "loss": 32.4024,
2489
  "step": 352
2490
  },
2491
  {
2492
  "epoch": 0.932320897986134,
2493
+ "grad_norm": 5577.73486328125,
2494
  "learning_rate": 9.995532131233044e-05,
2495
+ "loss": 33.2651,
2496
  "step": 353
2497
  },
2498
  {
2499
  "epoch": 0.93496203367448,
2500
+ "grad_norm": 5001.80615234375,
2501
  "learning_rate": 9.99549674762485e-05,
2502
+ "loss": 33.2199,
2503
  "step": 354
2504
  },
2505
  {
2506
  "epoch": 0.9376031693628261,
2507
+ "grad_norm": 6995.62255859375,
2508
  "learning_rate": 9.995461224520345e-05,
2509
+ "loss": 33.0332,
2510
  "step": 355
2511
  },
2512
  {
2513
  "epoch": 0.940244305051172,
2514
+ "grad_norm": 5345.10888671875,
2515
  "learning_rate": 9.995425561920519e-05,
2516
+ "loss": 32.4465,
2517
  "step": 356
2518
  },
2519
  {
2520
  "epoch": 0.942885440739518,
2521
+ "grad_norm": 5311.36376953125,
2522
  "learning_rate": 9.99538975982637e-05,
2523
+ "loss": 33.3183,
2524
  "step": 357
2525
  },
2526
  {
2527
  "epoch": 0.945526576427864,
2528
+ "grad_norm": 4239.72021484375,
2529
  "learning_rate": 9.995353818238895e-05,
2530
+ "loss": 30.5123,
2531
  "step": 358
2532
  },
2533
  {
2534
  "epoch": 0.94816771211621,
2535
+ "grad_norm": 6135.8544921875,
2536
  "learning_rate": 9.9953177371591e-05,
2537
+ "loss": 30.1126,
2538
  "step": 359
2539
  },
2540
  {
2541
  "epoch": 0.950808847804556,
2542
+ "grad_norm": 3885.701904296875,
2543
  "learning_rate": 9.995281516587991e-05,
2544
+ "loss": 30.1448,
2545
  "step": 360
2546
  },
2547
  {
2548
  "epoch": 0.953449983492902,
2549
+ "grad_norm": 17259.177734375,
2550
  "learning_rate": 9.99524515652658e-05,
2551
+ "loss": 30.9694,
2552
  "step": 361
2553
  },
2554
  {
2555
  "epoch": 0.9560911191812479,
2556
+ "grad_norm": 5949.1728515625,
2557
  "learning_rate": 9.995208656975884e-05,
2558
+ "loss": 30.8493,
2559
  "step": 362
2560
  },
2561
  {
2562
  "epoch": 0.9587322548695939,
2563
+ "grad_norm": 231986.453125,
2564
  "learning_rate": 9.995172017936919e-05,
2565
+ "loss": 141.9035,
2566
  "step": 363
2567
  },
2568
  {
2569
  "epoch": 0.9613733905579399,
2570
+ "grad_norm": 103330.5546875,
2571
  "learning_rate": 9.99513523941071e-05,
2572
+ "loss": 188.4911,
2573
  "step": 364
2574
  },
2575
  {
2576
  "epoch": 0.9640145262462859,
2577
+ "grad_norm": 307991.03125,
2578
  "learning_rate": 9.995098321398284e-05,
2579
+ "loss": 160.2285,
2580
  "step": 365
2581
  },
2582
  {
2583
  "epoch": 0.9666556619346319,
2584
+ "grad_norm": 190517.765625,
2585
  "learning_rate": 9.995061263900671e-05,
2586
+ "loss": 152.4148,
2587
  "step": 366
2588
  },
2589
  {
2590
  "epoch": 0.9692967976229779,
2591
+ "grad_norm": 134986.078125,
2592
  "learning_rate": 9.995024066918908e-05,
2593
+ "loss": 119.8174,
2594
  "step": 367
2595
  },
2596
  {
2597
  "epoch": 0.9719379333113238,
2598
+ "grad_norm": 215117.609375,
2599
  "learning_rate": 9.994986730454031e-05,
2600
+ "loss": 125.8479,
2601
  "step": 368
2602
  },
2603
  {
2604
  "epoch": 0.9745790689996698,
2605
+ "grad_norm": 22283.35546875,
2606
  "learning_rate": 9.994949254507084e-05,
2607
+ "loss": 34.5446,
2608
  "step": 369
2609
  },
2610
  {
2611
  "epoch": 0.9772202046880158,
2612
+ "grad_norm": 12405.2236328125,
2613
  "learning_rate": 9.994911639079112e-05,
2614
+ "loss": 35.1761,
2615
  "step": 370
2616
  },
2617
  {
2618
  "epoch": 0.9798613403763619,
2619
+ "grad_norm": 13683.02734375,
2620
  "learning_rate": 9.994873884171167e-05,
2621
+ "loss": 32.3272,
2622
  "step": 371
2623
  },
2624
  {
2625
  "epoch": 0.9825024760647079,
2626
+ "grad_norm": 8963.904296875,
2627
  "learning_rate": 9.994835989784305e-05,
2628
+ "loss": 31.0019,
2629
  "step": 372
2630
  },
2631
  {
2632
  "epoch": 0.9851436117530538,
2633
+ "grad_norm": 19926.2734375,
2634
  "learning_rate": 9.994797955919581e-05,
2635
+ "loss": 30.4514,
2636
  "step": 373
2637
  },
2638
  {
2639
  "epoch": 0.9877847474413998,
2640
+ "grad_norm": 37221.25,
2641
  "learning_rate": 9.994759782578058e-05,
2642
+ "loss": 32.0492,
2643
  "step": 374
2644
  },
2645
  {
2646
  "epoch": 0.9904258831297458,
2647
+ "grad_norm": 10019.828125,
2648
  "learning_rate": 9.994721469760801e-05,
2649
+ "loss": 31.7783,
2650
  "step": 375
2651
  },
2652
  {
2653
  "epoch": 0.9930670188180918,
2654
+ "grad_norm": 8898.4228515625,
2655
  "learning_rate": 9.994683017468883e-05,
2656
+ "loss": 30.9381,
2657
  "step": 376
2658
  },
2659
  {
2660
  "epoch": 0.9957081545064378,
2661
+ "grad_norm": 13350.8203125,
2662
  "learning_rate": 9.994644425703374e-05,
2663
+ "loss": 32.4939,
2664
  "step": 377
2665
  },
2666
  {
2667
  "epoch": 0.9983492901947838,
2668
+ "grad_norm": 29945.037109375,
2669
  "learning_rate": 9.994605694465355e-05,
2670
+ "loss": 34.0366,
2671
  "step": 378
2672
  },
2673
  {
2674
+ "epoch": 1.0009904258831297,
2675
+ "grad_norm": 19480.009765625,
2676
  "learning_rate": 9.994566823755907e-05,
2677
+ "loss": 37.1069,
2678
  "step": 379
2679
  },
2680
  {
2681
+ "epoch": 1.0036315615714757,
2682
+ "grad_norm": 4824.83544921875,
2683
  "learning_rate": 9.99452781357611e-05,
2684
+ "loss": 35.9486,
2685
  "step": 380
2686
  },
2687
  {
2688
+ "epoch": 1.0062726972598217,
2689
+ "grad_norm": 4898.34423828125,
2690
  "learning_rate": 9.994488663927062e-05,
2691
+ "loss": 34.3521,
2692
  "step": 381
2693
  },
2694
  {
2695
+ "epoch": 1.0089138329481677,
2696
+ "grad_norm": 7551.79736328125,
2697
  "learning_rate": 9.994449374809851e-05,
2698
+ "loss": 36.7028,
2699
  "step": 382
2700
  },
2701
  {
2702
+ "epoch": 1.0115549686365137,
2703
+ "grad_norm": 8357.705078125,
2704
  "learning_rate": 9.994409946225574e-05,
2705
+ "loss": 36.5134,
2706
  "step": 383
2707
  },
2708
  {
2709
+ "epoch": 1.0141961043248597,
2710
+ "grad_norm": 5780.6787109375,
2711
  "learning_rate": 9.994370378175332e-05,
2712
+ "loss": 37.3621,
2713
  "step": 384
2714
  },
2715
  {
2716
+ "epoch": 1.0168372400132057,
2717
+ "grad_norm": 5624.93896484375,
2718
  "learning_rate": 9.994330670660235e-05,
2719
+ "loss": 37.6676,
2720
  "step": 385
2721
  },
2722
  {
2723
+ "epoch": 1.0194783757015518,
2724
+ "grad_norm": 6545.541015625,
2725
  "learning_rate": 9.994290823681385e-05,
2726
+ "loss": 37.2885,
2727
  "step": 386
2728
  },
2729
  {
2730
+ "epoch": 1.0221195113898978,
2731
+ "grad_norm": 9896.431640625,
2732
  "learning_rate": 9.994250837239897e-05,
2733
+ "loss": 37.8031,
2734
  "step": 387
2735
  },
2736
  {
2737
+ "epoch": 1.0247606470782435,
2738
+ "grad_norm": 6628.89453125,
2739
  "learning_rate": 9.994210711336891e-05,
2740
+ "loss": 39.4998,
2741
  "step": 388
2742
  },
2743
  {
2744
+ "epoch": 1.0274017827665896,
2745
+ "grad_norm": 7230.349609375,
2746
  "learning_rate": 9.994170445973483e-05,
2747
+ "loss": 37.6952,
2748
  "step": 389
2749
  },
2750
  {
2751
+ "epoch": 1.0300429184549356,
2752
+ "grad_norm": 5001.923828125,
2753
  "learning_rate": 9.994130041150798e-05,
2754
+ "loss": 37.2387,
2755
  "step": 390
2756
  },
2757
  {
2758
+ "epoch": 1.0326840541432816,
2759
+ "grad_norm": 8473.236328125,
2760
  "learning_rate": 9.994089496869968e-05,
2761
+ "loss": 37.7243,
2762
  "step": 391
2763
  },
2764
  {
2765
+ "epoch": 1.0353251898316276,
2766
+ "grad_norm": 12679.2109375,
2767
  "learning_rate": 9.994048813132119e-05,
2768
+ "loss": 35.9025,
2769
  "step": 392
2770
  },
2771
  {
2772
+ "epoch": 1.0379663255199736,
2773
+ "grad_norm": 7488.9248046875,
2774
  "learning_rate": 9.994007989938392e-05,
2775
+ "loss": 36.2572,
2776
  "step": 393
2777
  },
2778
  {
2779
+ "epoch": 1.0406074612083196,
2780
+ "grad_norm": 8192.458984375,
2781
  "learning_rate": 9.993967027289927e-05,
2782
+ "loss": 38.7854,
2783
  "step": 394
2784
  },
2785
  {
2786
+ "epoch": 1.0432485968966656,
2787
+ "grad_norm": 6160.6787109375,
2788
  "learning_rate": 9.993925925187865e-05,
2789
+ "loss": 35.9352,
2790
  "step": 395
2791
  },
2792
  {
2793
+ "epoch": 1.0458897325850116,
2794
+ "grad_norm": 6419.31103515625,
2795
  "learning_rate": 9.993884683633354e-05,
2796
+ "loss": 37.7825,
2797
  "step": 396
2798
  },
2799
  {
2800
+ "epoch": 1.0485308682733576,
2801
+ "grad_norm": 8226.6005859375,
2802
  "learning_rate": 9.993843302627549e-05,
2803
+ "loss": 35.5052,
2804
  "step": 397
2805
  },
2806
  {
2807
+ "epoch": 1.0511720039617036,
2808
+ "grad_norm": 8380.81640625,
2809
  "learning_rate": 9.993801782171603e-05,
2810
+ "loss": 36.5649,
2811
  "step": 398
2812
  },
2813
  {
2814
+ "epoch": 1.0538131396500496,
2815
+ "grad_norm": 10895.78515625,
2816
  "learning_rate": 9.993760122266676e-05,
2817
+ "loss": 37.1919,
2818
  "step": 399
2819
  },
2820
  {
2821
+ "epoch": 1.0564542753383954,
2822
+ "grad_norm": 14454.5390625,
2823
  "learning_rate": 9.99371832291393e-05,
2824
+ "loss": 38.3564,
2825
  "step": 400
2826
  },
2827
  {
2828
+ "epoch": 1.0564542753383954,
2829
+ "eval_loss": 8.69857406616211,
2830
+ "eval_runtime": 2.1301,
2831
+ "eval_samples_per_second": 232.388,
2832
+ "eval_steps_per_second": 29.107,
2833
  "step": 400
2834
  }
2835
  ],
 
2850
  "attributes": {}
2851
  }
2852
  },
2853
+ "total_flos": 1043682507620352.0,
2854
+ "train_batch_size": 8,
2855
  "trial_name": null,
2856
  "trial_params": null
2857
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cebda972c95fa3f00630bd956242e3cea6f2245dbff842addee435dc764428e
3
- size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c25f947011cdcfeb2e2cd6b6797fb101f0c43353904140320fa5ceb0b490467
3
+ size 6712