CocoRoF commited on
Commit
45e087b
·
verified ·
1 Parent(s): a166413

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:203b87ab4b155942f182a48c96314677708618d17d587d13aa6a90c599683112
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0650b34e8d23b3446525371f6f73e6e00280572d8ce6b52c973f4cc138898ffe
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29fe896ba1f2f6a44409fa7b844c95428ca05929de3bae3b32438d62e2c721df
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7a0e457ec4cc9ee087e5fbed60cfc5368dc0e8fafbafd4677643d087e0a0b94
3
  size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c461c9d337dfc684e9352ec72bfa344e2f5d377f7cfc4475de9acae294dca89
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06fea830cf5ad73ec00d500ea6fb952740ac936f18e93fa2d32abde1ea3ead92
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be561d1df19be227394d8ea607c54262a06c9bf880af0aa5e04a52596a2a6cb0
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03f3e24417a59435f5a8450a4aeb0f09cc92734b5c3b45a0701b2c043c415c05
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bea02744c29f30024590ab1629a0e7b7dabbf1e8476456c2e7c5ce46dc35c28
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:041be966454b60c86af576fc1eb7f34189114689abff8f9622b947110f7334c8
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85766f6596d15a810177d77dd259d9b50588cf100ec5f8ebff5fed881d57957
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8be75d04b1ebe614241b88fd010a5dda1b7bf703c00c6ebe310ca07975830fe7
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4699833a7ab4cb692996ef7567f934c0bac79d6a067963a873f89a38e412bd48
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deec0367b7616485abab4c5d341abba8ad8319ff1b2f118048701f6b48b69fb6
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e677c668f388d20cb2a51630b38a0e9e72b08c3b4c96a7a44de0c1086265eab
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5424464334147003,
5
  "eval_steps": 2500,
6
- "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5281,6 +5281,1764 @@
5281
  "eval_samples_per_second": 1955.019,
5282
  "eval_steps_per_second": 30.548,
5283
  "step": 7500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5284
  }
5285
  ],
5286
  "logging_steps": 10,
@@ -5300,7 +7058,7 @@
5300
  "attributes": {}
5301
  }
5302
  },
5303
- "total_flos": 1.0476964175413248e+19,
5304
  "train_batch_size": 16,
5305
  "trial_name": null,
5306
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7232619112196004,
5
  "eval_steps": 2500,
6
+ "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5281
  "eval_samples_per_second": 1955.019,
5282
  "eval_steps_per_second": 30.548,
5283
  "step": 7500
5284
+ },
5285
+ {
5286
+ "epoch": 0.5431696953259199,
5287
+ "grad_norm": 17.65625,
5288
+ "learning_rate": 9.957565035397314e-07,
5289
+ "loss": 25.4949,
5290
+ "step": 7510
5291
+ },
5292
+ {
5293
+ "epoch": 0.5438929572371395,
5294
+ "grad_norm": 15.7265625,
5295
+ "learning_rate": 9.957508530783997e-07,
5296
+ "loss": 25.2191,
5297
+ "step": 7520
5298
+ },
5299
+ {
5300
+ "epoch": 0.5446162191483591,
5301
+ "grad_norm": 16.109375,
5302
+ "learning_rate": 9.957452026170676e-07,
5303
+ "loss": 25.1094,
5304
+ "step": 7530
5305
+ },
5306
+ {
5307
+ "epoch": 0.5453394810595787,
5308
+ "grad_norm": 16.703125,
5309
+ "learning_rate": 9.957395521557356e-07,
5310
+ "loss": 24.9443,
5311
+ "step": 7540
5312
+ },
5313
+ {
5314
+ "epoch": 0.5460627429707983,
5315
+ "grad_norm": 15.3515625,
5316
+ "learning_rate": 9.957339016944036e-07,
5317
+ "loss": 25.173,
5318
+ "step": 7550
5319
+ },
5320
+ {
5321
+ "epoch": 0.5467860048820179,
5322
+ "grad_norm": 16.21875,
5323
+ "learning_rate": 9.957282512330719e-07,
5324
+ "loss": 24.9756,
5325
+ "step": 7560
5326
+ },
5327
+ {
5328
+ "epoch": 0.5475092667932375,
5329
+ "grad_norm": 16.21875,
5330
+ "learning_rate": 9.9572260077174e-07,
5331
+ "loss": 24.9386,
5332
+ "step": 7570
5333
+ },
5334
+ {
5335
+ "epoch": 0.5482325287044572,
5336
+ "grad_norm": 16.109375,
5337
+ "learning_rate": 9.95716950310408e-07,
5338
+ "loss": 24.7594,
5339
+ "step": 7580
5340
+ },
5341
+ {
5342
+ "epoch": 0.5489557906156767,
5343
+ "grad_norm": 16.75,
5344
+ "learning_rate": 9.95711299849076e-07,
5345
+ "loss": 25.0118,
5346
+ "step": 7590
5347
+ },
5348
+ {
5349
+ "epoch": 0.5496790525268963,
5350
+ "grad_norm": 16.34375,
5351
+ "learning_rate": 9.957056493877443e-07,
5352
+ "loss": 24.9402,
5353
+ "step": 7600
5354
+ },
5355
+ {
5356
+ "epoch": 0.5504023144381159,
5357
+ "grad_norm": 16.5,
5358
+ "learning_rate": 9.956999989264123e-07,
5359
+ "loss": 25.188,
5360
+ "step": 7610
5361
+ },
5362
+ {
5363
+ "epoch": 0.5511255763493355,
5364
+ "grad_norm": 14.90625,
5365
+ "learning_rate": 9.956943484650805e-07,
5366
+ "loss": 25.4746,
5367
+ "step": 7620
5368
+ },
5369
+ {
5370
+ "epoch": 0.5518488382605551,
5371
+ "grad_norm": 16.265625,
5372
+ "learning_rate": 9.956886980037485e-07,
5373
+ "loss": 25.1864,
5374
+ "step": 7630
5375
+ },
5376
+ {
5377
+ "epoch": 0.5525721001717747,
5378
+ "grad_norm": 16.328125,
5379
+ "learning_rate": 9.956830475424167e-07,
5380
+ "loss": 25.0443,
5381
+ "step": 7640
5382
+ },
5383
+ {
5384
+ "epoch": 0.5532953620829943,
5385
+ "grad_norm": 15.890625,
5386
+ "learning_rate": 9.956773970810847e-07,
5387
+ "loss": 24.9956,
5388
+ "step": 7650
5389
+ },
5390
+ {
5391
+ "epoch": 0.5540186239942139,
5392
+ "grad_norm": 16.203125,
5393
+ "learning_rate": 9.956717466197527e-07,
5394
+ "loss": 25.133,
5395
+ "step": 7660
5396
+ },
5397
+ {
5398
+ "epoch": 0.5547418859054335,
5399
+ "grad_norm": 15.78125,
5400
+ "learning_rate": 9.956660961584209e-07,
5401
+ "loss": 25.1481,
5402
+ "step": 7670
5403
+ },
5404
+ {
5405
+ "epoch": 0.5554651478166531,
5406
+ "grad_norm": 16.015625,
5407
+ "learning_rate": 9.956604456970889e-07,
5408
+ "loss": 25.5133,
5409
+ "step": 7680
5410
+ },
5411
+ {
5412
+ "epoch": 0.5561884097278728,
5413
+ "grad_norm": 15.7265625,
5414
+ "learning_rate": 9.95654795235757e-07,
5415
+ "loss": 25.4187,
5416
+ "step": 7690
5417
+ },
5418
+ {
5419
+ "epoch": 0.5569116716390923,
5420
+ "grad_norm": 17.234375,
5421
+ "learning_rate": 9.95649144774425e-07,
5422
+ "loss": 24.9255,
5423
+ "step": 7700
5424
+ },
5425
+ {
5426
+ "epoch": 0.5576349335503119,
5427
+ "grad_norm": 16.4375,
5428
+ "learning_rate": 9.95643494313093e-07,
5429
+ "loss": 25.2905,
5430
+ "step": 7710
5431
+ },
5432
+ {
5433
+ "epoch": 0.5583581954615315,
5434
+ "grad_norm": 15.4296875,
5435
+ "learning_rate": 9.956378438517613e-07,
5436
+ "loss": 25.1232,
5437
+ "step": 7720
5438
+ },
5439
+ {
5440
+ "epoch": 0.5590814573727511,
5441
+ "grad_norm": 16.328125,
5442
+ "learning_rate": 9.956321933904293e-07,
5443
+ "loss": 25.0427,
5444
+ "step": 7730
5445
+ },
5446
+ {
5447
+ "epoch": 0.5598047192839707,
5448
+ "grad_norm": 16.203125,
5449
+ "learning_rate": 9.956265429290975e-07,
5450
+ "loss": 24.8122,
5451
+ "step": 7740
5452
+ },
5453
+ {
5454
+ "epoch": 0.5605279811951903,
5455
+ "grad_norm": 16.265625,
5456
+ "learning_rate": 9.956208924677655e-07,
5457
+ "loss": 24.9768,
5458
+ "step": 7750
5459
+ },
5460
+ {
5461
+ "epoch": 0.56125124310641,
5462
+ "grad_norm": 15.9765625,
5463
+ "learning_rate": 9.956152420064335e-07,
5464
+ "loss": 25.1609,
5465
+ "step": 7760
5466
+ },
5467
+ {
5468
+ "epoch": 0.5619745050176295,
5469
+ "grad_norm": 15.4765625,
5470
+ "learning_rate": 9.956095915451017e-07,
5471
+ "loss": 24.8014,
5472
+ "step": 7770
5473
+ },
5474
+ {
5475
+ "epoch": 0.5626977669288491,
5476
+ "grad_norm": 15.8125,
5477
+ "learning_rate": 9.956039410837697e-07,
5478
+ "loss": 25.1054,
5479
+ "step": 7780
5480
+ },
5481
+ {
5482
+ "epoch": 0.5634210288400687,
5483
+ "grad_norm": 15.953125,
5484
+ "learning_rate": 9.95598290622438e-07,
5485
+ "loss": 25.3523,
5486
+ "step": 7790
5487
+ },
5488
+ {
5489
+ "epoch": 0.5641442907512884,
5490
+ "grad_norm": 16.71875,
5491
+ "learning_rate": 9.95592640161106e-07,
5492
+ "loss": 25.1315,
5493
+ "step": 7800
5494
+ },
5495
+ {
5496
+ "epoch": 0.5648675526625079,
5497
+ "grad_norm": 17.046875,
5498
+ "learning_rate": 9.95586989699774e-07,
5499
+ "loss": 25.2146,
5500
+ "step": 7810
5501
+ },
5502
+ {
5503
+ "epoch": 0.5655908145737275,
5504
+ "grad_norm": 16.84375,
5505
+ "learning_rate": 9.955813392384421e-07,
5506
+ "loss": 24.9596,
5507
+ "step": 7820
5508
+ },
5509
+ {
5510
+ "epoch": 0.5663140764849471,
5511
+ "grad_norm": 17.671875,
5512
+ "learning_rate": 9.955756887771101e-07,
5513
+ "loss": 25.124,
5514
+ "step": 7830
5515
+ },
5516
+ {
5517
+ "epoch": 0.5670373383961667,
5518
+ "grad_norm": 16.28125,
5519
+ "learning_rate": 9.955700383157783e-07,
5520
+ "loss": 24.9963,
5521
+ "step": 7840
5522
+ },
5523
+ {
5524
+ "epoch": 0.5677606003073863,
5525
+ "grad_norm": 15.890625,
5526
+ "learning_rate": 9.955643878544463e-07,
5527
+ "loss": 25.4564,
5528
+ "step": 7850
5529
+ },
5530
+ {
5531
+ "epoch": 0.5684838622186059,
5532
+ "grad_norm": 17.0625,
5533
+ "learning_rate": 9.955587373931145e-07,
5534
+ "loss": 25.0387,
5535
+ "step": 7860
5536
+ },
5537
+ {
5538
+ "epoch": 0.5692071241298255,
5539
+ "grad_norm": 16.625,
5540
+ "learning_rate": 9.955530869317825e-07,
5541
+ "loss": 25.2127,
5542
+ "step": 7870
5543
+ },
5544
+ {
5545
+ "epoch": 0.5699303860410451,
5546
+ "grad_norm": 16.203125,
5547
+ "learning_rate": 9.955474364704505e-07,
5548
+ "loss": 25.1584,
5549
+ "step": 7880
5550
+ },
5551
+ {
5552
+ "epoch": 0.5706536479522647,
5553
+ "grad_norm": 16.515625,
5554
+ "learning_rate": 9.955417860091187e-07,
5555
+ "loss": 25.2296,
5556
+ "step": 7890
5557
+ },
5558
+ {
5559
+ "epoch": 0.5713769098634843,
5560
+ "grad_norm": 16.5,
5561
+ "learning_rate": 9.955361355477867e-07,
5562
+ "loss": 25.0436,
5563
+ "step": 7900
5564
+ },
5565
+ {
5566
+ "epoch": 0.572100171774704,
5567
+ "grad_norm": 16.0625,
5568
+ "learning_rate": 9.95530485086455e-07,
5569
+ "loss": 25.4235,
5570
+ "step": 7910
5571
+ },
5572
+ {
5573
+ "epoch": 0.5728234336859235,
5574
+ "grad_norm": 16.640625,
5575
+ "learning_rate": 9.95524834625123e-07,
5576
+ "loss": 25.1063,
5577
+ "step": 7920
5578
+ },
5579
+ {
5580
+ "epoch": 0.5735466955971431,
5581
+ "grad_norm": 16.484375,
5582
+ "learning_rate": 9.95519184163791e-07,
5583
+ "loss": 24.9058,
5584
+ "step": 7930
5585
+ },
5586
+ {
5587
+ "epoch": 0.5742699575083627,
5588
+ "grad_norm": 15.8671875,
5589
+ "learning_rate": 9.955135337024591e-07,
5590
+ "loss": 24.9924,
5591
+ "step": 7940
5592
+ },
5593
+ {
5594
+ "epoch": 0.5749932194195824,
5595
+ "grad_norm": 16.296875,
5596
+ "learning_rate": 9.955078832411271e-07,
5597
+ "loss": 24.8562,
5598
+ "step": 7950
5599
+ },
5600
+ {
5601
+ "epoch": 0.5757164813308019,
5602
+ "grad_norm": 16.390625,
5603
+ "learning_rate": 9.955022327797954e-07,
5604
+ "loss": 25.0083,
5605
+ "step": 7960
5606
+ },
5607
+ {
5608
+ "epoch": 0.5764397432420215,
5609
+ "grad_norm": 17.484375,
5610
+ "learning_rate": 9.954965823184633e-07,
5611
+ "loss": 24.5593,
5612
+ "step": 7970
5613
+ },
5614
+ {
5615
+ "epoch": 0.5771630051532411,
5616
+ "grad_norm": 17.234375,
5617
+ "learning_rate": 9.954909318571313e-07,
5618
+ "loss": 25.0094,
5619
+ "step": 7980
5620
+ },
5621
+ {
5622
+ "epoch": 0.5778862670644607,
5623
+ "grad_norm": 16.34375,
5624
+ "learning_rate": 9.954852813957996e-07,
5625
+ "loss": 25.4218,
5626
+ "step": 7990
5627
+ },
5628
+ {
5629
+ "epoch": 0.5786095289756803,
5630
+ "grad_norm": 16.515625,
5631
+ "learning_rate": 9.954796309344676e-07,
5632
+ "loss": 24.8672,
5633
+ "step": 8000
5634
+ },
5635
+ {
5636
+ "epoch": 0.5793327908868999,
5637
+ "grad_norm": 17.0,
5638
+ "learning_rate": 9.954739804731358e-07,
5639
+ "loss": 25.1298,
5640
+ "step": 8010
5641
+ },
5642
+ {
5643
+ "epoch": 0.5800560527981196,
5644
+ "grad_norm": 15.703125,
5645
+ "learning_rate": 9.954683300118038e-07,
5646
+ "loss": 24.8071,
5647
+ "step": 8020
5648
+ },
5649
+ {
5650
+ "epoch": 0.5807793147093391,
5651
+ "grad_norm": 15.8125,
5652
+ "learning_rate": 9.954626795504718e-07,
5653
+ "loss": 25.4645,
5654
+ "step": 8030
5655
+ },
5656
+ {
5657
+ "epoch": 0.5815025766205587,
5658
+ "grad_norm": 16.484375,
5659
+ "learning_rate": 9.9545702908914e-07,
5660
+ "loss": 25.2866,
5661
+ "step": 8040
5662
+ },
5663
+ {
5664
+ "epoch": 0.5822258385317783,
5665
+ "grad_norm": 16.59375,
5666
+ "learning_rate": 9.95451378627808e-07,
5667
+ "loss": 25.0499,
5668
+ "step": 8050
5669
+ },
5670
+ {
5671
+ "epoch": 0.582949100442998,
5672
+ "grad_norm": 15.9453125,
5673
+ "learning_rate": 9.954457281664762e-07,
5674
+ "loss": 25.265,
5675
+ "step": 8060
5676
+ },
5677
+ {
5678
+ "epoch": 0.5836723623542175,
5679
+ "grad_norm": 16.75,
5680
+ "learning_rate": 9.954400777051442e-07,
5681
+ "loss": 25.1575,
5682
+ "step": 8070
5683
+ },
5684
+ {
5685
+ "epoch": 0.5843956242654371,
5686
+ "grad_norm": 16.796875,
5687
+ "learning_rate": 9.954344272438124e-07,
5688
+ "loss": 25.0611,
5689
+ "step": 8080
5690
+ },
5691
+ {
5692
+ "epoch": 0.5851188861766567,
5693
+ "grad_norm": 16.328125,
5694
+ "learning_rate": 9.954287767824804e-07,
5695
+ "loss": 25.374,
5696
+ "step": 8090
5697
+ },
5698
+ {
5699
+ "epoch": 0.5858421480878763,
5700
+ "grad_norm": 15.5390625,
5701
+ "learning_rate": 9.954231263211484e-07,
5702
+ "loss": 24.4109,
5703
+ "step": 8100
5704
+ },
5705
+ {
5706
+ "epoch": 0.5865654099990959,
5707
+ "grad_norm": 15.9453125,
5708
+ "learning_rate": 9.954174758598166e-07,
5709
+ "loss": 25.0325,
5710
+ "step": 8110
5711
+ },
5712
+ {
5713
+ "epoch": 0.5872886719103155,
5714
+ "grad_norm": 16.796875,
5715
+ "learning_rate": 9.954118253984846e-07,
5716
+ "loss": 25.0507,
5717
+ "step": 8120
5718
+ },
5719
+ {
5720
+ "epoch": 0.5880119338215352,
5721
+ "grad_norm": 16.390625,
5722
+ "learning_rate": 9.954061749371528e-07,
5723
+ "loss": 25.0707,
5724
+ "step": 8130
5725
+ },
5726
+ {
5727
+ "epoch": 0.5887351957327547,
5728
+ "grad_norm": 16.546875,
5729
+ "learning_rate": 9.954005244758208e-07,
5730
+ "loss": 25.5378,
5731
+ "step": 8140
5732
+ },
5733
+ {
5734
+ "epoch": 0.5894584576439743,
5735
+ "grad_norm": 16.0625,
5736
+ "learning_rate": 9.953948740144888e-07,
5737
+ "loss": 25.1591,
5738
+ "step": 8150
5739
+ },
5740
+ {
5741
+ "epoch": 0.5901817195551939,
5742
+ "grad_norm": 17.171875,
5743
+ "learning_rate": 9.95389223553157e-07,
5744
+ "loss": 25.3384,
5745
+ "step": 8160
5746
+ },
5747
+ {
5748
+ "epoch": 0.5909049814664136,
5749
+ "grad_norm": 16.5,
5750
+ "learning_rate": 9.95383573091825e-07,
5751
+ "loss": 25.1535,
5752
+ "step": 8170
5753
+ },
5754
+ {
5755
+ "epoch": 0.5916282433776331,
5756
+ "grad_norm": 15.65625,
5757
+ "learning_rate": 9.953779226304932e-07,
5758
+ "loss": 24.9001,
5759
+ "step": 8180
5760
+ },
5761
+ {
5762
+ "epoch": 0.5923515052888527,
5763
+ "grad_norm": 16.375,
5764
+ "learning_rate": 9.953722721691612e-07,
5765
+ "loss": 25.1227,
5766
+ "step": 8190
5767
+ },
5768
+ {
5769
+ "epoch": 0.5930747672000724,
5770
+ "grad_norm": 16.078125,
5771
+ "learning_rate": 9.953666217078292e-07,
5772
+ "loss": 25.1242,
5773
+ "step": 8200
5774
+ },
5775
+ {
5776
+ "epoch": 0.593798029111292,
5777
+ "grad_norm": 17.21875,
5778
+ "learning_rate": 9.953609712464974e-07,
5779
+ "loss": 25.037,
5780
+ "step": 8210
5781
+ },
5782
+ {
5783
+ "epoch": 0.5945212910225115,
5784
+ "grad_norm": 17.265625,
5785
+ "learning_rate": 9.953553207851654e-07,
5786
+ "loss": 25.1524,
5787
+ "step": 8220
5788
+ },
5789
+ {
5790
+ "epoch": 0.5952445529337311,
5791
+ "grad_norm": 17.125,
5792
+ "learning_rate": 9.953496703238336e-07,
5793
+ "loss": 25.221,
5794
+ "step": 8230
5795
+ },
5796
+ {
5797
+ "epoch": 0.5959678148449508,
5798
+ "grad_norm": 17.109375,
5799
+ "learning_rate": 9.953440198625016e-07,
5800
+ "loss": 24.8512,
5801
+ "step": 8240
5802
+ },
5803
+ {
5804
+ "epoch": 0.5966910767561703,
5805
+ "grad_norm": 16.03125,
5806
+ "learning_rate": 9.953383694011696e-07,
5807
+ "loss": 25.1353,
5808
+ "step": 8250
5809
+ },
5810
+ {
5811
+ "epoch": 0.5974143386673899,
5812
+ "grad_norm": 15.09375,
5813
+ "learning_rate": 9.953327189398378e-07,
5814
+ "loss": 24.9567,
5815
+ "step": 8260
5816
+ },
5817
+ {
5818
+ "epoch": 0.5981376005786095,
5819
+ "grad_norm": 17.640625,
5820
+ "learning_rate": 9.953270684785058e-07,
5821
+ "loss": 24.9228,
5822
+ "step": 8270
5823
+ },
5824
+ {
5825
+ "epoch": 0.5988608624898292,
5826
+ "grad_norm": 15.8984375,
5827
+ "learning_rate": 9.95321418017174e-07,
5828
+ "loss": 25.0197,
5829
+ "step": 8280
5830
+ },
5831
+ {
5832
+ "epoch": 0.5995841244010487,
5833
+ "grad_norm": 17.03125,
5834
+ "learning_rate": 9.95315767555842e-07,
5835
+ "loss": 25.3745,
5836
+ "step": 8290
5837
+ },
5838
+ {
5839
+ "epoch": 0.6003073863122683,
5840
+ "grad_norm": 16.453125,
5841
+ "learning_rate": 9.953101170945102e-07,
5842
+ "loss": 25.306,
5843
+ "step": 8300
5844
+ },
5845
+ {
5846
+ "epoch": 0.601030648223488,
5847
+ "grad_norm": 14.90625,
5848
+ "learning_rate": 9.953044666331782e-07,
5849
+ "loss": 24.8696,
5850
+ "step": 8310
5851
+ },
5852
+ {
5853
+ "epoch": 0.6017539101347076,
5854
+ "grad_norm": 15.59375,
5855
+ "learning_rate": 9.952988161718462e-07,
5856
+ "loss": 25.2775,
5857
+ "step": 8320
5858
+ },
5859
+ {
5860
+ "epoch": 0.6024771720459271,
5861
+ "grad_norm": 15.8515625,
5862
+ "learning_rate": 9.952931657105144e-07,
5863
+ "loss": 24.8399,
5864
+ "step": 8330
5865
+ },
5866
+ {
5867
+ "epoch": 0.6032004339571467,
5868
+ "grad_norm": 16.484375,
5869
+ "learning_rate": 9.952875152491824e-07,
5870
+ "loss": 24.971,
5871
+ "step": 8340
5872
+ },
5873
+ {
5874
+ "epoch": 0.6039236958683664,
5875
+ "grad_norm": 16.0625,
5876
+ "learning_rate": 9.952818647878506e-07,
5877
+ "loss": 25.0859,
5878
+ "step": 8350
5879
+ },
5880
+ {
5881
+ "epoch": 0.6046469577795859,
5882
+ "grad_norm": 16.3125,
5883
+ "learning_rate": 9.952762143265186e-07,
5884
+ "loss": 25.063,
5885
+ "step": 8360
5886
+ },
5887
+ {
5888
+ "epoch": 0.6053702196908055,
5889
+ "grad_norm": 15.828125,
5890
+ "learning_rate": 9.952705638651866e-07,
5891
+ "loss": 25.029,
5892
+ "step": 8370
5893
+ },
5894
+ {
5895
+ "epoch": 0.6060934816020251,
5896
+ "grad_norm": 16.984375,
5897
+ "learning_rate": 9.952649134038548e-07,
5898
+ "loss": 24.7311,
5899
+ "step": 8380
5900
+ },
5901
+ {
5902
+ "epoch": 0.6068167435132448,
5903
+ "grad_norm": 15.703125,
5904
+ "learning_rate": 9.952592629425228e-07,
5905
+ "loss": 24.7605,
5906
+ "step": 8390
5907
+ },
5908
+ {
5909
+ "epoch": 0.6075400054244643,
5910
+ "grad_norm": 16.140625,
5911
+ "learning_rate": 9.95253612481191e-07,
5912
+ "loss": 25.1468,
5913
+ "step": 8400
5914
+ },
5915
+ {
5916
+ "epoch": 0.6082632673356839,
5917
+ "grad_norm": 15.5546875,
5918
+ "learning_rate": 9.95247962019859e-07,
5919
+ "loss": 24.8656,
5920
+ "step": 8410
5921
+ },
5922
+ {
5923
+ "epoch": 0.6089865292469036,
5924
+ "grad_norm": 16.1875,
5925
+ "learning_rate": 9.95242311558527e-07,
5926
+ "loss": 24.9105,
5927
+ "step": 8420
5928
+ },
5929
+ {
5930
+ "epoch": 0.6097097911581232,
5931
+ "grad_norm": 16.03125,
5932
+ "learning_rate": 9.952366610971953e-07,
5933
+ "loss": 25.0424,
5934
+ "step": 8430
5935
+ },
5936
+ {
5937
+ "epoch": 0.6104330530693427,
5938
+ "grad_norm": 16.28125,
5939
+ "learning_rate": 9.952310106358632e-07,
5940
+ "loss": 25.3547,
5941
+ "step": 8440
5942
+ },
5943
+ {
5944
+ "epoch": 0.6111563149805623,
5945
+ "grad_norm": 15.5625,
5946
+ "learning_rate": 9.952253601745315e-07,
5947
+ "loss": 24.777,
5948
+ "step": 8450
5949
+ },
5950
+ {
5951
+ "epoch": 0.611879576891782,
5952
+ "grad_norm": 17.03125,
5953
+ "learning_rate": 9.952197097131995e-07,
5954
+ "loss": 24.958,
5955
+ "step": 8460
5956
+ },
5957
+ {
5958
+ "epoch": 0.6126028388030015,
5959
+ "grad_norm": 16.703125,
5960
+ "learning_rate": 9.952140592518675e-07,
5961
+ "loss": 25.1826,
5962
+ "step": 8470
5963
+ },
5964
+ {
5965
+ "epoch": 0.6133261007142211,
5966
+ "grad_norm": 16.078125,
5967
+ "learning_rate": 9.952084087905357e-07,
5968
+ "loss": 25.2109,
5969
+ "step": 8480
5970
+ },
5971
+ {
5972
+ "epoch": 0.6140493626254407,
5973
+ "grad_norm": 15.921875,
5974
+ "learning_rate": 9.952027583292037e-07,
5975
+ "loss": 25.3674,
5976
+ "step": 8490
5977
+ },
5978
+ {
5979
+ "epoch": 0.6147726245366604,
5980
+ "grad_norm": 15.6875,
5981
+ "learning_rate": 9.951971078678719e-07,
5982
+ "loss": 25.07,
5983
+ "step": 8500
5984
+ },
5985
+ {
5986
+ "epoch": 0.6154958864478799,
5987
+ "grad_norm": 16.0625,
5988
+ "learning_rate": 9.951914574065399e-07,
5989
+ "loss": 24.7706,
5990
+ "step": 8510
5991
+ },
5992
+ {
5993
+ "epoch": 0.6162191483590995,
5994
+ "grad_norm": 16.25,
5995
+ "learning_rate": 9.95185806945208e-07,
5996
+ "loss": 25.121,
5997
+ "step": 8520
5998
+ },
5999
+ {
6000
+ "epoch": 0.6169424102703192,
6001
+ "grad_norm": 16.875,
6002
+ "learning_rate": 9.95180156483876e-07,
6003
+ "loss": 24.9344,
6004
+ "step": 8530
6005
+ },
6006
+ {
6007
+ "epoch": 0.6176656721815388,
6008
+ "grad_norm": 15.8671875,
6009
+ "learning_rate": 9.95174506022544e-07,
6010
+ "loss": 25.3272,
6011
+ "step": 8540
6012
+ },
6013
+ {
6014
+ "epoch": 0.6183889340927583,
6015
+ "grad_norm": 16.484375,
6016
+ "learning_rate": 9.951688555612123e-07,
6017
+ "loss": 25.1814,
6018
+ "step": 8550
6019
+ },
6020
+ {
6021
+ "epoch": 0.6191121960039779,
6022
+ "grad_norm": 15.4921875,
6023
+ "learning_rate": 9.951632050998805e-07,
6024
+ "loss": 24.6664,
6025
+ "step": 8560
6026
+ },
6027
+ {
6028
+ "epoch": 0.6198354579151976,
6029
+ "grad_norm": 17.25,
6030
+ "learning_rate": 9.951575546385485e-07,
6031
+ "loss": 25.6137,
6032
+ "step": 8570
6033
+ },
6034
+ {
6035
+ "epoch": 0.6205587198264172,
6036
+ "grad_norm": 16.984375,
6037
+ "learning_rate": 9.951519041772165e-07,
6038
+ "loss": 25.0584,
6039
+ "step": 8580
6040
+ },
6041
+ {
6042
+ "epoch": 0.6212819817376367,
6043
+ "grad_norm": 15.4765625,
6044
+ "learning_rate": 9.951462537158845e-07,
6045
+ "loss": 25.0154,
6046
+ "step": 8590
6047
+ },
6048
+ {
6049
+ "epoch": 0.6220052436488563,
6050
+ "grad_norm": 16.796875,
6051
+ "learning_rate": 9.951406032545527e-07,
6052
+ "loss": 24.9978,
6053
+ "step": 8600
6054
+ },
6055
+ {
6056
+ "epoch": 0.622728505560076,
6057
+ "grad_norm": 15.6640625,
6058
+ "learning_rate": 9.95134952793221e-07,
6059
+ "loss": 25.5721,
6060
+ "step": 8610
6061
+ },
6062
+ {
6063
+ "epoch": 0.6234517674712955,
6064
+ "grad_norm": 16.640625,
6065
+ "learning_rate": 9.95129302331889e-07,
6066
+ "loss": 25.3643,
6067
+ "step": 8620
6068
+ },
6069
+ {
6070
+ "epoch": 0.6241750293825151,
6071
+ "grad_norm": 16.140625,
6072
+ "learning_rate": 9.951236518705569e-07,
6073
+ "loss": 25.4492,
6074
+ "step": 8630
6075
+ },
6076
+ {
6077
+ "epoch": 0.6248982912937348,
6078
+ "grad_norm": 16.25,
6079
+ "learning_rate": 9.951180014092249e-07,
6080
+ "loss": 24.9718,
6081
+ "step": 8640
6082
+ },
6083
+ {
6084
+ "epoch": 0.6256215532049544,
6085
+ "grad_norm": 16.65625,
6086
+ "learning_rate": 9.95112350947893e-07,
6087
+ "loss": 24.6082,
6088
+ "step": 8650
6089
+ },
6090
+ {
6091
+ "epoch": 0.6263448151161739,
6092
+ "grad_norm": 15.8828125,
6093
+ "learning_rate": 9.95106700486561e-07,
6094
+ "loss": 24.9935,
6095
+ "step": 8660
6096
+ },
6097
+ {
6098
+ "epoch": 0.6270680770273935,
6099
+ "grad_norm": 15.328125,
6100
+ "learning_rate": 9.951010500252293e-07,
6101
+ "loss": 25.0441,
6102
+ "step": 8670
6103
+ },
6104
+ {
6105
+ "epoch": 0.6277913389386132,
6106
+ "grad_norm": 15.6015625,
6107
+ "learning_rate": 9.950953995638973e-07,
6108
+ "loss": 24.6859,
6109
+ "step": 8680
6110
+ },
6111
+ {
6112
+ "epoch": 0.6285146008498328,
6113
+ "grad_norm": 16.03125,
6114
+ "learning_rate": 9.950897491025653e-07,
6115
+ "loss": 24.9053,
6116
+ "step": 8690
6117
+ },
6118
+ {
6119
+ "epoch": 0.6292378627610523,
6120
+ "grad_norm": 16.1875,
6121
+ "learning_rate": 9.950840986412335e-07,
6122
+ "loss": 24.9122,
6123
+ "step": 8700
6124
+ },
6125
+ {
6126
+ "epoch": 0.6299611246722719,
6127
+ "grad_norm": 16.984375,
6128
+ "learning_rate": 9.950784481799015e-07,
6129
+ "loss": 25.4552,
6130
+ "step": 8710
6131
+ },
6132
+ {
6133
+ "epoch": 0.6306843865834916,
6134
+ "grad_norm": 15.515625,
6135
+ "learning_rate": 9.950727977185697e-07,
6136
+ "loss": 24.8516,
6137
+ "step": 8720
6138
+ },
6139
+ {
6140
+ "epoch": 0.6314076484947111,
6141
+ "grad_norm": 15.8125,
6142
+ "learning_rate": 9.950671472572377e-07,
6143
+ "loss": 24.9679,
6144
+ "step": 8730
6145
+ },
6146
+ {
6147
+ "epoch": 0.6321309104059307,
6148
+ "grad_norm": 16.375,
6149
+ "learning_rate": 9.95061496795906e-07,
6150
+ "loss": 25.5788,
6151
+ "step": 8740
6152
+ },
6153
+ {
6154
+ "epoch": 0.6328541723171504,
6155
+ "grad_norm": 17.078125,
6156
+ "learning_rate": 9.95055846334574e-07,
6157
+ "loss": 25.3259,
6158
+ "step": 8750
6159
+ },
6160
+ {
6161
+ "epoch": 0.63357743422837,
6162
+ "grad_norm": 15.7109375,
6163
+ "learning_rate": 9.95050195873242e-07,
6164
+ "loss": 24.7144,
6165
+ "step": 8760
6166
+ },
6167
+ {
6168
+ "epoch": 0.6343006961395895,
6169
+ "grad_norm": 16.015625,
6170
+ "learning_rate": 9.950445454119101e-07,
6171
+ "loss": 24.7972,
6172
+ "step": 8770
6173
+ },
6174
+ {
6175
+ "epoch": 0.6350239580508091,
6176
+ "grad_norm": 16.828125,
6177
+ "learning_rate": 9.950388949505783e-07,
6178
+ "loss": 25.0281,
6179
+ "step": 8780
6180
+ },
6181
+ {
6182
+ "epoch": 0.6357472199620288,
6183
+ "grad_norm": 16.65625,
6184
+ "learning_rate": 9.950332444892463e-07,
6185
+ "loss": 24.8294,
6186
+ "step": 8790
6187
+ },
6188
+ {
6189
+ "epoch": 0.6364704818732484,
6190
+ "grad_norm": 16.90625,
6191
+ "learning_rate": 9.950275940279143e-07,
6192
+ "loss": 25.2674,
6193
+ "step": 8800
6194
+ },
6195
+ {
6196
+ "epoch": 0.6371937437844679,
6197
+ "grad_norm": 16.5,
6198
+ "learning_rate": 9.950219435665823e-07,
6199
+ "loss": 25.1726,
6200
+ "step": 8810
6201
+ },
6202
+ {
6203
+ "epoch": 0.6379170056956875,
6204
+ "grad_norm": 17.265625,
6205
+ "learning_rate": 9.950162931052505e-07,
6206
+ "loss": 24.7559,
6207
+ "step": 8820
6208
+ },
6209
+ {
6210
+ "epoch": 0.6386402676069072,
6211
+ "grad_norm": 15.828125,
6212
+ "learning_rate": 9.950106426439187e-07,
6213
+ "loss": 25.1427,
6214
+ "step": 8830
6215
+ },
6216
+ {
6217
+ "epoch": 0.6393635295181268,
6218
+ "grad_norm": 16.796875,
6219
+ "learning_rate": 9.950049921825867e-07,
6220
+ "loss": 24.9646,
6221
+ "step": 8840
6222
+ },
6223
+ {
6224
+ "epoch": 0.6400867914293463,
6225
+ "grad_norm": 15.2265625,
6226
+ "learning_rate": 9.949993417212547e-07,
6227
+ "loss": 25.1181,
6228
+ "step": 8850
6229
+ },
6230
+ {
6231
+ "epoch": 0.640810053340566,
6232
+ "grad_norm": 15.84375,
6233
+ "learning_rate": 9.949936912599227e-07,
6234
+ "loss": 24.9643,
6235
+ "step": 8860
6236
+ },
6237
+ {
6238
+ "epoch": 0.6415333152517856,
6239
+ "grad_norm": 15.3515625,
6240
+ "learning_rate": 9.94988040798591e-07,
6241
+ "loss": 25.2106,
6242
+ "step": 8870
6243
+ },
6244
+ {
6245
+ "epoch": 0.6422565771630051,
6246
+ "grad_norm": 16.484375,
6247
+ "learning_rate": 9.949823903372592e-07,
6248
+ "loss": 25.0076,
6249
+ "step": 8880
6250
+ },
6251
+ {
6252
+ "epoch": 0.6429798390742247,
6253
+ "grad_norm": 16.953125,
6254
+ "learning_rate": 9.949767398759272e-07,
6255
+ "loss": 25.3634,
6256
+ "step": 8890
6257
+ },
6258
+ {
6259
+ "epoch": 0.6437031009854444,
6260
+ "grad_norm": 15.96875,
6261
+ "learning_rate": 9.949710894145952e-07,
6262
+ "loss": 25.1013,
6263
+ "step": 8900
6264
+ },
6265
+ {
6266
+ "epoch": 0.644426362896664,
6267
+ "grad_norm": 15.453125,
6268
+ "learning_rate": 9.949654389532634e-07,
6269
+ "loss": 24.9565,
6270
+ "step": 8910
6271
+ },
6272
+ {
6273
+ "epoch": 0.6451496248078835,
6274
+ "grad_norm": 15.7578125,
6275
+ "learning_rate": 9.949597884919314e-07,
6276
+ "loss": 24.6701,
6277
+ "step": 8920
6278
+ },
6279
+ {
6280
+ "epoch": 0.6458728867191031,
6281
+ "grad_norm": 16.9375,
6282
+ "learning_rate": 9.949541380305996e-07,
6283
+ "loss": 25.131,
6284
+ "step": 8930
6285
+ },
6286
+ {
6287
+ "epoch": 0.6465961486303228,
6288
+ "grad_norm": 16.625,
6289
+ "learning_rate": 9.949484875692676e-07,
6290
+ "loss": 25.0346,
6291
+ "step": 8940
6292
+ },
6293
+ {
6294
+ "epoch": 0.6473194105415424,
6295
+ "grad_norm": 16.046875,
6296
+ "learning_rate": 9.949428371079356e-07,
6297
+ "loss": 25.1957,
6298
+ "step": 8950
6299
+ },
6300
+ {
6301
+ "epoch": 0.6480426724527619,
6302
+ "grad_norm": 16.171875,
6303
+ "learning_rate": 9.949371866466038e-07,
6304
+ "loss": 25.2245,
6305
+ "step": 8960
6306
+ },
6307
+ {
6308
+ "epoch": 0.6487659343639816,
6309
+ "grad_norm": 16.125,
6310
+ "learning_rate": 9.949315361852718e-07,
6311
+ "loss": 25.3402,
6312
+ "step": 8970
6313
+ },
6314
+ {
6315
+ "epoch": 0.6494891962752012,
6316
+ "grad_norm": 16.046875,
6317
+ "learning_rate": 9.949258857239398e-07,
6318
+ "loss": 25.0492,
6319
+ "step": 8980
6320
+ },
6321
+ {
6322
+ "epoch": 0.6502124581864207,
6323
+ "grad_norm": 16.171875,
6324
+ "learning_rate": 9.94920235262608e-07,
6325
+ "loss": 25.3217,
6326
+ "step": 8990
6327
+ },
6328
+ {
6329
+ "epoch": 0.6509357200976403,
6330
+ "grad_norm": 15.8984375,
6331
+ "learning_rate": 9.949145848012762e-07,
6332
+ "loss": 25.2028,
6333
+ "step": 9000
6334
+ },
6335
+ {
6336
+ "epoch": 0.65165898200886,
6337
+ "grad_norm": 16.9375,
6338
+ "learning_rate": 9.949089343399442e-07,
6339
+ "loss": 25.19,
6340
+ "step": 9010
6341
+ },
6342
+ {
6343
+ "epoch": 0.6523822439200796,
6344
+ "grad_norm": 16.6875,
6345
+ "learning_rate": 9.949032838786122e-07,
6346
+ "loss": 25.143,
6347
+ "step": 9020
6348
+ },
6349
+ {
6350
+ "epoch": 0.6531055058312991,
6351
+ "grad_norm": 16.828125,
6352
+ "learning_rate": 9.948976334172802e-07,
6353
+ "loss": 25.1178,
6354
+ "step": 9030
6355
+ },
6356
+ {
6357
+ "epoch": 0.6538287677425187,
6358
+ "grad_norm": 16.9375,
6359
+ "learning_rate": 9.948919829559484e-07,
6360
+ "loss": 25.0101,
6361
+ "step": 9040
6362
+ },
6363
+ {
6364
+ "epoch": 0.6545520296537384,
6365
+ "grad_norm": 16.15625,
6366
+ "learning_rate": 9.948863324946166e-07,
6367
+ "loss": 25.3933,
6368
+ "step": 9050
6369
+ },
6370
+ {
6371
+ "epoch": 0.655275291564958,
6372
+ "grad_norm": 16.984375,
6373
+ "learning_rate": 9.948806820332846e-07,
6374
+ "loss": 24.9395,
6375
+ "step": 9060
6376
+ },
6377
+ {
6378
+ "epoch": 0.6559985534761775,
6379
+ "grad_norm": 16.53125,
6380
+ "learning_rate": 9.948750315719526e-07,
6381
+ "loss": 24.8895,
6382
+ "step": 9070
6383
+ },
6384
+ {
6385
+ "epoch": 0.6567218153873972,
6386
+ "grad_norm": 15.796875,
6387
+ "learning_rate": 9.948693811106206e-07,
6388
+ "loss": 24.8654,
6389
+ "step": 9080
6390
+ },
6391
+ {
6392
+ "epoch": 0.6574450772986168,
6393
+ "grad_norm": 15.6953125,
6394
+ "learning_rate": 9.948637306492888e-07,
6395
+ "loss": 24.8138,
6396
+ "step": 9090
6397
+ },
6398
+ {
6399
+ "epoch": 0.6581683392098364,
6400
+ "grad_norm": 16.171875,
6401
+ "learning_rate": 9.94858080187957e-07,
6402
+ "loss": 25.1842,
6403
+ "step": 9100
6404
+ },
6405
+ {
6406
+ "epoch": 0.6588916011210559,
6407
+ "grad_norm": 16.109375,
6408
+ "learning_rate": 9.94852429726625e-07,
6409
+ "loss": 24.8857,
6410
+ "step": 9110
6411
+ },
6412
+ {
6413
+ "epoch": 0.6596148630322756,
6414
+ "grad_norm": 16.234375,
6415
+ "learning_rate": 9.94846779265293e-07,
6416
+ "loss": 25.2401,
6417
+ "step": 9120
6418
+ },
6419
+ {
6420
+ "epoch": 0.6603381249434952,
6421
+ "grad_norm": 16.28125,
6422
+ "learning_rate": 9.948411288039612e-07,
6423
+ "loss": 24.8999,
6424
+ "step": 9130
6425
+ },
6426
+ {
6427
+ "epoch": 0.6610613868547147,
6428
+ "grad_norm": 16.40625,
6429
+ "learning_rate": 9.948354783426292e-07,
6430
+ "loss": 24.8383,
6431
+ "step": 9140
6432
+ },
6433
+ {
6434
+ "epoch": 0.6617846487659343,
6435
+ "grad_norm": 16.53125,
6436
+ "learning_rate": 9.948298278812974e-07,
6437
+ "loss": 24.9992,
6438
+ "step": 9150
6439
+ },
6440
+ {
6441
+ "epoch": 0.662507910677154,
6442
+ "grad_norm": 15.8828125,
6443
+ "learning_rate": 9.948241774199654e-07,
6444
+ "loss": 24.7903,
6445
+ "step": 9160
6446
+ },
6447
+ {
6448
+ "epoch": 0.6632311725883736,
6449
+ "grad_norm": 16.265625,
6450
+ "learning_rate": 9.948185269586334e-07,
6451
+ "loss": 25.0166,
6452
+ "step": 9170
6453
+ },
6454
+ {
6455
+ "epoch": 0.6639544344995931,
6456
+ "grad_norm": 16.109375,
6457
+ "learning_rate": 9.948128764973016e-07,
6458
+ "loss": 25.1261,
6459
+ "step": 9180
6460
+ },
6461
+ {
6462
+ "epoch": 0.6646776964108128,
6463
+ "grad_norm": 15.8046875,
6464
+ "learning_rate": 9.948072260359696e-07,
6465
+ "loss": 25.2559,
6466
+ "step": 9190
6467
+ },
6468
+ {
6469
+ "epoch": 0.6654009583220324,
6470
+ "grad_norm": 16.703125,
6471
+ "learning_rate": 9.948015755746378e-07,
6472
+ "loss": 24.8266,
6473
+ "step": 9200
6474
+ },
6475
+ {
6476
+ "epoch": 0.666124220233252,
6477
+ "grad_norm": 16.203125,
6478
+ "learning_rate": 9.947959251133058e-07,
6479
+ "loss": 25.231,
6480
+ "step": 9210
6481
+ },
6482
+ {
6483
+ "epoch": 0.6668474821444715,
6484
+ "grad_norm": 16.53125,
6485
+ "learning_rate": 9.94790274651974e-07,
6486
+ "loss": 25.3356,
6487
+ "step": 9220
6488
+ },
6489
+ {
6490
+ "epoch": 0.6675707440556912,
6491
+ "grad_norm": 15.234375,
6492
+ "learning_rate": 9.94784624190642e-07,
6493
+ "loss": 25.1178,
6494
+ "step": 9230
6495
+ },
6496
+ {
6497
+ "epoch": 0.6682940059669108,
6498
+ "grad_norm": 16.171875,
6499
+ "learning_rate": 9.9477897372931e-07,
6500
+ "loss": 25.1904,
6501
+ "step": 9240
6502
+ },
6503
+ {
6504
+ "epoch": 0.6690172678781303,
6505
+ "grad_norm": 15.515625,
6506
+ "learning_rate": 9.947733232679782e-07,
6507
+ "loss": 25.057,
6508
+ "step": 9250
6509
+ },
6510
+ {
6511
+ "epoch": 0.66974052978935,
6512
+ "grad_norm": 16.9375,
6513
+ "learning_rate": 9.947676728066462e-07,
6514
+ "loss": 25.0339,
6515
+ "step": 9260
6516
+ },
6517
+ {
6518
+ "epoch": 0.6704637917005696,
6519
+ "grad_norm": 16.0625,
6520
+ "learning_rate": 9.947620223453144e-07,
6521
+ "loss": 25.0058,
6522
+ "step": 9270
6523
+ },
6524
+ {
6525
+ "epoch": 0.6711870536117892,
6526
+ "grad_norm": 17.0625,
6527
+ "learning_rate": 9.947563718839824e-07,
6528
+ "loss": 24.9719,
6529
+ "step": 9280
6530
+ },
6531
+ {
6532
+ "epoch": 0.6719103155230087,
6533
+ "grad_norm": 15.65625,
6534
+ "learning_rate": 9.947507214226504e-07,
6535
+ "loss": 25.0308,
6536
+ "step": 9290
6537
+ },
6538
+ {
6539
+ "epoch": 0.6726335774342284,
6540
+ "grad_norm": 16.03125,
6541
+ "learning_rate": 9.947450709613186e-07,
6542
+ "loss": 25.2972,
6543
+ "step": 9300
6544
+ },
6545
+ {
6546
+ "epoch": 0.673356839345448,
6547
+ "grad_norm": 15.671875,
6548
+ "learning_rate": 9.947394204999866e-07,
6549
+ "loss": 25.0732,
6550
+ "step": 9310
6551
+ },
6552
+ {
6553
+ "epoch": 0.6740801012566676,
6554
+ "grad_norm": 16.234375,
6555
+ "learning_rate": 9.947337700386549e-07,
6556
+ "loss": 25.0754,
6557
+ "step": 9320
6558
+ },
6559
+ {
6560
+ "epoch": 0.6748033631678871,
6561
+ "grad_norm": 15.2578125,
6562
+ "learning_rate": 9.947281195773229e-07,
6563
+ "loss": 25.5137,
6564
+ "step": 9330
6565
+ },
6566
+ {
6567
+ "epoch": 0.6755266250791068,
6568
+ "grad_norm": 15.859375,
6569
+ "learning_rate": 9.947224691159909e-07,
6570
+ "loss": 25.1258,
6571
+ "step": 9340
6572
+ },
6573
+ {
6574
+ "epoch": 0.6762498869903264,
6575
+ "grad_norm": 16.65625,
6576
+ "learning_rate": 9.94716818654659e-07,
6577
+ "loss": 25.1174,
6578
+ "step": 9350
6579
+ },
6580
+ {
6581
+ "epoch": 0.6769731489015459,
6582
+ "grad_norm": 16.25,
6583
+ "learning_rate": 9.94711168193327e-07,
6584
+ "loss": 25.1364,
6585
+ "step": 9360
6586
+ },
6587
+ {
6588
+ "epoch": 0.6776964108127655,
6589
+ "grad_norm": 15.8359375,
6590
+ "learning_rate": 9.947055177319953e-07,
6591
+ "loss": 25.13,
6592
+ "step": 9370
6593
+ },
6594
+ {
6595
+ "epoch": 0.6784196727239852,
6596
+ "grad_norm": 17.828125,
6597
+ "learning_rate": 9.946998672706633e-07,
6598
+ "loss": 25.317,
6599
+ "step": 9380
6600
+ },
6601
+ {
6602
+ "epoch": 0.6791429346352048,
6603
+ "grad_norm": 15.5703125,
6604
+ "learning_rate": 9.946942168093313e-07,
6605
+ "loss": 25.0771,
6606
+ "step": 9390
6607
+ },
6608
+ {
6609
+ "epoch": 0.6798661965464243,
6610
+ "grad_norm": 16.3125,
6611
+ "learning_rate": 9.946885663479995e-07,
6612
+ "loss": 25.558,
6613
+ "step": 9400
6614
+ },
6615
+ {
6616
+ "epoch": 0.680589458457644,
6617
+ "grad_norm": 15.7265625,
6618
+ "learning_rate": 9.946829158866675e-07,
6619
+ "loss": 24.8281,
6620
+ "step": 9410
6621
+ },
6622
+ {
6623
+ "epoch": 0.6813127203688636,
6624
+ "grad_norm": 17.03125,
6625
+ "learning_rate": 9.946772654253357e-07,
6626
+ "loss": 24.8212,
6627
+ "step": 9420
6628
+ },
6629
+ {
6630
+ "epoch": 0.6820359822800832,
6631
+ "grad_norm": 16.296875,
6632
+ "learning_rate": 9.946716149640037e-07,
6633
+ "loss": 25.2519,
6634
+ "step": 9430
6635
+ },
6636
+ {
6637
+ "epoch": 0.6827592441913027,
6638
+ "grad_norm": 16.328125,
6639
+ "learning_rate": 9.946659645026719e-07,
6640
+ "loss": 25.086,
6641
+ "step": 9440
6642
+ },
6643
+ {
6644
+ "epoch": 0.6834825061025224,
6645
+ "grad_norm": 16.34375,
6646
+ "learning_rate": 9.946603140413399e-07,
6647
+ "loss": 25.1416,
6648
+ "step": 9450
6649
+ },
6650
+ {
6651
+ "epoch": 0.684205768013742,
6652
+ "grad_norm": 15.1328125,
6653
+ "learning_rate": 9.946546635800079e-07,
6654
+ "loss": 25.0502,
6655
+ "step": 9460
6656
+ },
6657
+ {
6658
+ "epoch": 0.6849290299249616,
6659
+ "grad_norm": 16.140625,
6660
+ "learning_rate": 9.94649013118676e-07,
6661
+ "loss": 25.018,
6662
+ "step": 9470
6663
+ },
6664
+ {
6665
+ "epoch": 0.6856522918361811,
6666
+ "grad_norm": 16.359375,
6667
+ "learning_rate": 9.94643362657344e-07,
6668
+ "loss": 24.6714,
6669
+ "step": 9480
6670
+ },
6671
+ {
6672
+ "epoch": 0.6863755537474008,
6673
+ "grad_norm": 16.03125,
6674
+ "learning_rate": 9.946377121960123e-07,
6675
+ "loss": 25.0868,
6676
+ "step": 9490
6677
+ },
6678
+ {
6679
+ "epoch": 0.6870988156586204,
6680
+ "grad_norm": 15.8515625,
6681
+ "learning_rate": 9.946320617346803e-07,
6682
+ "loss": 25.0213,
6683
+ "step": 9500
6684
+ },
6685
+ {
6686
+ "epoch": 0.6878220775698399,
6687
+ "grad_norm": 17.421875,
6688
+ "learning_rate": 9.946264112733483e-07,
6689
+ "loss": 25.2668,
6690
+ "step": 9510
6691
+ },
6692
+ {
6693
+ "epoch": 0.6885453394810596,
6694
+ "grad_norm": 16.703125,
6695
+ "learning_rate": 9.946207608120165e-07,
6696
+ "loss": 25.1576,
6697
+ "step": 9520
6698
+ },
6699
+ {
6700
+ "epoch": 0.6892686013922792,
6701
+ "grad_norm": 16.453125,
6702
+ "learning_rate": 9.946151103506845e-07,
6703
+ "loss": 25.0762,
6704
+ "step": 9530
6705
+ },
6706
+ {
6707
+ "epoch": 0.6899918633034988,
6708
+ "grad_norm": 15.5859375,
6709
+ "learning_rate": 9.946094598893527e-07,
6710
+ "loss": 24.9267,
6711
+ "step": 9540
6712
+ },
6713
+ {
6714
+ "epoch": 0.6907151252147183,
6715
+ "grad_norm": 17.0625,
6716
+ "learning_rate": 9.946038094280207e-07,
6717
+ "loss": 25.3065,
6718
+ "step": 9550
6719
+ },
6720
+ {
6721
+ "epoch": 0.691438387125938,
6722
+ "grad_norm": 17.46875,
6723
+ "learning_rate": 9.945981589666887e-07,
6724
+ "loss": 24.975,
6725
+ "step": 9560
6726
+ },
6727
+ {
6728
+ "epoch": 0.6921616490371576,
6729
+ "grad_norm": 17.625,
6730
+ "learning_rate": 9.94592508505357e-07,
6731
+ "loss": 25.006,
6732
+ "step": 9570
6733
+ },
6734
+ {
6735
+ "epoch": 0.6928849109483772,
6736
+ "grad_norm": 15.5546875,
6737
+ "learning_rate": 9.94586858044025e-07,
6738
+ "loss": 25.3168,
6739
+ "step": 9580
6740
+ },
6741
+ {
6742
+ "epoch": 0.6936081728595967,
6743
+ "grad_norm": 17.15625,
6744
+ "learning_rate": 9.945812075826931e-07,
6745
+ "loss": 25.2658,
6746
+ "step": 9590
6747
+ },
6748
+ {
6749
+ "epoch": 0.6943314347708164,
6750
+ "grad_norm": 16.90625,
6751
+ "learning_rate": 9.945755571213611e-07,
6752
+ "loss": 25.3412,
6753
+ "step": 9600
6754
+ },
6755
+ {
6756
+ "epoch": 0.695054696682036,
6757
+ "grad_norm": 17.359375,
6758
+ "learning_rate": 9.945699066600293e-07,
6759
+ "loss": 24.7888,
6760
+ "step": 9610
6761
+ },
6762
+ {
6763
+ "epoch": 0.6957779585932555,
6764
+ "grad_norm": 15.4296875,
6765
+ "learning_rate": 9.945642561986973e-07,
6766
+ "loss": 24.7344,
6767
+ "step": 9620
6768
+ },
6769
+ {
6770
+ "epoch": 0.6965012205044752,
6771
+ "grad_norm": 15.84375,
6772
+ "learning_rate": 9.945586057373653e-07,
6773
+ "loss": 24.9273,
6774
+ "step": 9630
6775
+ },
6776
+ {
6777
+ "epoch": 0.6972244824156948,
6778
+ "grad_norm": 15.859375,
6779
+ "learning_rate": 9.945529552760335e-07,
6780
+ "loss": 25.0604,
6781
+ "step": 9640
6782
+ },
6783
+ {
6784
+ "epoch": 0.6979477443269144,
6785
+ "grad_norm": 15.5078125,
6786
+ "learning_rate": 9.945473048147015e-07,
6787
+ "loss": 25.0373,
6788
+ "step": 9650
6789
+ },
6790
+ {
6791
+ "epoch": 0.6986710062381339,
6792
+ "grad_norm": 16.734375,
6793
+ "learning_rate": 9.945416543533697e-07,
6794
+ "loss": 24.89,
6795
+ "step": 9660
6796
+ },
6797
+ {
6798
+ "epoch": 0.6993942681493536,
6799
+ "grad_norm": 16.984375,
6800
+ "learning_rate": 9.945360038920377e-07,
6801
+ "loss": 25.5452,
6802
+ "step": 9670
6803
+ },
6804
+ {
6805
+ "epoch": 0.7001175300605732,
6806
+ "grad_norm": 15.609375,
6807
+ "learning_rate": 9.945303534307057e-07,
6808
+ "loss": 25.3505,
6809
+ "step": 9680
6810
+ },
6811
+ {
6812
+ "epoch": 0.7008407919717928,
6813
+ "grad_norm": 16.140625,
6814
+ "learning_rate": 9.94524702969374e-07,
6815
+ "loss": 25.1811,
6816
+ "step": 9690
6817
+ },
6818
+ {
6819
+ "epoch": 0.7015640538830124,
6820
+ "grad_norm": 17.21875,
6821
+ "learning_rate": 9.94519052508042e-07,
6822
+ "loss": 25.2871,
6823
+ "step": 9700
6824
+ },
6825
+ {
6826
+ "epoch": 0.702287315794232,
6827
+ "grad_norm": 17.234375,
6828
+ "learning_rate": 9.945134020467101e-07,
6829
+ "loss": 25.123,
6830
+ "step": 9710
6831
+ },
6832
+ {
6833
+ "epoch": 0.7030105777054516,
6834
+ "grad_norm": 16.578125,
6835
+ "learning_rate": 9.945077515853781e-07,
6836
+ "loss": 25.1046,
6837
+ "step": 9720
6838
+ },
6839
+ {
6840
+ "epoch": 0.7037338396166712,
6841
+ "grad_norm": 17.0625,
6842
+ "learning_rate": 9.945021011240461e-07,
6843
+ "loss": 24.8252,
6844
+ "step": 9730
6845
+ },
6846
+ {
6847
+ "epoch": 0.7044571015278908,
6848
+ "grad_norm": 15.671875,
6849
+ "learning_rate": 9.944964506627143e-07,
6850
+ "loss": 25.0734,
6851
+ "step": 9740
6852
+ },
6853
+ {
6854
+ "epoch": 0.7051803634391104,
6855
+ "grad_norm": 16.546875,
6856
+ "learning_rate": 9.944908002013823e-07,
6857
+ "loss": 25.0185,
6858
+ "step": 9750
6859
+ },
6860
+ {
6861
+ "epoch": 0.70590362535033,
6862
+ "grad_norm": 16.1875,
6863
+ "learning_rate": 9.944851497400506e-07,
6864
+ "loss": 25.2923,
6865
+ "step": 9760
6866
+ },
6867
+ {
6868
+ "epoch": 0.7066268872615495,
6869
+ "grad_norm": 15.984375,
6870
+ "learning_rate": 9.944794992787186e-07,
6871
+ "loss": 24.9048,
6872
+ "step": 9770
6873
+ },
6874
+ {
6875
+ "epoch": 0.7073501491727692,
6876
+ "grad_norm": 15.90625,
6877
+ "learning_rate": 9.944738488173865e-07,
6878
+ "loss": 25.2429,
6879
+ "step": 9780
6880
+ },
6881
+ {
6882
+ "epoch": 0.7080734110839888,
6883
+ "grad_norm": 16.390625,
6884
+ "learning_rate": 9.944681983560548e-07,
6885
+ "loss": 24.9763,
6886
+ "step": 9790
6887
+ },
6888
+ {
6889
+ "epoch": 0.7087966729952084,
6890
+ "grad_norm": 17.609375,
6891
+ "learning_rate": 9.944625478947228e-07,
6892
+ "loss": 24.7899,
6893
+ "step": 9800
6894
+ },
6895
+ {
6896
+ "epoch": 0.709519934906428,
6897
+ "grad_norm": 15.5234375,
6898
+ "learning_rate": 9.94456897433391e-07,
6899
+ "loss": 25.1678,
6900
+ "step": 9810
6901
+ },
6902
+ {
6903
+ "epoch": 0.7102431968176476,
6904
+ "grad_norm": 16.296875,
6905
+ "learning_rate": 9.94451246972059e-07,
6906
+ "loss": 24.7197,
6907
+ "step": 9820
6908
+ },
6909
+ {
6910
+ "epoch": 0.7109664587288672,
6911
+ "grad_norm": 16.921875,
6912
+ "learning_rate": 9.944455965107272e-07,
6913
+ "loss": 25.1769,
6914
+ "step": 9830
6915
+ },
6916
+ {
6917
+ "epoch": 0.7116897206400868,
6918
+ "grad_norm": 15.6796875,
6919
+ "learning_rate": 9.944399460493952e-07,
6920
+ "loss": 24.9675,
6921
+ "step": 9840
6922
+ },
6923
+ {
6924
+ "epoch": 0.7124129825513064,
6925
+ "grad_norm": 16.5625,
6926
+ "learning_rate": 9.944342955880632e-07,
6927
+ "loss": 25.0993,
6928
+ "step": 9850
6929
+ },
6930
+ {
6931
+ "epoch": 0.713136244462526,
6932
+ "grad_norm": 16.203125,
6933
+ "learning_rate": 9.944286451267314e-07,
6934
+ "loss": 24.9978,
6935
+ "step": 9860
6936
+ },
6937
+ {
6938
+ "epoch": 0.7138595063737456,
6939
+ "grad_norm": 16.78125,
6940
+ "learning_rate": 9.944229946653994e-07,
6941
+ "loss": 24.8196,
6942
+ "step": 9870
6943
+ },
6944
+ {
6945
+ "epoch": 0.7145827682849651,
6946
+ "grad_norm": 16.140625,
6947
+ "learning_rate": 9.944173442040676e-07,
6948
+ "loss": 25.2091,
6949
+ "step": 9880
6950
+ },
6951
+ {
6952
+ "epoch": 0.7153060301961848,
6953
+ "grad_norm": 16.484375,
6954
+ "learning_rate": 9.944116937427356e-07,
6955
+ "loss": 25.1268,
6956
+ "step": 9890
6957
+ },
6958
+ {
6959
+ "epoch": 0.7160292921074044,
6960
+ "grad_norm": 16.171875,
6961
+ "learning_rate": 9.944060432814036e-07,
6962
+ "loss": 25.0055,
6963
+ "step": 9900
6964
+ },
6965
+ {
6966
+ "epoch": 0.716752554018624,
6967
+ "grad_norm": 16.203125,
6968
+ "learning_rate": 9.944003928200718e-07,
6969
+ "loss": 25.0594,
6970
+ "step": 9910
6971
+ },
6972
+ {
6973
+ "epoch": 0.7174758159298436,
6974
+ "grad_norm": 16.78125,
6975
+ "learning_rate": 9.943947423587398e-07,
6976
+ "loss": 25.3482,
6977
+ "step": 9920
6978
+ },
6979
+ {
6980
+ "epoch": 0.7181990778410632,
6981
+ "grad_norm": 16.03125,
6982
+ "learning_rate": 9.94389091897408e-07,
6983
+ "loss": 25.1836,
6984
+ "step": 9930
6985
+ },
6986
+ {
6987
+ "epoch": 0.7189223397522828,
6988
+ "grad_norm": 16.609375,
6989
+ "learning_rate": 9.94383441436076e-07,
6990
+ "loss": 25.2864,
6991
+ "step": 9940
6992
+ },
6993
+ {
6994
+ "epoch": 0.7196456016635024,
6995
+ "grad_norm": 16.046875,
6996
+ "learning_rate": 9.94377790974744e-07,
6997
+ "loss": 25.058,
6998
+ "step": 9950
6999
+ },
7000
+ {
7001
+ "epoch": 0.720368863574722,
7002
+ "grad_norm": 17.21875,
7003
+ "learning_rate": 9.943721405134122e-07,
7004
+ "loss": 24.9827,
7005
+ "step": 9960
7006
+ },
7007
+ {
7008
+ "epoch": 0.7210921254859416,
7009
+ "grad_norm": 17.34375,
7010
+ "learning_rate": 9.943664900520802e-07,
7011
+ "loss": 25.0163,
7012
+ "step": 9970
7013
+ },
7014
+ {
7015
+ "epoch": 0.7218153873971612,
7016
+ "grad_norm": 16.109375,
7017
+ "learning_rate": 9.943608395907484e-07,
7018
+ "loss": 25.2643,
7019
+ "step": 9980
7020
+ },
7021
+ {
7022
+ "epoch": 0.7225386493083807,
7023
+ "grad_norm": 16.390625,
7024
+ "learning_rate": 9.943551891294164e-07,
7025
+ "loss": 25.2327,
7026
+ "step": 9990
7027
+ },
7028
+ {
7029
+ "epoch": 0.7232619112196004,
7030
+ "grad_norm": 16.8125,
7031
+ "learning_rate": 9.943495386680844e-07,
7032
+ "loss": 24.98,
7033
+ "step": 10000
7034
+ },
7035
+ {
7036
+ "epoch": 0.7232619112196004,
7037
+ "eval_loss": 1.5873770713806152,
7038
+ "eval_runtime": 366.777,
7039
+ "eval_samples_per_second": 1979.565,
7040
+ "eval_steps_per_second": 30.932,
7041
+ "step": 10000
7042
  }
7043
  ],
7044
  "logging_steps": 10,
 
7058
  "attributes": {}
7059
  }
7060
  },
7061
+ "total_flos": 1.3969285567217664e+19,
7062
  "train_batch_size": 16,
7063
  "trial_name": null,
7064
  "trial_params": null