CocoRoF commited on
Commit
a166413
·
verified ·
1 Parent(s): e8dfede

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f673b823be36f3eb97a0d1d83dac231599658a1bf8bcffea23f67cafcc109b48
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:203b87ab4b155942f182a48c96314677708618d17d587d13aa6a90c599683112
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe2f2bef345acda275dcd6003cb182f7f1e032680f3a4edeefc1b29e48691847
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29fe896ba1f2f6a44409fa7b844c95428ca05929de3bae3b32438d62e2c721df
3
  size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c461c9d337dfc684e9352ec72bfa344e2f5d377f7cfc4475de9acae294dca89
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34fccfe9285534cc1461fe7c44d5b307057be9ab740b9c40800411e6c74ade04
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deec0367b7616485abab4c5d341abba8ad8319ff1b2f118048701f6b48b69fb6
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.3616309556098002,
5
  "eval_steps": 2500,
6
- "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3523,6 +3523,1764 @@
3523
  "eval_samples_per_second": 1930.72,
3524
  "eval_steps_per_second": 30.168,
3525
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3526
  }
3527
  ],
3528
  "logging_steps": 10,
@@ -3542,7 +5300,7 @@
3542
  "attributes": {}
3543
  }
3544
  },
3545
- "total_flos": 6.984642783608832e+18,
3546
  "train_batch_size": 16,
3547
  "trial_name": null,
3548
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5424464334147003,
5
  "eval_steps": 2500,
6
+ "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3523
  "eval_samples_per_second": 1930.72,
3524
  "eval_steps_per_second": 30.168,
3525
  "step": 5000
3526
+ },
3527
+ {
3528
+ "epoch": 0.3623542175210198,
3529
+ "grad_norm": 16.78125,
3530
+ "learning_rate": 9.971691188727103e-07,
3531
+ "loss": 25.2169,
3532
+ "step": 5010
3533
+ },
3534
+ {
3535
+ "epoch": 0.3630774794322394,
3536
+ "grad_norm": 15.9765625,
3537
+ "learning_rate": 9.971634684113785e-07,
3538
+ "loss": 25.2699,
3539
+ "step": 5020
3540
+ },
3541
+ {
3542
+ "epoch": 0.363800741343459,
3543
+ "grad_norm": 16.3125,
3544
+ "learning_rate": 9.971578179500465e-07,
3545
+ "loss": 25.0431,
3546
+ "step": 5030
3547
+ },
3548
+ {
3549
+ "epoch": 0.3645240032546786,
3550
+ "grad_norm": 16.609375,
3551
+ "learning_rate": 9.971521674887145e-07,
3552
+ "loss": 25.1181,
3553
+ "step": 5040
3554
+ },
3555
+ {
3556
+ "epoch": 0.3652472651658982,
3557
+ "grad_norm": 16.421875,
3558
+ "learning_rate": 9.971465170273827e-07,
3559
+ "loss": 25.0253,
3560
+ "step": 5050
3561
+ },
3562
+ {
3563
+ "epoch": 0.3659705270771178,
3564
+ "grad_norm": 16.171875,
3565
+ "learning_rate": 9.971408665660507e-07,
3566
+ "loss": 25.1289,
3567
+ "step": 5060
3568
+ },
3569
+ {
3570
+ "epoch": 0.3666937889883374,
3571
+ "grad_norm": 16.484375,
3572
+ "learning_rate": 9.97135216104719e-07,
3573
+ "loss": 25.298,
3574
+ "step": 5070
3575
+ },
3576
+ {
3577
+ "epoch": 0.367417050899557,
3578
+ "grad_norm": 15.8984375,
3579
+ "learning_rate": 9.97129565643387e-07,
3580
+ "loss": 24.8407,
3581
+ "step": 5080
3582
+ },
3583
+ {
3584
+ "epoch": 0.3681403128107766,
3585
+ "grad_norm": 16.421875,
3586
+ "learning_rate": 9.971239151820549e-07,
3587
+ "loss": 24.7991,
3588
+ "step": 5090
3589
+ },
3590
+ {
3591
+ "epoch": 0.3688635747219962,
3592
+ "grad_norm": 17.015625,
3593
+ "learning_rate": 9.97118264720723e-07,
3594
+ "loss": 25.4768,
3595
+ "step": 5100
3596
+ },
3597
+ {
3598
+ "epoch": 0.3695868366332158,
3599
+ "grad_norm": 16.15625,
3600
+ "learning_rate": 9.97112614259391e-07,
3601
+ "loss": 25.0624,
3602
+ "step": 5110
3603
+ },
3604
+ {
3605
+ "epoch": 0.3703100985444354,
3606
+ "grad_norm": 15.9765625,
3607
+ "learning_rate": 9.971069637980593e-07,
3608
+ "loss": 24.9802,
3609
+ "step": 5120
3610
+ },
3611
+ {
3612
+ "epoch": 0.371033360455655,
3613
+ "grad_norm": 16.21875,
3614
+ "learning_rate": 9.971013133367273e-07,
3615
+ "loss": 25.2557,
3616
+ "step": 5130
3617
+ },
3618
+ {
3619
+ "epoch": 0.3717566223668746,
3620
+ "grad_norm": 15.625,
3621
+ "learning_rate": 9.970956628753953e-07,
3622
+ "loss": 25.0409,
3623
+ "step": 5140
3624
+ },
3625
+ {
3626
+ "epoch": 0.3724798842780942,
3627
+ "grad_norm": 16.0625,
3628
+ "learning_rate": 9.970900124140635e-07,
3629
+ "loss": 24.8528,
3630
+ "step": 5150
3631
+ },
3632
+ {
3633
+ "epoch": 0.37320314618931383,
3634
+ "grad_norm": 17.171875,
3635
+ "learning_rate": 9.970843619527315e-07,
3636
+ "loss": 25.2391,
3637
+ "step": 5160
3638
+ },
3639
+ {
3640
+ "epoch": 0.3739264081005334,
3641
+ "grad_norm": 15.7265625,
3642
+ "learning_rate": 9.970787114913997e-07,
3643
+ "loss": 25.1315,
3644
+ "step": 5170
3645
+ },
3646
+ {
3647
+ "epoch": 0.37464967001175303,
3648
+ "grad_norm": 15.890625,
3649
+ "learning_rate": 9.970730610300677e-07,
3650
+ "loss": 24.908,
3651
+ "step": 5180
3652
+ },
3653
+ {
3654
+ "epoch": 0.3753729319229726,
3655
+ "grad_norm": 16.203125,
3656
+ "learning_rate": 9.97067410568736e-07,
3657
+ "loss": 24.8777,
3658
+ "step": 5190
3659
+ },
3660
+ {
3661
+ "epoch": 0.3760961938341922,
3662
+ "grad_norm": 16.265625,
3663
+ "learning_rate": 9.97061760107404e-07,
3664
+ "loss": 24.7149,
3665
+ "step": 5200
3666
+ },
3667
+ {
3668
+ "epoch": 0.3768194557454118,
3669
+ "grad_norm": 16.140625,
3670
+ "learning_rate": 9.97056109646072e-07,
3671
+ "loss": 25.073,
3672
+ "step": 5210
3673
+ },
3674
+ {
3675
+ "epoch": 0.3775427176566314,
3676
+ "grad_norm": 16.078125,
3677
+ "learning_rate": 9.970504591847401e-07,
3678
+ "loss": 25.1022,
3679
+ "step": 5220
3680
+ },
3681
+ {
3682
+ "epoch": 0.378265979567851,
3683
+ "grad_norm": 15.765625,
3684
+ "learning_rate": 9.970448087234081e-07,
3685
+ "loss": 25.1426,
3686
+ "step": 5230
3687
+ },
3688
+ {
3689
+ "epoch": 0.3789892414790706,
3690
+ "grad_norm": 15.296875,
3691
+ "learning_rate": 9.970391582620763e-07,
3692
+ "loss": 25.0163,
3693
+ "step": 5240
3694
+ },
3695
+ {
3696
+ "epoch": 0.3797125033902902,
3697
+ "grad_norm": 17.453125,
3698
+ "learning_rate": 9.970335078007443e-07,
3699
+ "loss": 25.334,
3700
+ "step": 5250
3701
+ },
3702
+ {
3703
+ "epoch": 0.3804357653015098,
3704
+ "grad_norm": 15.5703125,
3705
+ "learning_rate": 9.970278573394123e-07,
3706
+ "loss": 24.9564,
3707
+ "step": 5260
3708
+ },
3709
+ {
3710
+ "epoch": 0.38115902721272943,
3711
+ "grad_norm": 15.78125,
3712
+ "learning_rate": 9.970222068780805e-07,
3713
+ "loss": 25.264,
3714
+ "step": 5270
3715
+ },
3716
+ {
3717
+ "epoch": 0.381882289123949,
3718
+ "grad_norm": 16.109375,
3719
+ "learning_rate": 9.970165564167485e-07,
3720
+ "loss": 25.3679,
3721
+ "step": 5280
3722
+ },
3723
+ {
3724
+ "epoch": 0.38260555103516863,
3725
+ "grad_norm": 16.3125,
3726
+ "learning_rate": 9.970109059554167e-07,
3727
+ "loss": 25.1622,
3728
+ "step": 5290
3729
+ },
3730
+ {
3731
+ "epoch": 0.3833288129463882,
3732
+ "grad_norm": 15.5703125,
3733
+ "learning_rate": 9.970052554940847e-07,
3734
+ "loss": 25.0249,
3735
+ "step": 5300
3736
+ },
3737
+ {
3738
+ "epoch": 0.3840520748576078,
3739
+ "grad_norm": 16.453125,
3740
+ "learning_rate": 9.969996050327527e-07,
3741
+ "loss": 25.1322,
3742
+ "step": 5310
3743
+ },
3744
+ {
3745
+ "epoch": 0.3847753367688274,
3746
+ "grad_norm": 17.53125,
3747
+ "learning_rate": 9.96993954571421e-07,
3748
+ "loss": 25.4187,
3749
+ "step": 5320
3750
+ },
3751
+ {
3752
+ "epoch": 0.385498598680047,
3753
+ "grad_norm": 16.96875,
3754
+ "learning_rate": 9.96988304110089e-07,
3755
+ "loss": 24.8692,
3756
+ "step": 5330
3757
+ },
3758
+ {
3759
+ "epoch": 0.3862218605912666,
3760
+ "grad_norm": 14.84375,
3761
+ "learning_rate": 9.969826536487572e-07,
3762
+ "loss": 25.1561,
3763
+ "step": 5340
3764
+ },
3765
+ {
3766
+ "epoch": 0.3869451225024862,
3767
+ "grad_norm": 16.34375,
3768
+ "learning_rate": 9.969770031874252e-07,
3769
+ "loss": 25.4448,
3770
+ "step": 5350
3771
+ },
3772
+ {
3773
+ "epoch": 0.3876683844137058,
3774
+ "grad_norm": 17.515625,
3775
+ "learning_rate": 9.969713527260932e-07,
3776
+ "loss": 25.3175,
3777
+ "step": 5360
3778
+ },
3779
+ {
3780
+ "epoch": 0.3883916463249254,
3781
+ "grad_norm": 16.421875,
3782
+ "learning_rate": 9.969657022647614e-07,
3783
+ "loss": 24.7974,
3784
+ "step": 5370
3785
+ },
3786
+ {
3787
+ "epoch": 0.38911490823614503,
3788
+ "grad_norm": 17.34375,
3789
+ "learning_rate": 9.969600518034294e-07,
3790
+ "loss": 24.831,
3791
+ "step": 5380
3792
+ },
3793
+ {
3794
+ "epoch": 0.3898381701473646,
3795
+ "grad_norm": 16.34375,
3796
+ "learning_rate": 9.969544013420976e-07,
3797
+ "loss": 25.1299,
3798
+ "step": 5390
3799
+ },
3800
+ {
3801
+ "epoch": 0.39056143205858423,
3802
+ "grad_norm": 17.765625,
3803
+ "learning_rate": 9.969487508807656e-07,
3804
+ "loss": 24.9919,
3805
+ "step": 5400
3806
+ },
3807
+ {
3808
+ "epoch": 0.3912846939698038,
3809
+ "grad_norm": 15.75,
3810
+ "learning_rate": 9.969431004194338e-07,
3811
+ "loss": 25.1571,
3812
+ "step": 5410
3813
+ },
3814
+ {
3815
+ "epoch": 0.39200795588102344,
3816
+ "grad_norm": 15.3671875,
3817
+ "learning_rate": 9.969374499581018e-07,
3818
+ "loss": 25.1051,
3819
+ "step": 5420
3820
+ },
3821
+ {
3822
+ "epoch": 0.392731217792243,
3823
+ "grad_norm": 15.71875,
3824
+ "learning_rate": 9.969317994967698e-07,
3825
+ "loss": 24.9415,
3826
+ "step": 5430
3827
+ },
3828
+ {
3829
+ "epoch": 0.3934544797034626,
3830
+ "grad_norm": 16.109375,
3831
+ "learning_rate": 9.96926149035438e-07,
3832
+ "loss": 25.4558,
3833
+ "step": 5440
3834
+ },
3835
+ {
3836
+ "epoch": 0.3941777416146822,
3837
+ "grad_norm": 16.5,
3838
+ "learning_rate": 9.96920498574106e-07,
3839
+ "loss": 25.084,
3840
+ "step": 5450
3841
+ },
3842
+ {
3843
+ "epoch": 0.3949010035259018,
3844
+ "grad_norm": 17.625,
3845
+ "learning_rate": 9.969148481127742e-07,
3846
+ "loss": 24.7463,
3847
+ "step": 5460
3848
+ },
3849
+ {
3850
+ "epoch": 0.3956242654371214,
3851
+ "grad_norm": 16.234375,
3852
+ "learning_rate": 9.969091976514422e-07,
3853
+ "loss": 24.7555,
3854
+ "step": 5470
3855
+ },
3856
+ {
3857
+ "epoch": 0.396347527348341,
3858
+ "grad_norm": 15.7421875,
3859
+ "learning_rate": 9.969035471901102e-07,
3860
+ "loss": 25.408,
3861
+ "step": 5480
3862
+ },
3863
+ {
3864
+ "epoch": 0.39707078925956063,
3865
+ "grad_norm": 16.859375,
3866
+ "learning_rate": 9.968978967287784e-07,
3867
+ "loss": 25.1242,
3868
+ "step": 5490
3869
+ },
3870
+ {
3871
+ "epoch": 0.3977940511707802,
3872
+ "grad_norm": 16.78125,
3873
+ "learning_rate": 9.968922462674466e-07,
3874
+ "loss": 25.2741,
3875
+ "step": 5500
3876
+ },
3877
+ {
3878
+ "epoch": 0.39851731308199984,
3879
+ "grad_norm": 15.078125,
3880
+ "learning_rate": 9.968865958061146e-07,
3881
+ "loss": 24.884,
3882
+ "step": 5510
3883
+ },
3884
+ {
3885
+ "epoch": 0.3992405749932194,
3886
+ "grad_norm": 16.421875,
3887
+ "learning_rate": 9.968809453447826e-07,
3888
+ "loss": 25.2885,
3889
+ "step": 5520
3890
+ },
3891
+ {
3892
+ "epoch": 0.39996383690443904,
3893
+ "grad_norm": 16.40625,
3894
+ "learning_rate": 9.968752948834506e-07,
3895
+ "loss": 25.4121,
3896
+ "step": 5530
3897
+ },
3898
+ {
3899
+ "epoch": 0.4006870988156586,
3900
+ "grad_norm": 16.78125,
3901
+ "learning_rate": 9.968696444221188e-07,
3902
+ "loss": 25.0091,
3903
+ "step": 5540
3904
+ },
3905
+ {
3906
+ "epoch": 0.40141036072687825,
3907
+ "grad_norm": 16.140625,
3908
+ "learning_rate": 9.96863993960787e-07,
3909
+ "loss": 25.1445,
3910
+ "step": 5550
3911
+ },
3912
+ {
3913
+ "epoch": 0.4021336226380978,
3914
+ "grad_norm": 15.875,
3915
+ "learning_rate": 9.96858343499455e-07,
3916
+ "loss": 24.9004,
3917
+ "step": 5560
3918
+ },
3919
+ {
3920
+ "epoch": 0.4028568845493174,
3921
+ "grad_norm": 15.65625,
3922
+ "learning_rate": 9.96852693038123e-07,
3923
+ "loss": 25.0035,
3924
+ "step": 5570
3925
+ },
3926
+ {
3927
+ "epoch": 0.403580146460537,
3928
+ "grad_norm": 15.796875,
3929
+ "learning_rate": 9.96847042576791e-07,
3930
+ "loss": 25.2146,
3931
+ "step": 5580
3932
+ },
3933
+ {
3934
+ "epoch": 0.4043034083717566,
3935
+ "grad_norm": 15.6875,
3936
+ "learning_rate": 9.968413921154592e-07,
3937
+ "loss": 24.7197,
3938
+ "step": 5590
3939
+ },
3940
+ {
3941
+ "epoch": 0.40502667028297623,
3942
+ "grad_norm": 16.6875,
3943
+ "learning_rate": 9.968357416541274e-07,
3944
+ "loss": 25.1168,
3945
+ "step": 5600
3946
+ },
3947
+ {
3948
+ "epoch": 0.4057499321941958,
3949
+ "grad_norm": 16.109375,
3950
+ "learning_rate": 9.968300911927954e-07,
3951
+ "loss": 25.379,
3952
+ "step": 5610
3953
+ },
3954
+ {
3955
+ "epoch": 0.40647319410541544,
3956
+ "grad_norm": 16.578125,
3957
+ "learning_rate": 9.968244407314634e-07,
3958
+ "loss": 24.966,
3959
+ "step": 5620
3960
+ },
3961
+ {
3962
+ "epoch": 0.407196456016635,
3963
+ "grad_norm": 15.5390625,
3964
+ "learning_rate": 9.968187902701316e-07,
3965
+ "loss": 24.6402,
3966
+ "step": 5630
3967
+ },
3968
+ {
3969
+ "epoch": 0.40791971792785464,
3970
+ "grad_norm": 16.46875,
3971
+ "learning_rate": 9.968131398087996e-07,
3972
+ "loss": 25.3287,
3973
+ "step": 5640
3974
+ },
3975
+ {
3976
+ "epoch": 0.4086429798390742,
3977
+ "grad_norm": 16.34375,
3978
+ "learning_rate": 9.968074893474676e-07,
3979
+ "loss": 24.8552,
3980
+ "step": 5650
3981
+ },
3982
+ {
3983
+ "epoch": 0.40936624175029385,
3984
+ "grad_norm": 16.78125,
3985
+ "learning_rate": 9.968018388861358e-07,
3986
+ "loss": 25.2226,
3987
+ "step": 5660
3988
+ },
3989
+ {
3990
+ "epoch": 0.4100895036615134,
3991
+ "grad_norm": 16.3125,
3992
+ "learning_rate": 9.96796188424804e-07,
3993
+ "loss": 25.3465,
3994
+ "step": 5670
3995
+ },
3996
+ {
3997
+ "epoch": 0.41081276557273305,
3998
+ "grad_norm": 16.28125,
3999
+ "learning_rate": 9.96790537963472e-07,
4000
+ "loss": 25.0835,
4001
+ "step": 5680
4002
+ },
4003
+ {
4004
+ "epoch": 0.4115360274839526,
4005
+ "grad_norm": 16.4375,
4006
+ "learning_rate": 9.9678488750214e-07,
4007
+ "loss": 25.0594,
4008
+ "step": 5690
4009
+ },
4010
+ {
4011
+ "epoch": 0.4122592893951722,
4012
+ "grad_norm": 16.046875,
4013
+ "learning_rate": 9.96779237040808e-07,
4014
+ "loss": 25.1546,
4015
+ "step": 5700
4016
+ },
4017
+ {
4018
+ "epoch": 0.41298255130639183,
4019
+ "grad_norm": 16.53125,
4020
+ "learning_rate": 9.967735865794762e-07,
4021
+ "loss": 25.1157,
4022
+ "step": 5710
4023
+ },
4024
+ {
4025
+ "epoch": 0.4137058132176114,
4026
+ "grad_norm": 16.265625,
4027
+ "learning_rate": 9.967679361181444e-07,
4028
+ "loss": 25.1408,
4029
+ "step": 5720
4030
+ },
4031
+ {
4032
+ "epoch": 0.41442907512883104,
4033
+ "grad_norm": 15.8515625,
4034
+ "learning_rate": 9.967622856568124e-07,
4035
+ "loss": 24.9652,
4036
+ "step": 5730
4037
+ },
4038
+ {
4039
+ "epoch": 0.4151523370400506,
4040
+ "grad_norm": 17.25,
4041
+ "learning_rate": 9.967566351954804e-07,
4042
+ "loss": 25.4921,
4043
+ "step": 5740
4044
+ },
4045
+ {
4046
+ "epoch": 0.41587559895127024,
4047
+ "grad_norm": 16.765625,
4048
+ "learning_rate": 9.967509847341484e-07,
4049
+ "loss": 24.9062,
4050
+ "step": 5750
4051
+ },
4052
+ {
4053
+ "epoch": 0.4165988608624898,
4054
+ "grad_norm": 16.59375,
4055
+ "learning_rate": 9.967453342728166e-07,
4056
+ "loss": 25.5228,
4057
+ "step": 5760
4058
+ },
4059
+ {
4060
+ "epoch": 0.41732212277370945,
4061
+ "grad_norm": 16.3125,
4062
+ "learning_rate": 9.967396838114849e-07,
4063
+ "loss": 24.9683,
4064
+ "step": 5770
4065
+ },
4066
+ {
4067
+ "epoch": 0.418045384684929,
4068
+ "grad_norm": 16.15625,
4069
+ "learning_rate": 9.967340333501529e-07,
4070
+ "loss": 24.8785,
4071
+ "step": 5780
4072
+ },
4073
+ {
4074
+ "epoch": 0.41876864659614865,
4075
+ "grad_norm": 16.234375,
4076
+ "learning_rate": 9.967283828888209e-07,
4077
+ "loss": 25.188,
4078
+ "step": 5790
4079
+ },
4080
+ {
4081
+ "epoch": 0.4194919085073682,
4082
+ "grad_norm": 16.453125,
4083
+ "learning_rate": 9.967227324274889e-07,
4084
+ "loss": 25.6373,
4085
+ "step": 5800
4086
+ },
4087
+ {
4088
+ "epoch": 0.42021517041858786,
4089
+ "grad_norm": 16.8125,
4090
+ "learning_rate": 9.96717081966157e-07,
4091
+ "loss": 25.1054,
4092
+ "step": 5810
4093
+ },
4094
+ {
4095
+ "epoch": 0.42093843232980743,
4096
+ "grad_norm": 16.921875,
4097
+ "learning_rate": 9.967114315048253e-07,
4098
+ "loss": 25.075,
4099
+ "step": 5820
4100
+ },
4101
+ {
4102
+ "epoch": 0.421661694241027,
4103
+ "grad_norm": 16.28125,
4104
+ "learning_rate": 9.967057810434933e-07,
4105
+ "loss": 24.8702,
4106
+ "step": 5830
4107
+ },
4108
+ {
4109
+ "epoch": 0.42238495615224664,
4110
+ "grad_norm": 15.7890625,
4111
+ "learning_rate": 9.967001305821613e-07,
4112
+ "loss": 25.1268,
4113
+ "step": 5840
4114
+ },
4115
+ {
4116
+ "epoch": 0.4231082180634662,
4117
+ "grad_norm": 16.53125,
4118
+ "learning_rate": 9.966944801208295e-07,
4119
+ "loss": 25.222,
4120
+ "step": 5850
4121
+ },
4122
+ {
4123
+ "epoch": 0.42383147997468584,
4124
+ "grad_norm": 15.78125,
4125
+ "learning_rate": 9.966888296594975e-07,
4126
+ "loss": 24.7222,
4127
+ "step": 5860
4128
+ },
4129
+ {
4130
+ "epoch": 0.4245547418859054,
4131
+ "grad_norm": 17.375,
4132
+ "learning_rate": 9.966831791981657e-07,
4133
+ "loss": 24.8381,
4134
+ "step": 5870
4135
+ },
4136
+ {
4137
+ "epoch": 0.42527800379712505,
4138
+ "grad_norm": 15.6953125,
4139
+ "learning_rate": 9.966775287368337e-07,
4140
+ "loss": 25.3032,
4141
+ "step": 5880
4142
+ },
4143
+ {
4144
+ "epoch": 0.4260012657083446,
4145
+ "grad_norm": 16.484375,
4146
+ "learning_rate": 9.966718782755019e-07,
4147
+ "loss": 25.0616,
4148
+ "step": 5890
4149
+ },
4150
+ {
4151
+ "epoch": 0.42672452761956425,
4152
+ "grad_norm": 16.296875,
4153
+ "learning_rate": 9.966662278141699e-07,
4154
+ "loss": 25.1365,
4155
+ "step": 5900
4156
+ },
4157
+ {
4158
+ "epoch": 0.42744778953078383,
4159
+ "grad_norm": 16.296875,
4160
+ "learning_rate": 9.966605773528379e-07,
4161
+ "loss": 24.87,
4162
+ "step": 5910
4163
+ },
4164
+ {
4165
+ "epoch": 0.42817105144200346,
4166
+ "grad_norm": 16.421875,
4167
+ "learning_rate": 9.96654926891506e-07,
4168
+ "loss": 25.0335,
4169
+ "step": 5920
4170
+ },
4171
+ {
4172
+ "epoch": 0.42889431335322303,
4173
+ "grad_norm": 15.359375,
4174
+ "learning_rate": 9.96649276430174e-07,
4175
+ "loss": 25.2336,
4176
+ "step": 5930
4177
+ },
4178
+ {
4179
+ "epoch": 0.42961757526444266,
4180
+ "grad_norm": 15.484375,
4181
+ "learning_rate": 9.966436259688423e-07,
4182
+ "loss": 24.7188,
4183
+ "step": 5940
4184
+ },
4185
+ {
4186
+ "epoch": 0.43034083717566224,
4187
+ "grad_norm": 16.296875,
4188
+ "learning_rate": 9.966379755075103e-07,
4189
+ "loss": 25.243,
4190
+ "step": 5950
4191
+ },
4192
+ {
4193
+ "epoch": 0.4310640990868818,
4194
+ "grad_norm": 15.8359375,
4195
+ "learning_rate": 9.966323250461783e-07,
4196
+ "loss": 25.4129,
4197
+ "step": 5960
4198
+ },
4199
+ {
4200
+ "epoch": 0.43178736099810144,
4201
+ "grad_norm": 16.828125,
4202
+ "learning_rate": 9.966266745848463e-07,
4203
+ "loss": 25.113,
4204
+ "step": 5970
4205
+ },
4206
+ {
4207
+ "epoch": 0.432510622909321,
4208
+ "grad_norm": 16.6875,
4209
+ "learning_rate": 9.966210241235145e-07,
4210
+ "loss": 25.1597,
4211
+ "step": 5980
4212
+ },
4213
+ {
4214
+ "epoch": 0.43323388482054065,
4215
+ "grad_norm": 17.65625,
4216
+ "learning_rate": 9.966153736621827e-07,
4217
+ "loss": 25.085,
4218
+ "step": 5990
4219
+ },
4220
+ {
4221
+ "epoch": 0.4339571467317602,
4222
+ "grad_norm": 17.078125,
4223
+ "learning_rate": 9.966097232008507e-07,
4224
+ "loss": 25.0908,
4225
+ "step": 6000
4226
+ },
4227
+ {
4228
+ "epoch": 0.43468040864297985,
4229
+ "grad_norm": 16.578125,
4230
+ "learning_rate": 9.966040727395187e-07,
4231
+ "loss": 25.1842,
4232
+ "step": 6010
4233
+ },
4234
+ {
4235
+ "epoch": 0.43540367055419943,
4236
+ "grad_norm": 16.890625,
4237
+ "learning_rate": 9.96598422278187e-07,
4238
+ "loss": 25.157,
4239
+ "step": 6020
4240
+ },
4241
+ {
4242
+ "epoch": 0.43612693246541906,
4243
+ "grad_norm": 15.8984375,
4244
+ "learning_rate": 9.96592771816855e-07,
4245
+ "loss": 25.04,
4246
+ "step": 6030
4247
+ },
4248
+ {
4249
+ "epoch": 0.43685019437663863,
4250
+ "grad_norm": 17.546875,
4251
+ "learning_rate": 9.965871213555231e-07,
4252
+ "loss": 25.1817,
4253
+ "step": 6040
4254
+ },
4255
+ {
4256
+ "epoch": 0.43757345628785826,
4257
+ "grad_norm": 17.375,
4258
+ "learning_rate": 9.965814708941911e-07,
4259
+ "loss": 25.4064,
4260
+ "step": 6050
4261
+ },
4262
+ {
4263
+ "epoch": 0.43829671819907784,
4264
+ "grad_norm": 16.703125,
4265
+ "learning_rate": 9.965758204328591e-07,
4266
+ "loss": 25.0107,
4267
+ "step": 6060
4268
+ },
4269
+ {
4270
+ "epoch": 0.4390199801102974,
4271
+ "grad_norm": 16.828125,
4272
+ "learning_rate": 9.965701699715273e-07,
4273
+ "loss": 25.1997,
4274
+ "step": 6070
4275
+ },
4276
+ {
4277
+ "epoch": 0.43974324202151704,
4278
+ "grad_norm": 16.171875,
4279
+ "learning_rate": 9.965645195101953e-07,
4280
+ "loss": 25.2761,
4281
+ "step": 6080
4282
+ },
4283
+ {
4284
+ "epoch": 0.4404665039327366,
4285
+ "grad_norm": 15.578125,
4286
+ "learning_rate": 9.965588690488635e-07,
4287
+ "loss": 24.9903,
4288
+ "step": 6090
4289
+ },
4290
+ {
4291
+ "epoch": 0.44118976584395625,
4292
+ "grad_norm": 15.625,
4293
+ "learning_rate": 9.965532185875315e-07,
4294
+ "loss": 24.8739,
4295
+ "step": 6100
4296
+ },
4297
+ {
4298
+ "epoch": 0.4419130277551758,
4299
+ "grad_norm": 15.7734375,
4300
+ "learning_rate": 9.965475681261997e-07,
4301
+ "loss": 25.2024,
4302
+ "step": 6110
4303
+ },
4304
+ {
4305
+ "epoch": 0.44263628966639545,
4306
+ "grad_norm": 15.9296875,
4307
+ "learning_rate": 9.965419176648677e-07,
4308
+ "loss": 25.122,
4309
+ "step": 6120
4310
+ },
4311
+ {
4312
+ "epoch": 0.44335955157761503,
4313
+ "grad_norm": 17.40625,
4314
+ "learning_rate": 9.965362672035357e-07,
4315
+ "loss": 25.1236,
4316
+ "step": 6130
4317
+ },
4318
+ {
4319
+ "epoch": 0.44408281348883466,
4320
+ "grad_norm": 16.234375,
4321
+ "learning_rate": 9.96530616742204e-07,
4322
+ "loss": 24.9833,
4323
+ "step": 6140
4324
+ },
4325
+ {
4326
+ "epoch": 0.44480607540005423,
4327
+ "grad_norm": 16.28125,
4328
+ "learning_rate": 9.96524966280872e-07,
4329
+ "loss": 25.1377,
4330
+ "step": 6150
4331
+ },
4332
+ {
4333
+ "epoch": 0.44552933731127387,
4334
+ "grad_norm": 16.078125,
4335
+ "learning_rate": 9.965193158195401e-07,
4336
+ "loss": 25.0597,
4337
+ "step": 6160
4338
+ },
4339
+ {
4340
+ "epoch": 0.44625259922249344,
4341
+ "grad_norm": 15.96875,
4342
+ "learning_rate": 9.965136653582081e-07,
4343
+ "loss": 25.0991,
4344
+ "step": 6170
4345
+ },
4346
+ {
4347
+ "epoch": 0.44697586113371307,
4348
+ "grad_norm": 15.8984375,
4349
+ "learning_rate": 9.965080148968761e-07,
4350
+ "loss": 25.0012,
4351
+ "step": 6180
4352
+ },
4353
+ {
4354
+ "epoch": 0.44769912304493265,
4355
+ "grad_norm": 15.984375,
4356
+ "learning_rate": 9.965023644355444e-07,
4357
+ "loss": 24.924,
4358
+ "step": 6190
4359
+ },
4360
+ {
4361
+ "epoch": 0.4484223849561522,
4362
+ "grad_norm": 16.21875,
4363
+ "learning_rate": 9.964967139742123e-07,
4364
+ "loss": 25.307,
4365
+ "step": 6200
4366
+ },
4367
+ {
4368
+ "epoch": 0.44914564686737185,
4369
+ "grad_norm": 16.3125,
4370
+ "learning_rate": 9.964910635128806e-07,
4371
+ "loss": 25.1692,
4372
+ "step": 6210
4373
+ },
4374
+ {
4375
+ "epoch": 0.4498689087785914,
4376
+ "grad_norm": 17.046875,
4377
+ "learning_rate": 9.964854130515486e-07,
4378
+ "loss": 25.219,
4379
+ "step": 6220
4380
+ },
4381
+ {
4382
+ "epoch": 0.45059217068981106,
4383
+ "grad_norm": 16.5625,
4384
+ "learning_rate": 9.964797625902166e-07,
4385
+ "loss": 24.9296,
4386
+ "step": 6230
4387
+ },
4388
+ {
4389
+ "epoch": 0.45131543260103063,
4390
+ "grad_norm": 16.78125,
4391
+ "learning_rate": 9.964741121288848e-07,
4392
+ "loss": 25.1963,
4393
+ "step": 6240
4394
+ },
4395
+ {
4396
+ "epoch": 0.45203869451225026,
4397
+ "grad_norm": 17.078125,
4398
+ "learning_rate": 9.964684616675528e-07,
4399
+ "loss": 24.9102,
4400
+ "step": 6250
4401
+ },
4402
+ {
4403
+ "epoch": 0.45276195642346984,
4404
+ "grad_norm": 15.9765625,
4405
+ "learning_rate": 9.96462811206221e-07,
4406
+ "loss": 25.297,
4407
+ "step": 6260
4408
+ },
4409
+ {
4410
+ "epoch": 0.45348521833468947,
4411
+ "grad_norm": 16.109375,
4412
+ "learning_rate": 9.96457160744889e-07,
4413
+ "loss": 24.8933,
4414
+ "step": 6270
4415
+ },
4416
+ {
4417
+ "epoch": 0.45420848024590904,
4418
+ "grad_norm": 15.9609375,
4419
+ "learning_rate": 9.96451510283557e-07,
4420
+ "loss": 25.0007,
4421
+ "step": 6280
4422
+ },
4423
+ {
4424
+ "epoch": 0.45493174215712867,
4425
+ "grad_norm": 16.28125,
4426
+ "learning_rate": 9.964458598222252e-07,
4427
+ "loss": 25.013,
4428
+ "step": 6290
4429
+ },
4430
+ {
4431
+ "epoch": 0.45565500406834825,
4432
+ "grad_norm": 18.0625,
4433
+ "learning_rate": 9.964402093608932e-07,
4434
+ "loss": 25.5618,
4435
+ "step": 6300
4436
+ },
4437
+ {
4438
+ "epoch": 0.4563782659795679,
4439
+ "grad_norm": 16.21875,
4440
+ "learning_rate": 9.964345588995614e-07,
4441
+ "loss": 25.056,
4442
+ "step": 6310
4443
+ },
4444
+ {
4445
+ "epoch": 0.45710152789078745,
4446
+ "grad_norm": 16.59375,
4447
+ "learning_rate": 9.964289084382294e-07,
4448
+ "loss": 24.9934,
4449
+ "step": 6320
4450
+ },
4451
+ {
4452
+ "epoch": 0.457824789802007,
4453
+ "grad_norm": 16.03125,
4454
+ "learning_rate": 9.964232579768976e-07,
4455
+ "loss": 25.0928,
4456
+ "step": 6330
4457
+ },
4458
+ {
4459
+ "epoch": 0.45854805171322666,
4460
+ "grad_norm": 16.75,
4461
+ "learning_rate": 9.964176075155656e-07,
4462
+ "loss": 25.194,
4463
+ "step": 6340
4464
+ },
4465
+ {
4466
+ "epoch": 0.45927131362444623,
4467
+ "grad_norm": 16.296875,
4468
+ "learning_rate": 9.964119570542336e-07,
4469
+ "loss": 24.94,
4470
+ "step": 6350
4471
+ },
4472
+ {
4473
+ "epoch": 0.45999457553566586,
4474
+ "grad_norm": 17.671875,
4475
+ "learning_rate": 9.964063065929018e-07,
4476
+ "loss": 25.4812,
4477
+ "step": 6360
4478
+ },
4479
+ {
4480
+ "epoch": 0.46071783744688544,
4481
+ "grad_norm": 16.578125,
4482
+ "learning_rate": 9.964006561315698e-07,
4483
+ "loss": 25.5154,
4484
+ "step": 6370
4485
+ },
4486
+ {
4487
+ "epoch": 0.46144109935810507,
4488
+ "grad_norm": 15.703125,
4489
+ "learning_rate": 9.96395005670238e-07,
4490
+ "loss": 25.3724,
4491
+ "step": 6380
4492
+ },
4493
+ {
4494
+ "epoch": 0.46216436126932464,
4495
+ "grad_norm": 16.890625,
4496
+ "learning_rate": 9.96389355208906e-07,
4497
+ "loss": 25.0829,
4498
+ "step": 6390
4499
+ },
4500
+ {
4501
+ "epoch": 0.46288762318054427,
4502
+ "grad_norm": 15.7734375,
4503
+ "learning_rate": 9.96383704747574e-07,
4504
+ "loss": 25.0513,
4505
+ "step": 6400
4506
+ },
4507
+ {
4508
+ "epoch": 0.46361088509176385,
4509
+ "grad_norm": 16.96875,
4510
+ "learning_rate": 9.963780542862422e-07,
4511
+ "loss": 25.1159,
4512
+ "step": 6410
4513
+ },
4514
+ {
4515
+ "epoch": 0.4643341470029835,
4516
+ "grad_norm": 15.6484375,
4517
+ "learning_rate": 9.963724038249102e-07,
4518
+ "loss": 24.9635,
4519
+ "step": 6420
4520
+ },
4521
+ {
4522
+ "epoch": 0.46505740891420305,
4523
+ "grad_norm": 17.078125,
4524
+ "learning_rate": 9.963667533635784e-07,
4525
+ "loss": 25.3004,
4526
+ "step": 6430
4527
+ },
4528
+ {
4529
+ "epoch": 0.4657806708254227,
4530
+ "grad_norm": 16.453125,
4531
+ "learning_rate": 9.963611029022464e-07,
4532
+ "loss": 25.2261,
4533
+ "step": 6440
4534
+ },
4535
+ {
4536
+ "epoch": 0.46650393273664226,
4537
+ "grad_norm": 16.875,
4538
+ "learning_rate": 9.963554524409144e-07,
4539
+ "loss": 25.2098,
4540
+ "step": 6450
4541
+ },
4542
+ {
4543
+ "epoch": 0.46722719464786183,
4544
+ "grad_norm": 15.6640625,
4545
+ "learning_rate": 9.963498019795826e-07,
4546
+ "loss": 25.0369,
4547
+ "step": 6460
4548
+ },
4549
+ {
4550
+ "epoch": 0.46795045655908146,
4551
+ "grad_norm": 16.203125,
4552
+ "learning_rate": 9.963441515182506e-07,
4553
+ "loss": 25.028,
4554
+ "step": 6470
4555
+ },
4556
+ {
4557
+ "epoch": 0.46867371847030104,
4558
+ "grad_norm": 15.9140625,
4559
+ "learning_rate": 9.963385010569188e-07,
4560
+ "loss": 24.9715,
4561
+ "step": 6480
4562
+ },
4563
+ {
4564
+ "epoch": 0.46939698038152067,
4565
+ "grad_norm": 17.828125,
4566
+ "learning_rate": 9.963328505955868e-07,
4567
+ "loss": 25.007,
4568
+ "step": 6490
4569
+ },
4570
+ {
4571
+ "epoch": 0.47012024229274024,
4572
+ "grad_norm": 16.1875,
4573
+ "learning_rate": 9.963272001342548e-07,
4574
+ "loss": 25.0585,
4575
+ "step": 6500
4576
+ },
4577
+ {
4578
+ "epoch": 0.4708435042039599,
4579
+ "grad_norm": 15.1484375,
4580
+ "learning_rate": 9.96321549672923e-07,
4581
+ "loss": 25.3392,
4582
+ "step": 6510
4583
+ },
4584
+ {
4585
+ "epoch": 0.47156676611517945,
4586
+ "grad_norm": 16.25,
4587
+ "learning_rate": 9.96315899211591e-07,
4588
+ "loss": 25.2201,
4589
+ "step": 6520
4590
+ },
4591
+ {
4592
+ "epoch": 0.4722900280263991,
4593
+ "grad_norm": 16.171875,
4594
+ "learning_rate": 9.963102487502592e-07,
4595
+ "loss": 24.9467,
4596
+ "step": 6530
4597
+ },
4598
+ {
4599
+ "epoch": 0.47301328993761865,
4600
+ "grad_norm": 16.5625,
4601
+ "learning_rate": 9.963045982889272e-07,
4602
+ "loss": 25.1083,
4603
+ "step": 6540
4604
+ },
4605
+ {
4606
+ "epoch": 0.4737365518488383,
4607
+ "grad_norm": 16.515625,
4608
+ "learning_rate": 9.962989478275954e-07,
4609
+ "loss": 25.3217,
4610
+ "step": 6550
4611
+ },
4612
+ {
4613
+ "epoch": 0.47445981376005786,
4614
+ "grad_norm": 15.578125,
4615
+ "learning_rate": 9.962932973662634e-07,
4616
+ "loss": 25.1594,
4617
+ "step": 6560
4618
+ },
4619
+ {
4620
+ "epoch": 0.4751830756712775,
4621
+ "grad_norm": 17.09375,
4622
+ "learning_rate": 9.962876469049314e-07,
4623
+ "loss": 25.0634,
4624
+ "step": 6570
4625
+ },
4626
+ {
4627
+ "epoch": 0.47590633758249706,
4628
+ "grad_norm": 16.984375,
4629
+ "learning_rate": 9.962819964435996e-07,
4630
+ "loss": 25.3349,
4631
+ "step": 6580
4632
+ },
4633
+ {
4634
+ "epoch": 0.47662959949371664,
4635
+ "grad_norm": 15.734375,
4636
+ "learning_rate": 9.962763459822676e-07,
4637
+ "loss": 25.1982,
4638
+ "step": 6590
4639
+ },
4640
+ {
4641
+ "epoch": 0.47735286140493627,
4642
+ "grad_norm": 16.390625,
4643
+ "learning_rate": 9.962706955209358e-07,
4644
+ "loss": 25.1291,
4645
+ "step": 6600
4646
+ },
4647
+ {
4648
+ "epoch": 0.47807612331615584,
4649
+ "grad_norm": 16.3125,
4650
+ "learning_rate": 9.962650450596038e-07,
4651
+ "loss": 25.1213,
4652
+ "step": 6610
4653
+ },
4654
+ {
4655
+ "epoch": 0.4787993852273755,
4656
+ "grad_norm": 16.546875,
4657
+ "learning_rate": 9.962593945982718e-07,
4658
+ "loss": 24.9788,
4659
+ "step": 6620
4660
+ },
4661
+ {
4662
+ "epoch": 0.47952264713859505,
4663
+ "grad_norm": 15.84375,
4664
+ "learning_rate": 9.9625374413694e-07,
4665
+ "loss": 24.6411,
4666
+ "step": 6630
4667
+ },
4668
+ {
4669
+ "epoch": 0.4802459090498147,
4670
+ "grad_norm": 15.8359375,
4671
+ "learning_rate": 9.96248093675608e-07,
4672
+ "loss": 24.8477,
4673
+ "step": 6640
4674
+ },
4675
+ {
4676
+ "epoch": 0.48096917096103425,
4677
+ "grad_norm": 16.234375,
4678
+ "learning_rate": 9.962424432142763e-07,
4679
+ "loss": 25.1838,
4680
+ "step": 6650
4681
+ },
4682
+ {
4683
+ "epoch": 0.4816924328722539,
4684
+ "grad_norm": 15.03125,
4685
+ "learning_rate": 9.962367927529443e-07,
4686
+ "loss": 24.8121,
4687
+ "step": 6660
4688
+ },
4689
+ {
4690
+ "epoch": 0.48241569478347346,
4691
+ "grad_norm": 16.28125,
4692
+ "learning_rate": 9.962311422916122e-07,
4693
+ "loss": 24.8409,
4694
+ "step": 6670
4695
+ },
4696
+ {
4697
+ "epoch": 0.4831389566946931,
4698
+ "grad_norm": 16.6875,
4699
+ "learning_rate": 9.962254918302805e-07,
4700
+ "loss": 24.7554,
4701
+ "step": 6680
4702
+ },
4703
+ {
4704
+ "epoch": 0.48386221860591266,
4705
+ "grad_norm": 17.90625,
4706
+ "learning_rate": 9.962198413689485e-07,
4707
+ "loss": 25.3045,
4708
+ "step": 6690
4709
+ },
4710
+ {
4711
+ "epoch": 0.48458548051713224,
4712
+ "grad_norm": 16.859375,
4713
+ "learning_rate": 9.962141909076167e-07,
4714
+ "loss": 25.1306,
4715
+ "step": 6700
4716
+ },
4717
+ {
4718
+ "epoch": 0.48530874242835187,
4719
+ "grad_norm": 16.65625,
4720
+ "learning_rate": 9.962085404462847e-07,
4721
+ "loss": 25.1144,
4722
+ "step": 6710
4723
+ },
4724
+ {
4725
+ "epoch": 0.48603200433957144,
4726
+ "grad_norm": 16.15625,
4727
+ "learning_rate": 9.962028899849527e-07,
4728
+ "loss": 24.9379,
4729
+ "step": 6720
4730
+ },
4731
+ {
4732
+ "epoch": 0.4867552662507911,
4733
+ "grad_norm": 17.609375,
4734
+ "learning_rate": 9.961972395236209e-07,
4735
+ "loss": 24.8681,
4736
+ "step": 6730
4737
+ },
4738
+ {
4739
+ "epoch": 0.48747852816201065,
4740
+ "grad_norm": 16.359375,
4741
+ "learning_rate": 9.961915890622889e-07,
4742
+ "loss": 24.836,
4743
+ "step": 6740
4744
+ },
4745
+ {
4746
+ "epoch": 0.4882017900732303,
4747
+ "grad_norm": 16.3125,
4748
+ "learning_rate": 9.96185938600957e-07,
4749
+ "loss": 24.7874,
4750
+ "step": 6750
4751
+ },
4752
+ {
4753
+ "epoch": 0.48892505198444985,
4754
+ "grad_norm": 17.46875,
4755
+ "learning_rate": 9.96180288139625e-07,
4756
+ "loss": 25.4011,
4757
+ "step": 6760
4758
+ },
4759
+ {
4760
+ "epoch": 0.4896483138956695,
4761
+ "grad_norm": 15.1484375,
4762
+ "learning_rate": 9.961746376782933e-07,
4763
+ "loss": 25.1523,
4764
+ "step": 6770
4765
+ },
4766
+ {
4767
+ "epoch": 0.49037157580688906,
4768
+ "grad_norm": 17.4375,
4769
+ "learning_rate": 9.961689872169613e-07,
4770
+ "loss": 25.3958,
4771
+ "step": 6780
4772
+ },
4773
+ {
4774
+ "epoch": 0.4910948377181087,
4775
+ "grad_norm": 15.6796875,
4776
+ "learning_rate": 9.961633367556293e-07,
4777
+ "loss": 25.3325,
4778
+ "step": 6790
4779
+ },
4780
+ {
4781
+ "epoch": 0.49181809962932826,
4782
+ "grad_norm": 22.125,
4783
+ "learning_rate": 9.961576862942975e-07,
4784
+ "loss": 25.1442,
4785
+ "step": 6800
4786
+ },
4787
+ {
4788
+ "epoch": 0.4925413615405479,
4789
+ "grad_norm": 16.765625,
4790
+ "learning_rate": 9.961520358329655e-07,
4791
+ "loss": 25.2003,
4792
+ "step": 6810
4793
+ },
4794
+ {
4795
+ "epoch": 0.49326462345176747,
4796
+ "grad_norm": 15.6640625,
4797
+ "learning_rate": 9.961463853716337e-07,
4798
+ "loss": 25.221,
4799
+ "step": 6820
4800
+ },
4801
+ {
4802
+ "epoch": 0.49398788536298704,
4803
+ "grad_norm": 17.015625,
4804
+ "learning_rate": 9.961407349103017e-07,
4805
+ "loss": 24.856,
4806
+ "step": 6830
4807
+ },
4808
+ {
4809
+ "epoch": 0.4947111472742067,
4810
+ "grad_norm": 16.359375,
4811
+ "learning_rate": 9.961350844489697e-07,
4812
+ "loss": 25.1671,
4813
+ "step": 6840
4814
+ },
4815
+ {
4816
+ "epoch": 0.49543440918542625,
4817
+ "grad_norm": 15.8828125,
4818
+ "learning_rate": 9.96129433987638e-07,
4819
+ "loss": 25.145,
4820
+ "step": 6850
4821
+ },
4822
+ {
4823
+ "epoch": 0.4961576710966459,
4824
+ "grad_norm": 16.15625,
4825
+ "learning_rate": 9.961237835263059e-07,
4826
+ "loss": 25.2164,
4827
+ "step": 6860
4828
+ },
4829
+ {
4830
+ "epoch": 0.49688093300786546,
4831
+ "grad_norm": 17.078125,
4832
+ "learning_rate": 9.96118133064974e-07,
4833
+ "loss": 25.113,
4834
+ "step": 6870
4835
+ },
4836
+ {
4837
+ "epoch": 0.4976041949190851,
4838
+ "grad_norm": 16.640625,
4839
+ "learning_rate": 9.96112482603642e-07,
4840
+ "loss": 24.7216,
4841
+ "step": 6880
4842
+ },
4843
+ {
4844
+ "epoch": 0.49832745683030466,
4845
+ "grad_norm": 16.5,
4846
+ "learning_rate": 9.9610683214231e-07,
4847
+ "loss": 24.9603,
4848
+ "step": 6890
4849
+ },
4850
+ {
4851
+ "epoch": 0.4990507187415243,
4852
+ "grad_norm": 16.0,
4853
+ "learning_rate": 9.961011816809783e-07,
4854
+ "loss": 25.1272,
4855
+ "step": 6900
4856
+ },
4857
+ {
4858
+ "epoch": 0.49977398065274387,
4859
+ "grad_norm": 16.296875,
4860
+ "learning_rate": 9.960955312196463e-07,
4861
+ "loss": 24.8981,
4862
+ "step": 6910
4863
+ },
4864
+ {
4865
+ "epoch": 0.5004972425639634,
4866
+ "grad_norm": 16.03125,
4867
+ "learning_rate": 9.960898807583145e-07,
4868
+ "loss": 25.1166,
4869
+ "step": 6920
4870
+ },
4871
+ {
4872
+ "epoch": 0.5012205044751831,
4873
+ "grad_norm": 16.734375,
4874
+ "learning_rate": 9.960842302969825e-07,
4875
+ "loss": 25.522,
4876
+ "step": 6930
4877
+ },
4878
+ {
4879
+ "epoch": 0.5019437663864027,
4880
+ "grad_norm": 16.203125,
4881
+ "learning_rate": 9.960785798356507e-07,
4882
+ "loss": 24.7892,
4883
+ "step": 6940
4884
+ },
4885
+ {
4886
+ "epoch": 0.5026670282976223,
4887
+ "grad_norm": 16.890625,
4888
+ "learning_rate": 9.960729293743187e-07,
4889
+ "loss": 24.6813,
4890
+ "step": 6950
4891
+ },
4892
+ {
4893
+ "epoch": 0.5033902902088419,
4894
+ "grad_norm": 15.59375,
4895
+ "learning_rate": 9.960672789129867e-07,
4896
+ "loss": 24.9678,
4897
+ "step": 6960
4898
+ },
4899
+ {
4900
+ "epoch": 0.5041135521200615,
4901
+ "grad_norm": 16.34375,
4902
+ "learning_rate": 9.96061628451655e-07,
4903
+ "loss": 25.2393,
4904
+ "step": 6970
4905
+ },
4906
+ {
4907
+ "epoch": 0.5048368140312811,
4908
+ "grad_norm": 15.8046875,
4909
+ "learning_rate": 9.96055977990323e-07,
4910
+ "loss": 25.1625,
4911
+ "step": 6980
4912
+ },
4913
+ {
4914
+ "epoch": 0.5055600759425006,
4915
+ "grad_norm": 15.3671875,
4916
+ "learning_rate": 9.960503275289911e-07,
4917
+ "loss": 25.1241,
4918
+ "step": 6990
4919
+ },
4920
+ {
4921
+ "epoch": 0.5062833378537203,
4922
+ "grad_norm": 16.3125,
4923
+ "learning_rate": 9.960446770676591e-07,
4924
+ "loss": 25.0165,
4925
+ "step": 7000
4926
+ },
4927
+ {
4928
+ "epoch": 0.5070065997649399,
4929
+ "grad_norm": 15.8515625,
4930
+ "learning_rate": 9.960390266063271e-07,
4931
+ "loss": 25.5291,
4932
+ "step": 7010
4933
+ },
4934
+ {
4935
+ "epoch": 0.5077298616761595,
4936
+ "grad_norm": 16.46875,
4937
+ "learning_rate": 9.960333761449953e-07,
4938
+ "loss": 25.1357,
4939
+ "step": 7020
4940
+ },
4941
+ {
4942
+ "epoch": 0.508453123587379,
4943
+ "grad_norm": 15.515625,
4944
+ "learning_rate": 9.960277256836635e-07,
4945
+ "loss": 25.2483,
4946
+ "step": 7030
4947
+ },
4948
+ {
4949
+ "epoch": 0.5091763854985987,
4950
+ "grad_norm": 15.9609375,
4951
+ "learning_rate": 9.960220752223315e-07,
4952
+ "loss": 24.7912,
4953
+ "step": 7040
4954
+ },
4955
+ {
4956
+ "epoch": 0.5098996474098183,
4957
+ "grad_norm": 16.515625,
4958
+ "learning_rate": 9.960164247609995e-07,
4959
+ "loss": 25.3561,
4960
+ "step": 7050
4961
+ },
4962
+ {
4963
+ "epoch": 0.5106229093210379,
4964
+ "grad_norm": 16.25,
4965
+ "learning_rate": 9.960107742996675e-07,
4966
+ "loss": 25.3195,
4967
+ "step": 7060
4968
+ },
4969
+ {
4970
+ "epoch": 0.5113461712322575,
4971
+ "grad_norm": 16.859375,
4972
+ "learning_rate": 9.960051238383357e-07,
4973
+ "loss": 25.1948,
4974
+ "step": 7070
4975
+ },
4976
+ {
4977
+ "epoch": 0.5120694331434771,
4978
+ "grad_norm": 15.890625,
4979
+ "learning_rate": 9.95999473377004e-07,
4980
+ "loss": 25.1284,
4981
+ "step": 7080
4982
+ },
4983
+ {
4984
+ "epoch": 0.5127926950546967,
4985
+ "grad_norm": 16.296875,
4986
+ "learning_rate": 9.95993822915672e-07,
4987
+ "loss": 25.0781,
4988
+ "step": 7090
4989
+ },
4990
+ {
4991
+ "epoch": 0.5135159569659162,
4992
+ "grad_norm": 16.65625,
4993
+ "learning_rate": 9.9598817245434e-07,
4994
+ "loss": 25.2929,
4995
+ "step": 7100
4996
+ },
4997
+ {
4998
+ "epoch": 0.5142392188771359,
4999
+ "grad_norm": 16.5,
5000
+ "learning_rate": 9.95982521993008e-07,
5001
+ "loss": 25.0102,
5002
+ "step": 7110
5003
+ },
5004
+ {
5005
+ "epoch": 0.5149624807883555,
5006
+ "grad_norm": 16.46875,
5007
+ "learning_rate": 9.959768715316762e-07,
5008
+ "loss": 25.1572,
5009
+ "step": 7120
5010
+ },
5011
+ {
5012
+ "epoch": 0.5156857426995751,
5013
+ "grad_norm": 15.265625,
5014
+ "learning_rate": 9.959712210703442e-07,
5015
+ "loss": 25.0643,
5016
+ "step": 7130
5017
+ },
5018
+ {
5019
+ "epoch": 0.5164090046107946,
5020
+ "grad_norm": 16.296875,
5021
+ "learning_rate": 9.959655706090124e-07,
5022
+ "loss": 25.0202,
5023
+ "step": 7140
5024
+ },
5025
+ {
5026
+ "epoch": 0.5171322665220143,
5027
+ "grad_norm": 15.6484375,
5028
+ "learning_rate": 9.959599201476804e-07,
5029
+ "loss": 24.9407,
5030
+ "step": 7150
5031
+ },
5032
+ {
5033
+ "epoch": 0.5178555284332339,
5034
+ "grad_norm": 16.765625,
5035
+ "learning_rate": 9.959542696863486e-07,
5036
+ "loss": 25.3482,
5037
+ "step": 7160
5038
+ },
5039
+ {
5040
+ "epoch": 0.5185787903444535,
5041
+ "grad_norm": 16.296875,
5042
+ "learning_rate": 9.959486192250166e-07,
5043
+ "loss": 24.7983,
5044
+ "step": 7170
5045
+ },
5046
+ {
5047
+ "epoch": 0.519302052255673,
5048
+ "grad_norm": 15.140625,
5049
+ "learning_rate": 9.959429687636846e-07,
5050
+ "loss": 24.9558,
5051
+ "step": 7180
5052
+ },
5053
+ {
5054
+ "epoch": 0.5200253141668927,
5055
+ "grad_norm": 16.09375,
5056
+ "learning_rate": 9.959373183023528e-07,
5057
+ "loss": 25.2437,
5058
+ "step": 7190
5059
+ },
5060
+ {
5061
+ "epoch": 0.5207485760781123,
5062
+ "grad_norm": 17.4375,
5063
+ "learning_rate": 9.959316678410208e-07,
5064
+ "loss": 24.9945,
5065
+ "step": 7200
5066
+ },
5067
+ {
5068
+ "epoch": 0.5214718379893318,
5069
+ "grad_norm": 16.78125,
5070
+ "learning_rate": 9.95926017379689e-07,
5071
+ "loss": 24.876,
5072
+ "step": 7210
5073
+ },
5074
+ {
5075
+ "epoch": 0.5221950999005515,
5076
+ "grad_norm": 15.6640625,
5077
+ "learning_rate": 9.95920366918357e-07,
5078
+ "loss": 25.1469,
5079
+ "step": 7220
5080
+ },
5081
+ {
5082
+ "epoch": 0.5229183618117711,
5083
+ "grad_norm": 16.34375,
5084
+ "learning_rate": 9.95914716457025e-07,
5085
+ "loss": 25.0436,
5086
+ "step": 7230
5087
+ },
5088
+ {
5089
+ "epoch": 0.5236416237229907,
5090
+ "grad_norm": 15.25,
5091
+ "learning_rate": 9.959090659956932e-07,
5092
+ "loss": 25.0983,
5093
+ "step": 7240
5094
+ },
5095
+ {
5096
+ "epoch": 0.5243648856342102,
5097
+ "grad_norm": 16.578125,
5098
+ "learning_rate": 9.959034155343614e-07,
5099
+ "loss": 25.1828,
5100
+ "step": 7250
5101
+ },
5102
+ {
5103
+ "epoch": 0.5250881475454299,
5104
+ "grad_norm": 15.375,
5105
+ "learning_rate": 9.958977650730294e-07,
5106
+ "loss": 25.0138,
5107
+ "step": 7260
5108
+ },
5109
+ {
5110
+ "epoch": 0.5258114094566495,
5111
+ "grad_norm": 15.515625,
5112
+ "learning_rate": 9.958921146116974e-07,
5113
+ "loss": 25.336,
5114
+ "step": 7270
5115
+ },
5116
+ {
5117
+ "epoch": 0.5265346713678691,
5118
+ "grad_norm": 16.59375,
5119
+ "learning_rate": 9.958864641503654e-07,
5120
+ "loss": 25.1535,
5121
+ "step": 7280
5122
+ },
5123
+ {
5124
+ "epoch": 0.5272579332790887,
5125
+ "grad_norm": 15.8828125,
5126
+ "learning_rate": 9.958808136890336e-07,
5127
+ "loss": 25.1138,
5128
+ "step": 7290
5129
+ },
5130
+ {
5131
+ "epoch": 0.5279811951903083,
5132
+ "grad_norm": 16.203125,
5133
+ "learning_rate": 9.958751632277018e-07,
5134
+ "loss": 24.8106,
5135
+ "step": 7300
5136
+ },
5137
+ {
5138
+ "epoch": 0.5287044571015279,
5139
+ "grad_norm": 16.296875,
5140
+ "learning_rate": 9.958695127663698e-07,
5141
+ "loss": 24.7198,
5142
+ "step": 7310
5143
+ },
5144
+ {
5145
+ "epoch": 0.5294277190127475,
5146
+ "grad_norm": 16.140625,
5147
+ "learning_rate": 9.958638623050378e-07,
5148
+ "loss": 24.796,
5149
+ "step": 7320
5150
+ },
5151
+ {
5152
+ "epoch": 0.5301509809239671,
5153
+ "grad_norm": 16.390625,
5154
+ "learning_rate": 9.958582118437058e-07,
5155
+ "loss": 24.8893,
5156
+ "step": 7330
5157
+ },
5158
+ {
5159
+ "epoch": 0.5308742428351867,
5160
+ "grad_norm": 15.859375,
5161
+ "learning_rate": 9.95852561382374e-07,
5162
+ "loss": 25.1709,
5163
+ "step": 7340
5164
+ },
5165
+ {
5166
+ "epoch": 0.5315975047464063,
5167
+ "grad_norm": 15.9453125,
5168
+ "learning_rate": 9.958469109210422e-07,
5169
+ "loss": 25.0314,
5170
+ "step": 7350
5171
+ },
5172
+ {
5173
+ "epoch": 0.5323207666576258,
5174
+ "grad_norm": 16.296875,
5175
+ "learning_rate": 9.958412604597102e-07,
5176
+ "loss": 24.7342,
5177
+ "step": 7360
5178
+ },
5179
+ {
5180
+ "epoch": 0.5330440285688455,
5181
+ "grad_norm": 15.9140625,
5182
+ "learning_rate": 9.958356099983782e-07,
5183
+ "loss": 24.7556,
5184
+ "step": 7370
5185
+ },
5186
+ {
5187
+ "epoch": 0.5337672904800651,
5188
+ "grad_norm": 16.375,
5189
+ "learning_rate": 9.958299595370464e-07,
5190
+ "loss": 25.1313,
5191
+ "step": 7380
5192
+ },
5193
+ {
5194
+ "epoch": 0.5344905523912847,
5195
+ "grad_norm": 16.75,
5196
+ "learning_rate": 9.958243090757144e-07,
5197
+ "loss": 25.1914,
5198
+ "step": 7390
5199
+ },
5200
+ {
5201
+ "epoch": 0.5352138143025043,
5202
+ "grad_norm": 15.2109375,
5203
+ "learning_rate": 9.958186586143826e-07,
5204
+ "loss": 25.0681,
5205
+ "step": 7400
5206
+ },
5207
+ {
5208
+ "epoch": 0.5359370762137239,
5209
+ "grad_norm": 16.40625,
5210
+ "learning_rate": 9.958130081530506e-07,
5211
+ "loss": 25.4192,
5212
+ "step": 7410
5213
+ },
5214
+ {
5215
+ "epoch": 0.5366603381249435,
5216
+ "grad_norm": 15.515625,
5217
+ "learning_rate": 9.958073576917186e-07,
5218
+ "loss": 25.2562,
5219
+ "step": 7420
5220
+ },
5221
+ {
5222
+ "epoch": 0.5373836000361631,
5223
+ "grad_norm": 15.7109375,
5224
+ "learning_rate": 9.958017072303868e-07,
5225
+ "loss": 25.1077,
5226
+ "step": 7430
5227
+ },
5228
+ {
5229
+ "epoch": 0.5381068619473827,
5230
+ "grad_norm": 17.1875,
5231
+ "learning_rate": 9.957960567690548e-07,
5232
+ "loss": 25.1754,
5233
+ "step": 7440
5234
+ },
5235
+ {
5236
+ "epoch": 0.5388301238586023,
5237
+ "grad_norm": 16.34375,
5238
+ "learning_rate": 9.95790406307723e-07,
5239
+ "loss": 25.0318,
5240
+ "step": 7450
5241
+ },
5242
+ {
5243
+ "epoch": 0.5395533857698219,
5244
+ "grad_norm": 16.375,
5245
+ "learning_rate": 9.95784755846391e-07,
5246
+ "loss": 25.2289,
5247
+ "step": 7460
5248
+ },
5249
+ {
5250
+ "epoch": 0.5402766476810414,
5251
+ "grad_norm": 15.21875,
5252
+ "learning_rate": 9.957791053850592e-07,
5253
+ "loss": 24.9497,
5254
+ "step": 7470
5255
+ },
5256
+ {
5257
+ "epoch": 0.5409999095922611,
5258
+ "grad_norm": 15.671875,
5259
+ "learning_rate": 9.957734549237272e-07,
5260
+ "loss": 25.0775,
5261
+ "step": 7480
5262
+ },
5263
+ {
5264
+ "epoch": 0.5417231715034807,
5265
+ "grad_norm": 16.5,
5266
+ "learning_rate": 9.957678044623952e-07,
5267
+ "loss": 25.3473,
5268
+ "step": 7490
5269
+ },
5270
+ {
5271
+ "epoch": 0.5424464334147003,
5272
+ "grad_norm": 16.734375,
5273
+ "learning_rate": 9.957621540010632e-07,
5274
+ "loss": 25.1996,
5275
+ "step": 7500
5276
+ },
5277
+ {
5278
+ "epoch": 0.5424464334147003,
5279
+ "eval_loss": 1.5896706581115723,
5280
+ "eval_runtime": 371.382,
5281
+ "eval_samples_per_second": 1955.019,
5282
+ "eval_steps_per_second": 30.548,
5283
+ "step": 7500
5284
  }
5285
  ],
5286
  "logging_steps": 10,
 
5300
  "attributes": {}
5301
  }
5302
  },
5303
+ "total_flos": 1.0476964175413248e+19,
5304
  "train_batch_size": 16,
5305
  "trial_name": null,
5306
  "trial_params": null