CocoRoF commited on
Commit
9afdc91
·
verified ·
1 Parent(s): 45e087b

Training in progress, step 12500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0650b34e8d23b3446525371f6f73e6e00280572d8ce6b52c973f4cc138898ffe
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38d3243659eec430b28c341d8e0c0d646f3019e3b9734426c0e021b7a2895075
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7a0e457ec4cc9ee087e5fbed60cfc5368dc0e8fafbafd4677643d087e0a0b94
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c268b95cc6e03b42c01044e4b60c99e59b2be381305f753ed869b92c7c15b65
3
  size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06fea830cf5ad73ec00d500ea6fb952740ac936f18e93fa2d32abde1ea3ead92
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be561d1df19be227394d8ea607c54262a06c9bf880af0aa5e04a52596a2a6cb0
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03f3e24417a59435f5a8450a4aeb0f09cc92734b5c3b45a0701b2c043c415c05
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bea02744c29f30024590ab1629a0e7b7dabbf1e8476456c2e7c5ce46dc35c28
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:041be966454b60c86af576fc1eb7f34189114689abff8f9622b947110f7334c8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b85766f6596d15a810177d77dd259d9b50588cf100ec5f8ebff5fed881d57957
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8be75d04b1ebe614241b88fd010a5dda1b7bf703c00c6ebe310ca07975830fe7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4699833a7ab4cb692996ef7567f934c0bac79d6a067963a873f89a38e412bd48
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e677c668f388d20cb2a51630b38a0e9e72b08c3b4c96a7a44de0c1086265eab
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f9556ec9ffd73ec908034e2eecc199149cf838ee204595e0609732ae237439d
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7232619112196004,
5
  "eval_steps": 2500,
6
- "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -7039,6 +7039,1764 @@
7039
  "eval_samples_per_second": 1979.565,
7040
  "eval_steps_per_second": 30.932,
7041
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7042
  }
7043
  ],
7044
  "logging_steps": 10,
@@ -7058,7 +8816,7 @@
7058
  "attributes": {}
7059
  }
7060
  },
7061
- "total_flos": 1.3969285567217664e+19,
7062
  "train_batch_size": 16,
7063
  "trial_name": null,
7064
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9040773890245005,
5
  "eval_steps": 2500,
6
+ "global_step": 12500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
7039
  "eval_samples_per_second": 1979.565,
7040
  "eval_steps_per_second": 30.932,
7041
  "step": 10000
7042
+ },
7043
+ {
7044
+ "epoch": 0.72398517313082,
7045
+ "grad_norm": 16.75,
7046
+ "learning_rate": 9.943438882067526e-07,
7047
+ "loss": 24.8588,
7048
+ "step": 10010
7049
+ },
7050
+ {
7051
+ "epoch": 0.7247084350420396,
7052
+ "grad_norm": 17.0,
7053
+ "learning_rate": 9.943382377454206e-07,
7054
+ "loss": 25.2057,
7055
+ "step": 10020
7056
+ },
7057
+ {
7058
+ "epoch": 0.7254316969532592,
7059
+ "grad_norm": 17.5625,
7060
+ "learning_rate": 9.943325872840888e-07,
7061
+ "loss": 24.6792,
7062
+ "step": 10030
7063
+ },
7064
+ {
7065
+ "epoch": 0.7261549588644788,
7066
+ "grad_norm": 16.328125,
7067
+ "learning_rate": 9.943269368227568e-07,
7068
+ "loss": 25.142,
7069
+ "step": 10040
7070
+ },
7071
+ {
7072
+ "epoch": 0.7268782207756984,
7073
+ "grad_norm": 16.78125,
7074
+ "learning_rate": 9.94321286361425e-07,
7075
+ "loss": 25.3561,
7076
+ "step": 10050
7077
+ },
7078
+ {
7079
+ "epoch": 0.727601482686918,
7080
+ "grad_norm": 15.09375,
7081
+ "learning_rate": 9.94315635900093e-07,
7082
+ "loss": 24.9095,
7083
+ "step": 10060
7084
+ },
7085
+ {
7086
+ "epoch": 0.7283247445981376,
7087
+ "grad_norm": 15.515625,
7088
+ "learning_rate": 9.94309985438761e-07,
7089
+ "loss": 24.798,
7090
+ "step": 10070
7091
+ },
7092
+ {
7093
+ "epoch": 0.7290480065093572,
7094
+ "grad_norm": 15.7578125,
7095
+ "learning_rate": 9.943043349774292e-07,
7096
+ "loss": 25.0624,
7097
+ "step": 10080
7098
+ },
7099
+ {
7100
+ "epoch": 0.7297712684205768,
7101
+ "grad_norm": 16.15625,
7102
+ "learning_rate": 9.942986845160972e-07,
7103
+ "loss": 24.9009,
7104
+ "step": 10090
7105
+ },
7106
+ {
7107
+ "epoch": 0.7304945303317965,
7108
+ "grad_norm": 16.140625,
7109
+ "learning_rate": 9.942930340547654e-07,
7110
+ "loss": 24.9644,
7111
+ "step": 10100
7112
+ },
7113
+ {
7114
+ "epoch": 0.731217792243016,
7115
+ "grad_norm": 15.4453125,
7116
+ "learning_rate": 9.942873835934334e-07,
7117
+ "loss": 25.1517,
7118
+ "step": 10110
7119
+ },
7120
+ {
7121
+ "epoch": 0.7319410541542356,
7122
+ "grad_norm": 16.0,
7123
+ "learning_rate": 9.942817331321014e-07,
7124
+ "loss": 25.0996,
7125
+ "step": 10120
7126
+ },
7127
+ {
7128
+ "epoch": 0.7326643160654552,
7129
+ "grad_norm": 17.34375,
7130
+ "learning_rate": 9.942760826707696e-07,
7131
+ "loss": 25.0529,
7132
+ "step": 10130
7133
+ },
7134
+ {
7135
+ "epoch": 0.7333875779766748,
7136
+ "grad_norm": 16.484375,
7137
+ "learning_rate": 9.942704322094376e-07,
7138
+ "loss": 24.9834,
7139
+ "step": 10140
7140
+ },
7141
+ {
7142
+ "epoch": 0.7341108398878944,
7143
+ "grad_norm": 16.59375,
7144
+ "learning_rate": 9.942647817481058e-07,
7145
+ "loss": 24.9826,
7146
+ "step": 10150
7147
+ },
7148
+ {
7149
+ "epoch": 0.734834101799114,
7150
+ "grad_norm": 15.5234375,
7151
+ "learning_rate": 9.942591312867738e-07,
7152
+ "loss": 25.2645,
7153
+ "step": 10160
7154
+ },
7155
+ {
7156
+ "epoch": 0.7355573637103336,
7157
+ "grad_norm": 14.96875,
7158
+ "learning_rate": 9.942534808254418e-07,
7159
+ "loss": 25.024,
7160
+ "step": 10170
7161
+ },
7162
+ {
7163
+ "epoch": 0.7362806256215532,
7164
+ "grad_norm": 16.515625,
7165
+ "learning_rate": 9.9424783036411e-07,
7166
+ "loss": 25.1001,
7167
+ "step": 10180
7168
+ },
7169
+ {
7170
+ "epoch": 0.7370038875327728,
7171
+ "grad_norm": 16.125,
7172
+ "learning_rate": 9.94242179902778e-07,
7173
+ "loss": 24.9974,
7174
+ "step": 10190
7175
+ },
7176
+ {
7177
+ "epoch": 0.7377271494439924,
7178
+ "grad_norm": 15.9296875,
7179
+ "learning_rate": 9.942365294414463e-07,
7180
+ "loss": 24.9367,
7181
+ "step": 10200
7182
+ },
7183
+ {
7184
+ "epoch": 0.738450411355212,
7185
+ "grad_norm": 15.7578125,
7186
+ "learning_rate": 9.942308789801142e-07,
7187
+ "loss": 25.1152,
7188
+ "step": 10210
7189
+ },
7190
+ {
7191
+ "epoch": 0.7391736732664316,
7192
+ "grad_norm": 16.34375,
7193
+ "learning_rate": 9.942252285187822e-07,
7194
+ "loss": 24.7284,
7195
+ "step": 10220
7196
+ },
7197
+ {
7198
+ "epoch": 0.7398969351776512,
7199
+ "grad_norm": 15.5234375,
7200
+ "learning_rate": 9.942195780574505e-07,
7201
+ "loss": 24.9462,
7202
+ "step": 10230
7203
+ },
7204
+ {
7205
+ "epoch": 0.7406201970888708,
7206
+ "grad_norm": 16.015625,
7207
+ "learning_rate": 9.942139275961185e-07,
7208
+ "loss": 24.5272,
7209
+ "step": 10240
7210
+ },
7211
+ {
7212
+ "epoch": 0.7413434590000904,
7213
+ "grad_norm": 16.546875,
7214
+ "learning_rate": 9.942082771347867e-07,
7215
+ "loss": 25.286,
7216
+ "step": 10250
7217
+ },
7218
+ {
7219
+ "epoch": 0.74206672091131,
7220
+ "grad_norm": 16.390625,
7221
+ "learning_rate": 9.942026266734547e-07,
7222
+ "loss": 25.3334,
7223
+ "step": 10260
7224
+ },
7225
+ {
7226
+ "epoch": 0.7427899828225296,
7227
+ "grad_norm": 17.328125,
7228
+ "learning_rate": 9.941969762121229e-07,
7229
+ "loss": 24.9494,
7230
+ "step": 10270
7231
+ },
7232
+ {
7233
+ "epoch": 0.7435132447337492,
7234
+ "grad_norm": 16.765625,
7235
+ "learning_rate": 9.941913257507909e-07,
7236
+ "loss": 25.0898,
7237
+ "step": 10280
7238
+ },
7239
+ {
7240
+ "epoch": 0.7442365066449688,
7241
+ "grad_norm": 15.578125,
7242
+ "learning_rate": 9.941856752894589e-07,
7243
+ "loss": 24.8245,
7244
+ "step": 10290
7245
+ },
7246
+ {
7247
+ "epoch": 0.7449597685561884,
7248
+ "grad_norm": 16.359375,
7249
+ "learning_rate": 9.94180024828127e-07,
7250
+ "loss": 25.155,
7251
+ "step": 10300
7252
+ },
7253
+ {
7254
+ "epoch": 0.745683030467408,
7255
+ "grad_norm": 14.734375,
7256
+ "learning_rate": 9.94174374366795e-07,
7257
+ "loss": 24.8256,
7258
+ "step": 10310
7259
+ },
7260
+ {
7261
+ "epoch": 0.7464062923786277,
7262
+ "grad_norm": 16.4375,
7263
+ "learning_rate": 9.941687239054633e-07,
7264
+ "loss": 24.7595,
7265
+ "step": 10320
7266
+ },
7267
+ {
7268
+ "epoch": 0.7471295542898472,
7269
+ "grad_norm": 16.515625,
7270
+ "learning_rate": 9.941630734441313e-07,
7271
+ "loss": 25.2959,
7272
+ "step": 10330
7273
+ },
7274
+ {
7275
+ "epoch": 0.7478528162010668,
7276
+ "grad_norm": 17.3125,
7277
+ "learning_rate": 9.941574229827993e-07,
7278
+ "loss": 25.0216,
7279
+ "step": 10340
7280
+ },
7281
+ {
7282
+ "epoch": 0.7485760781122864,
7283
+ "grad_norm": 17.453125,
7284
+ "learning_rate": 9.941517725214675e-07,
7285
+ "loss": 24.9963,
7286
+ "step": 10350
7287
+ },
7288
+ {
7289
+ "epoch": 0.7492993400235061,
7290
+ "grad_norm": 15.8359375,
7291
+ "learning_rate": 9.941461220601357e-07,
7292
+ "loss": 24.8352,
7293
+ "step": 10360
7294
+ },
7295
+ {
7296
+ "epoch": 0.7500226019347256,
7297
+ "grad_norm": 16.3125,
7298
+ "learning_rate": 9.941404715988037e-07,
7299
+ "loss": 25.1953,
7300
+ "step": 10370
7301
+ },
7302
+ {
7303
+ "epoch": 0.7507458638459452,
7304
+ "grad_norm": 14.984375,
7305
+ "learning_rate": 9.941348211374717e-07,
7306
+ "loss": 25.1771,
7307
+ "step": 10380
7308
+ },
7309
+ {
7310
+ "epoch": 0.7514691257571648,
7311
+ "grad_norm": 16.125,
7312
+ "learning_rate": 9.941291706761397e-07,
7313
+ "loss": 24.8558,
7314
+ "step": 10390
7315
+ },
7316
+ {
7317
+ "epoch": 0.7521923876683844,
7318
+ "grad_norm": 16.21875,
7319
+ "learning_rate": 9.941235202148079e-07,
7320
+ "loss": 25.06,
7321
+ "step": 10400
7322
+ },
7323
+ {
7324
+ "epoch": 0.752915649579604,
7325
+ "grad_norm": 16.1875,
7326
+ "learning_rate": 9.94117869753476e-07,
7327
+ "loss": 25.3342,
7328
+ "step": 10410
7329
+ },
7330
+ {
7331
+ "epoch": 0.7536389114908236,
7332
+ "grad_norm": 16.078125,
7333
+ "learning_rate": 9.94112219292144e-07,
7334
+ "loss": 25.2384,
7335
+ "step": 10420
7336
+ },
7337
+ {
7338
+ "epoch": 0.7543621734020433,
7339
+ "grad_norm": 16.375,
7340
+ "learning_rate": 9.94106568830812e-07,
7341
+ "loss": 24.9319,
7342
+ "step": 10430
7343
+ },
7344
+ {
7345
+ "epoch": 0.7550854353132628,
7346
+ "grad_norm": 15.859375,
7347
+ "learning_rate": 9.9410091836948e-07,
7348
+ "loss": 25.2626,
7349
+ "step": 10440
7350
+ },
7351
+ {
7352
+ "epoch": 0.7558086972244824,
7353
+ "grad_norm": 15.90625,
7354
+ "learning_rate": 9.940952679081483e-07,
7355
+ "loss": 25.184,
7356
+ "step": 10450
7357
+ },
7358
+ {
7359
+ "epoch": 0.756531959135702,
7360
+ "grad_norm": 16.515625,
7361
+ "learning_rate": 9.940896174468165e-07,
7362
+ "loss": 24.8847,
7363
+ "step": 10460
7364
+ },
7365
+ {
7366
+ "epoch": 0.7572552210469217,
7367
+ "grad_norm": 15.984375,
7368
+ "learning_rate": 9.940839669854845e-07,
7369
+ "loss": 24.8185,
7370
+ "step": 10470
7371
+ },
7372
+ {
7373
+ "epoch": 0.7579784829581412,
7374
+ "grad_norm": 16.109375,
7375
+ "learning_rate": 9.940783165241525e-07,
7376
+ "loss": 24.8107,
7377
+ "step": 10480
7378
+ },
7379
+ {
7380
+ "epoch": 0.7587017448693608,
7381
+ "grad_norm": 16.203125,
7382
+ "learning_rate": 9.940726660628207e-07,
7383
+ "loss": 25.0111,
7384
+ "step": 10490
7385
+ },
7386
+ {
7387
+ "epoch": 0.7594250067805804,
7388
+ "grad_norm": 16.140625,
7389
+ "learning_rate": 9.940670156014887e-07,
7390
+ "loss": 24.9658,
7391
+ "step": 10500
7392
+ },
7393
+ {
7394
+ "epoch": 0.7601482686918,
7395
+ "grad_norm": 16.71875,
7396
+ "learning_rate": 9.940613651401567e-07,
7397
+ "loss": 25.07,
7398
+ "step": 10510
7399
+ },
7400
+ {
7401
+ "epoch": 0.7608715306030196,
7402
+ "grad_norm": 16.734375,
7403
+ "learning_rate": 9.94055714678825e-07,
7404
+ "loss": 25.2403,
7405
+ "step": 10520
7406
+ },
7407
+ {
7408
+ "epoch": 0.7615947925142392,
7409
+ "grad_norm": 17.09375,
7410
+ "learning_rate": 9.940500642174931e-07,
7411
+ "loss": 25.0379,
7412
+ "step": 10530
7413
+ },
7414
+ {
7415
+ "epoch": 0.7623180544254589,
7416
+ "grad_norm": 15.59375,
7417
+ "learning_rate": 9.940444137561611e-07,
7418
+ "loss": 25.0897,
7419
+ "step": 10540
7420
+ },
7421
+ {
7422
+ "epoch": 0.7630413163366784,
7423
+ "grad_norm": 15.9453125,
7424
+ "learning_rate": 9.940387632948291e-07,
7425
+ "loss": 24.8206,
7426
+ "step": 10550
7427
+ },
7428
+ {
7429
+ "epoch": 0.763764578247898,
7430
+ "grad_norm": 16.015625,
7431
+ "learning_rate": 9.940331128334971e-07,
7432
+ "loss": 24.9639,
7433
+ "step": 10560
7434
+ },
7435
+ {
7436
+ "epoch": 0.7644878401591176,
7437
+ "grad_norm": 16.453125,
7438
+ "learning_rate": 9.940274623721653e-07,
7439
+ "loss": 24.956,
7440
+ "step": 10570
7441
+ },
7442
+ {
7443
+ "epoch": 0.7652111020703373,
7444
+ "grad_norm": 16.296875,
7445
+ "learning_rate": 9.940218119108335e-07,
7446
+ "loss": 24.5994,
7447
+ "step": 10580
7448
+ },
7449
+ {
7450
+ "epoch": 0.7659343639815568,
7451
+ "grad_norm": 16.765625,
7452
+ "learning_rate": 9.940161614495015e-07,
7453
+ "loss": 25.3306,
7454
+ "step": 10590
7455
+ },
7456
+ {
7457
+ "epoch": 0.7666576258927764,
7458
+ "grad_norm": 14.8671875,
7459
+ "learning_rate": 9.940105109881695e-07,
7460
+ "loss": 24.9966,
7461
+ "step": 10600
7462
+ },
7463
+ {
7464
+ "epoch": 0.767380887803996,
7465
+ "grad_norm": 15.875,
7466
+ "learning_rate": 9.940048605268375e-07,
7467
+ "loss": 24.8955,
7468
+ "step": 10610
7469
+ },
7470
+ {
7471
+ "epoch": 0.7681041497152156,
7472
+ "grad_norm": 15.6875,
7473
+ "learning_rate": 9.939992100655057e-07,
7474
+ "loss": 24.8444,
7475
+ "step": 10620
7476
+ },
7477
+ {
7478
+ "epoch": 0.7688274116264352,
7479
+ "grad_norm": 16.15625,
7480
+ "learning_rate": 9.93993559604174e-07,
7481
+ "loss": 24.8488,
7482
+ "step": 10630
7483
+ },
7484
+ {
7485
+ "epoch": 0.7695506735376548,
7486
+ "grad_norm": 15.875,
7487
+ "learning_rate": 9.93987909142842e-07,
7488
+ "loss": 24.765,
7489
+ "step": 10640
7490
+ },
7491
+ {
7492
+ "epoch": 0.7702739354488745,
7493
+ "grad_norm": 15.3203125,
7494
+ "learning_rate": 9.9398225868151e-07,
7495
+ "loss": 24.8543,
7496
+ "step": 10650
7497
+ },
7498
+ {
7499
+ "epoch": 0.770997197360094,
7500
+ "grad_norm": 16.1875,
7501
+ "learning_rate": 9.93976608220178e-07,
7502
+ "loss": 25.1015,
7503
+ "step": 10660
7504
+ },
7505
+ {
7506
+ "epoch": 0.7717204592713136,
7507
+ "grad_norm": 15.7421875,
7508
+ "learning_rate": 9.939709577588462e-07,
7509
+ "loss": 24.7436,
7510
+ "step": 10670
7511
+ },
7512
+ {
7513
+ "epoch": 0.7724437211825332,
7514
+ "grad_norm": 16.328125,
7515
+ "learning_rate": 9.939653072975144e-07,
7516
+ "loss": 24.8432,
7517
+ "step": 10680
7518
+ },
7519
+ {
7520
+ "epoch": 0.7731669830937529,
7521
+ "grad_norm": 16.453125,
7522
+ "learning_rate": 9.939596568361824e-07,
7523
+ "loss": 25.0282,
7524
+ "step": 10690
7525
+ },
7526
+ {
7527
+ "epoch": 0.7738902450049724,
7528
+ "grad_norm": 17.578125,
7529
+ "learning_rate": 9.939540063748504e-07,
7530
+ "loss": 25.3524,
7531
+ "step": 10700
7532
+ },
7533
+ {
7534
+ "epoch": 0.774613506916192,
7535
+ "grad_norm": 16.484375,
7536
+ "learning_rate": 9.939483559135186e-07,
7537
+ "loss": 25.2524,
7538
+ "step": 10710
7539
+ },
7540
+ {
7541
+ "epoch": 0.7753367688274116,
7542
+ "grad_norm": 15.453125,
7543
+ "learning_rate": 9.939427054521866e-07,
7544
+ "loss": 25.0613,
7545
+ "step": 10720
7546
+ },
7547
+ {
7548
+ "epoch": 0.7760600307386313,
7549
+ "grad_norm": 16.8125,
7550
+ "learning_rate": 9.939370549908548e-07,
7551
+ "loss": 24.856,
7552
+ "step": 10730
7553
+ },
7554
+ {
7555
+ "epoch": 0.7767832926498508,
7556
+ "grad_norm": 15.7890625,
7557
+ "learning_rate": 9.939314045295228e-07,
7558
+ "loss": 24.7389,
7559
+ "step": 10740
7560
+ },
7561
+ {
7562
+ "epoch": 0.7775065545610704,
7563
+ "grad_norm": 17.296875,
7564
+ "learning_rate": 9.93925754068191e-07,
7565
+ "loss": 25.1935,
7566
+ "step": 10750
7567
+ },
7568
+ {
7569
+ "epoch": 0.7782298164722901,
7570
+ "grad_norm": 16.84375,
7571
+ "learning_rate": 9.93920103606859e-07,
7572
+ "loss": 24.871,
7573
+ "step": 10760
7574
+ },
7575
+ {
7576
+ "epoch": 0.7789530783835096,
7577
+ "grad_norm": 17.3125,
7578
+ "learning_rate": 9.93914453145527e-07,
7579
+ "loss": 25.2816,
7580
+ "step": 10770
7581
+ },
7582
+ {
7583
+ "epoch": 0.7796763402947292,
7584
+ "grad_norm": 16.09375,
7585
+ "learning_rate": 9.939088026841952e-07,
7586
+ "loss": 24.941,
7587
+ "step": 10780
7588
+ },
7589
+ {
7590
+ "epoch": 0.7803996022059488,
7591
+ "grad_norm": 15.625,
7592
+ "learning_rate": 9.939031522228632e-07,
7593
+ "loss": 24.8713,
7594
+ "step": 10790
7595
+ },
7596
+ {
7597
+ "epoch": 0.7811228641171685,
7598
+ "grad_norm": 15.84375,
7599
+ "learning_rate": 9.938975017615314e-07,
7600
+ "loss": 25.2026,
7601
+ "step": 10800
7602
+ },
7603
+ {
7604
+ "epoch": 0.781846126028388,
7605
+ "grad_norm": 17.234375,
7606
+ "learning_rate": 9.938918513001994e-07,
7607
+ "loss": 25.0097,
7608
+ "step": 10810
7609
+ },
7610
+ {
7611
+ "epoch": 0.7825693879396076,
7612
+ "grad_norm": 16.6875,
7613
+ "learning_rate": 9.938862008388674e-07,
7614
+ "loss": 24.8178,
7615
+ "step": 10820
7616
+ },
7617
+ {
7618
+ "epoch": 0.7832926498508272,
7619
+ "grad_norm": 16.015625,
7620
+ "learning_rate": 9.938805503775354e-07,
7621
+ "loss": 25.0497,
7622
+ "step": 10830
7623
+ },
7624
+ {
7625
+ "epoch": 0.7840159117620469,
7626
+ "grad_norm": 15.7734375,
7627
+ "learning_rate": 9.938748999162036e-07,
7628
+ "loss": 25.0357,
7629
+ "step": 10840
7630
+ },
7631
+ {
7632
+ "epoch": 0.7847391736732664,
7633
+ "grad_norm": 16.40625,
7634
+ "learning_rate": 9.938692494548718e-07,
7635
+ "loss": 24.9942,
7636
+ "step": 10850
7637
+ },
7638
+ {
7639
+ "epoch": 0.785462435584486,
7640
+ "grad_norm": 15.9140625,
7641
+ "learning_rate": 9.938635989935398e-07,
7642
+ "loss": 25.0035,
7643
+ "step": 10860
7644
+ },
7645
+ {
7646
+ "epoch": 0.7861856974957057,
7647
+ "grad_norm": 16.390625,
7648
+ "learning_rate": 9.938579485322078e-07,
7649
+ "loss": 25.1237,
7650
+ "step": 10870
7651
+ },
7652
+ {
7653
+ "epoch": 0.7869089594069252,
7654
+ "grad_norm": 16.125,
7655
+ "learning_rate": 9.93852298070876e-07,
7656
+ "loss": 24.69,
7657
+ "step": 10880
7658
+ },
7659
+ {
7660
+ "epoch": 0.7876322213181448,
7661
+ "grad_norm": 16.625,
7662
+ "learning_rate": 9.93846647609544e-07,
7663
+ "loss": 25.2109,
7664
+ "step": 10890
7665
+ },
7666
+ {
7667
+ "epoch": 0.7883554832293644,
7668
+ "grad_norm": 16.0625,
7669
+ "learning_rate": 9.938409971482122e-07,
7670
+ "loss": 25.0951,
7671
+ "step": 10900
7672
+ },
7673
+ {
7674
+ "epoch": 0.7890787451405841,
7675
+ "grad_norm": 17.34375,
7676
+ "learning_rate": 9.938353466868802e-07,
7677
+ "loss": 24.9796,
7678
+ "step": 10910
7679
+ },
7680
+ {
7681
+ "epoch": 0.7898020070518036,
7682
+ "grad_norm": 16.75,
7683
+ "learning_rate": 9.938296962255482e-07,
7684
+ "loss": 25.2829,
7685
+ "step": 10920
7686
+ },
7687
+ {
7688
+ "epoch": 0.7905252689630232,
7689
+ "grad_norm": 15.9609375,
7690
+ "learning_rate": 9.938240457642164e-07,
7691
+ "loss": 25.1632,
7692
+ "step": 10930
7693
+ },
7694
+ {
7695
+ "epoch": 0.7912485308742429,
7696
+ "grad_norm": 15.8984375,
7697
+ "learning_rate": 9.938183953028844e-07,
7698
+ "loss": 24.7993,
7699
+ "step": 10940
7700
+ },
7701
+ {
7702
+ "epoch": 0.7919717927854625,
7703
+ "grad_norm": 15.625,
7704
+ "learning_rate": 9.938127448415526e-07,
7705
+ "loss": 25.284,
7706
+ "step": 10950
7707
+ },
7708
+ {
7709
+ "epoch": 0.792695054696682,
7710
+ "grad_norm": 15.9609375,
7711
+ "learning_rate": 9.938070943802206e-07,
7712
+ "loss": 24.979,
7713
+ "step": 10960
7714
+ },
7715
+ {
7716
+ "epoch": 0.7934183166079016,
7717
+ "grad_norm": 15.3984375,
7718
+ "learning_rate": 9.938014439188888e-07,
7719
+ "loss": 24.8737,
7720
+ "step": 10970
7721
+ },
7722
+ {
7723
+ "epoch": 0.7941415785191213,
7724
+ "grad_norm": 16.609375,
7725
+ "learning_rate": 9.937957934575568e-07,
7726
+ "loss": 25.1861,
7727
+ "step": 10980
7728
+ },
7729
+ {
7730
+ "epoch": 0.7948648404303409,
7731
+ "grad_norm": 17.25,
7732
+ "learning_rate": 9.937901429962248e-07,
7733
+ "loss": 25.0383,
7734
+ "step": 10990
7735
+ },
7736
+ {
7737
+ "epoch": 0.7955881023415604,
7738
+ "grad_norm": 15.5234375,
7739
+ "learning_rate": 9.93784492534893e-07,
7740
+ "loss": 24.717,
7741
+ "step": 11000
7742
+ },
7743
+ {
7744
+ "epoch": 0.79631136425278,
7745
+ "grad_norm": 17.3125,
7746
+ "learning_rate": 9.93778842073561e-07,
7747
+ "loss": 25.3394,
7748
+ "step": 11010
7749
+ },
7750
+ {
7751
+ "epoch": 0.7970346261639997,
7752
+ "grad_norm": 16.25,
7753
+ "learning_rate": 9.937731916122292e-07,
7754
+ "loss": 24.9355,
7755
+ "step": 11020
7756
+ },
7757
+ {
7758
+ "epoch": 0.7977578880752192,
7759
+ "grad_norm": 16.515625,
7760
+ "learning_rate": 9.937675411508972e-07,
7761
+ "loss": 25.1295,
7762
+ "step": 11030
7763
+ },
7764
+ {
7765
+ "epoch": 0.7984811499864388,
7766
+ "grad_norm": 16.28125,
7767
+ "learning_rate": 9.937618906895652e-07,
7768
+ "loss": 25.0645,
7769
+ "step": 11040
7770
+ },
7771
+ {
7772
+ "epoch": 0.7992044118976585,
7773
+ "grad_norm": 15.9140625,
7774
+ "learning_rate": 9.937562402282334e-07,
7775
+ "loss": 25.1427,
7776
+ "step": 11050
7777
+ },
7778
+ {
7779
+ "epoch": 0.7999276738088781,
7780
+ "grad_norm": 15.4765625,
7781
+ "learning_rate": 9.937505897669014e-07,
7782
+ "loss": 24.9404,
7783
+ "step": 11060
7784
+ },
7785
+ {
7786
+ "epoch": 0.8006509357200976,
7787
+ "grad_norm": 16.546875,
7788
+ "learning_rate": 9.937449393055696e-07,
7789
+ "loss": 25.1138,
7790
+ "step": 11070
7791
+ },
7792
+ {
7793
+ "epoch": 0.8013741976313172,
7794
+ "grad_norm": 17.078125,
7795
+ "learning_rate": 9.937392888442376e-07,
7796
+ "loss": 24.6309,
7797
+ "step": 11080
7798
+ },
7799
+ {
7800
+ "epoch": 0.8020974595425369,
7801
+ "grad_norm": 15.9921875,
7802
+ "learning_rate": 9.937336383829056e-07,
7803
+ "loss": 24.6495,
7804
+ "step": 11090
7805
+ },
7806
+ {
7807
+ "epoch": 0.8028207214537565,
7808
+ "grad_norm": 15.671875,
7809
+ "learning_rate": 9.937279879215739e-07,
7810
+ "loss": 25.0942,
7811
+ "step": 11100
7812
+ },
7813
+ {
7814
+ "epoch": 0.803543983364976,
7815
+ "grad_norm": 16.9375,
7816
+ "learning_rate": 9.937223374602419e-07,
7817
+ "loss": 25.162,
7818
+ "step": 11110
7819
+ },
7820
+ {
7821
+ "epoch": 0.8042672452761956,
7822
+ "grad_norm": 16.8125,
7823
+ "learning_rate": 9.9371668699891e-07,
7824
+ "loss": 25.1301,
7825
+ "step": 11120
7826
+ },
7827
+ {
7828
+ "epoch": 0.8049905071874153,
7829
+ "grad_norm": 16.171875,
7830
+ "learning_rate": 9.93711036537578e-07,
7831
+ "loss": 24.9047,
7832
+ "step": 11130
7833
+ },
7834
+ {
7835
+ "epoch": 0.8057137690986348,
7836
+ "grad_norm": 16.109375,
7837
+ "learning_rate": 9.93705386076246e-07,
7838
+ "loss": 24.9905,
7839
+ "step": 11140
7840
+ },
7841
+ {
7842
+ "epoch": 0.8064370310098544,
7843
+ "grad_norm": 15.90625,
7844
+ "learning_rate": 9.936997356149143e-07,
7845
+ "loss": 25.4194,
7846
+ "step": 11150
7847
+ },
7848
+ {
7849
+ "epoch": 0.807160292921074,
7850
+ "grad_norm": 16.328125,
7851
+ "learning_rate": 9.936940851535823e-07,
7852
+ "loss": 25.2549,
7853
+ "step": 11160
7854
+ },
7855
+ {
7856
+ "epoch": 0.8078835548322937,
7857
+ "grad_norm": 16.46875,
7858
+ "learning_rate": 9.936884346922505e-07,
7859
+ "loss": 25.3997,
7860
+ "step": 11170
7861
+ },
7862
+ {
7863
+ "epoch": 0.8086068167435132,
7864
+ "grad_norm": 16.953125,
7865
+ "learning_rate": 9.936827842309185e-07,
7866
+ "loss": 25.3272,
7867
+ "step": 11180
7868
+ },
7869
+ {
7870
+ "epoch": 0.8093300786547328,
7871
+ "grad_norm": 16.609375,
7872
+ "learning_rate": 9.936771337695867e-07,
7873
+ "loss": 25.0086,
7874
+ "step": 11190
7875
+ },
7876
+ {
7877
+ "epoch": 0.8100533405659525,
7878
+ "grad_norm": 15.6796875,
7879
+ "learning_rate": 9.936714833082547e-07,
7880
+ "loss": 25.0924,
7881
+ "step": 11200
7882
+ },
7883
+ {
7884
+ "epoch": 0.8107766024771721,
7885
+ "grad_norm": 15.96875,
7886
+ "learning_rate": 9.936658328469227e-07,
7887
+ "loss": 25.0911,
7888
+ "step": 11210
7889
+ },
7890
+ {
7891
+ "epoch": 0.8114998643883916,
7892
+ "grad_norm": 15.765625,
7893
+ "learning_rate": 9.936601823855909e-07,
7894
+ "loss": 24.9176,
7895
+ "step": 11220
7896
+ },
7897
+ {
7898
+ "epoch": 0.8122231262996112,
7899
+ "grad_norm": 15.796875,
7900
+ "learning_rate": 9.936545319242589e-07,
7901
+ "loss": 25.1171,
7902
+ "step": 11230
7903
+ },
7904
+ {
7905
+ "epoch": 0.8129463882108309,
7906
+ "grad_norm": 16.109375,
7907
+ "learning_rate": 9.93648881462927e-07,
7908
+ "loss": 24.7524,
7909
+ "step": 11240
7910
+ },
7911
+ {
7912
+ "epoch": 0.8136696501220505,
7913
+ "grad_norm": 16.625,
7914
+ "learning_rate": 9.93643231001595e-07,
7915
+ "loss": 25.3103,
7916
+ "step": 11250
7917
+ },
7918
+ {
7919
+ "epoch": 0.81439291203327,
7920
+ "grad_norm": 16.046875,
7921
+ "learning_rate": 9.93637580540263e-07,
7922
+ "loss": 24.9071,
7923
+ "step": 11260
7924
+ },
7925
+ {
7926
+ "epoch": 0.8151161739444897,
7927
+ "grad_norm": 17.609375,
7928
+ "learning_rate": 9.936319300789313e-07,
7929
+ "loss": 24.9546,
7930
+ "step": 11270
7931
+ },
7932
+ {
7933
+ "epoch": 0.8158394358557093,
7934
+ "grad_norm": 16.0,
7935
+ "learning_rate": 9.936262796175993e-07,
7936
+ "loss": 25.1575,
7937
+ "step": 11280
7938
+ },
7939
+ {
7940
+ "epoch": 0.8165626977669288,
7941
+ "grad_norm": 16.21875,
7942
+ "learning_rate": 9.936206291562675e-07,
7943
+ "loss": 25.1307,
7944
+ "step": 11290
7945
+ },
7946
+ {
7947
+ "epoch": 0.8172859596781484,
7948
+ "grad_norm": 15.453125,
7949
+ "learning_rate": 9.936149786949355e-07,
7950
+ "loss": 25.3439,
7951
+ "step": 11300
7952
+ },
7953
+ {
7954
+ "epoch": 0.8180092215893681,
7955
+ "grad_norm": 15.75,
7956
+ "learning_rate": 9.936093282336035e-07,
7957
+ "loss": 25.0032,
7958
+ "step": 11310
7959
+ },
7960
+ {
7961
+ "epoch": 0.8187324835005877,
7962
+ "grad_norm": 16.375,
7963
+ "learning_rate": 9.936036777722717e-07,
7964
+ "loss": 24.8905,
7965
+ "step": 11320
7966
+ },
7967
+ {
7968
+ "epoch": 0.8194557454118072,
7969
+ "grad_norm": 16.90625,
7970
+ "learning_rate": 9.935980273109397e-07,
7971
+ "loss": 24.6439,
7972
+ "step": 11330
7973
+ },
7974
+ {
7975
+ "epoch": 0.8201790073230268,
7976
+ "grad_norm": 17.0625,
7977
+ "learning_rate": 9.93592376849608e-07,
7978
+ "loss": 24.9961,
7979
+ "step": 11340
7980
+ },
7981
+ {
7982
+ "epoch": 0.8209022692342465,
7983
+ "grad_norm": 16.3125,
7984
+ "learning_rate": 9.93586726388276e-07,
7985
+ "loss": 24.9475,
7986
+ "step": 11350
7987
+ },
7988
+ {
7989
+ "epoch": 0.8216255311454661,
7990
+ "grad_norm": 15.796875,
7991
+ "learning_rate": 9.93581075926944e-07,
7992
+ "loss": 25.1387,
7993
+ "step": 11360
7994
+ },
7995
+ {
7996
+ "epoch": 0.8223487930566856,
7997
+ "grad_norm": 16.15625,
7998
+ "learning_rate": 9.935754254656121e-07,
7999
+ "loss": 24.6738,
8000
+ "step": 11370
8001
+ },
8002
+ {
8003
+ "epoch": 0.8230720549679053,
8004
+ "grad_norm": 16.40625,
8005
+ "learning_rate": 9.935697750042801e-07,
8006
+ "loss": 25.1657,
8007
+ "step": 11380
8008
+ },
8009
+ {
8010
+ "epoch": 0.8237953168791249,
8011
+ "grad_norm": 15.8828125,
8012
+ "learning_rate": 9.935641245429483e-07,
8013
+ "loss": 24.9072,
8014
+ "step": 11390
8015
+ },
8016
+ {
8017
+ "epoch": 0.8245185787903444,
8018
+ "grad_norm": 16.625,
8019
+ "learning_rate": 9.935584740816163e-07,
8020
+ "loss": 24.8184,
8021
+ "step": 11400
8022
+ },
8023
+ {
8024
+ "epoch": 0.825241840701564,
8025
+ "grad_norm": 15.6796875,
8026
+ "learning_rate": 9.935528236202845e-07,
8027
+ "loss": 25.093,
8028
+ "step": 11410
8029
+ },
8030
+ {
8031
+ "epoch": 0.8259651026127837,
8032
+ "grad_norm": 18.828125,
8033
+ "learning_rate": 9.935471731589525e-07,
8034
+ "loss": 24.5455,
8035
+ "step": 11420
8036
+ },
8037
+ {
8038
+ "epoch": 0.8266883645240033,
8039
+ "grad_norm": 16.640625,
8040
+ "learning_rate": 9.935415226976205e-07,
8041
+ "loss": 24.9237,
8042
+ "step": 11430
8043
+ },
8044
+ {
8045
+ "epoch": 0.8274116264352228,
8046
+ "grad_norm": 17.109375,
8047
+ "learning_rate": 9.935358722362887e-07,
8048
+ "loss": 25.1441,
8049
+ "step": 11440
8050
+ },
8051
+ {
8052
+ "epoch": 0.8281348883464424,
8053
+ "grad_norm": 16.0625,
8054
+ "learning_rate": 9.935302217749567e-07,
8055
+ "loss": 24.7008,
8056
+ "step": 11450
8057
+ },
8058
+ {
8059
+ "epoch": 0.8288581502576621,
8060
+ "grad_norm": 16.296875,
8061
+ "learning_rate": 9.93524571313625e-07,
8062
+ "loss": 24.9362,
8063
+ "step": 11460
8064
+ },
8065
+ {
8066
+ "epoch": 0.8295814121688817,
8067
+ "grad_norm": 16.234375,
8068
+ "learning_rate": 9.93518920852293e-07,
8069
+ "loss": 24.7211,
8070
+ "step": 11470
8071
+ },
8072
+ {
8073
+ "epoch": 0.8303046740801012,
8074
+ "grad_norm": 16.71875,
8075
+ "learning_rate": 9.93513270390961e-07,
8076
+ "loss": 24.6513,
8077
+ "step": 11480
8078
+ },
8079
+ {
8080
+ "epoch": 0.8310279359913209,
8081
+ "grad_norm": 15.5390625,
8082
+ "learning_rate": 9.935076199296291e-07,
8083
+ "loss": 24.7759,
8084
+ "step": 11490
8085
+ },
8086
+ {
8087
+ "epoch": 0.8317511979025405,
8088
+ "grad_norm": 18.140625,
8089
+ "learning_rate": 9.935019694682971e-07,
8090
+ "loss": 25.0801,
8091
+ "step": 11500
8092
+ },
8093
+ {
8094
+ "epoch": 0.83247445981376,
8095
+ "grad_norm": 16.453125,
8096
+ "learning_rate": 9.934963190069653e-07,
8097
+ "loss": 25.0655,
8098
+ "step": 11510
8099
+ },
8100
+ {
8101
+ "epoch": 0.8331977217249796,
8102
+ "grad_norm": 16.0625,
8103
+ "learning_rate": 9.934906685456333e-07,
8104
+ "loss": 25.2843,
8105
+ "step": 11520
8106
+ },
8107
+ {
8108
+ "epoch": 0.8339209836361993,
8109
+ "grad_norm": 15.15625,
8110
+ "learning_rate": 9.934850180843013e-07,
8111
+ "loss": 25.1964,
8112
+ "step": 11530
8113
+ },
8114
+ {
8115
+ "epoch": 0.8346442455474189,
8116
+ "grad_norm": 15.7734375,
8117
+ "learning_rate": 9.934793676229696e-07,
8118
+ "loss": 24.7941,
8119
+ "step": 11540
8120
+ },
8121
+ {
8122
+ "epoch": 0.8353675074586384,
8123
+ "grad_norm": 16.21875,
8124
+ "learning_rate": 9.934737171616375e-07,
8125
+ "loss": 24.7874,
8126
+ "step": 11550
8127
+ },
8128
+ {
8129
+ "epoch": 0.836090769369858,
8130
+ "grad_norm": 16.34375,
8131
+ "learning_rate": 9.934680667003058e-07,
8132
+ "loss": 25.2798,
8133
+ "step": 11560
8134
+ },
8135
+ {
8136
+ "epoch": 0.8368140312810777,
8137
+ "grad_norm": 15.859375,
8138
+ "learning_rate": 9.934624162389738e-07,
8139
+ "loss": 25.0314,
8140
+ "step": 11570
8141
+ },
8142
+ {
8143
+ "epoch": 0.8375372931922973,
8144
+ "grad_norm": 15.765625,
8145
+ "learning_rate": 9.93456765777642e-07,
8146
+ "loss": 25.0424,
8147
+ "step": 11580
8148
+ },
8149
+ {
8150
+ "epoch": 0.8382605551035168,
8151
+ "grad_norm": 15.9921875,
8152
+ "learning_rate": 9.9345111531631e-07,
8153
+ "loss": 25.1185,
8154
+ "step": 11590
8155
+ },
8156
+ {
8157
+ "epoch": 0.8389838170147365,
8158
+ "grad_norm": 17.15625,
8159
+ "learning_rate": 9.93445464854978e-07,
8160
+ "loss": 25.1181,
8161
+ "step": 11600
8162
+ },
8163
+ {
8164
+ "epoch": 0.8397070789259561,
8165
+ "grad_norm": 15.8828125,
8166
+ "learning_rate": 9.934398143936462e-07,
8167
+ "loss": 24.7918,
8168
+ "step": 11610
8169
+ },
8170
+ {
8171
+ "epoch": 0.8404303408371757,
8172
+ "grad_norm": 16.765625,
8173
+ "learning_rate": 9.934341639323142e-07,
8174
+ "loss": 25.0707,
8175
+ "step": 11620
8176
+ },
8177
+ {
8178
+ "epoch": 0.8411536027483952,
8179
+ "grad_norm": 17.9375,
8180
+ "learning_rate": 9.934285134709824e-07,
8181
+ "loss": 25.2668,
8182
+ "step": 11630
8183
+ },
8184
+ {
8185
+ "epoch": 0.8418768646596149,
8186
+ "grad_norm": 17.828125,
8187
+ "learning_rate": 9.934228630096504e-07,
8188
+ "loss": 25.2317,
8189
+ "step": 11640
8190
+ },
8191
+ {
8192
+ "epoch": 0.8426001265708345,
8193
+ "grad_norm": 16.203125,
8194
+ "learning_rate": 9.934172125483184e-07,
8195
+ "loss": 24.7235,
8196
+ "step": 11650
8197
+ },
8198
+ {
8199
+ "epoch": 0.843323388482054,
8200
+ "grad_norm": 15.109375,
8201
+ "learning_rate": 9.934115620869866e-07,
8202
+ "loss": 25.0079,
8203
+ "step": 11660
8204
+ },
8205
+ {
8206
+ "epoch": 0.8440466503932736,
8207
+ "grad_norm": 16.03125,
8208
+ "learning_rate": 9.934059116256546e-07,
8209
+ "loss": 24.8632,
8210
+ "step": 11670
8211
+ },
8212
+ {
8213
+ "epoch": 0.8447699123044933,
8214
+ "grad_norm": 17.25,
8215
+ "learning_rate": 9.934002611643228e-07,
8216
+ "loss": 25.1275,
8217
+ "step": 11680
8218
+ },
8219
+ {
8220
+ "epoch": 0.8454931742157129,
8221
+ "grad_norm": 15.296875,
8222
+ "learning_rate": 9.933946107029908e-07,
8223
+ "loss": 25.1413,
8224
+ "step": 11690
8225
+ },
8226
+ {
8227
+ "epoch": 0.8462164361269324,
8228
+ "grad_norm": 15.171875,
8229
+ "learning_rate": 9.933889602416588e-07,
8230
+ "loss": 25.3308,
8231
+ "step": 11700
8232
+ },
8233
+ {
8234
+ "epoch": 0.8469396980381521,
8235
+ "grad_norm": 16.671875,
8236
+ "learning_rate": 9.93383309780327e-07,
8237
+ "loss": 25.0381,
8238
+ "step": 11710
8239
+ },
8240
+ {
8241
+ "epoch": 0.8476629599493717,
8242
+ "grad_norm": 18.578125,
8243
+ "learning_rate": 9.93377659318995e-07,
8244
+ "loss": 25.043,
8245
+ "step": 11720
8246
+ },
8247
+ {
8248
+ "epoch": 0.8483862218605913,
8249
+ "grad_norm": 16.0625,
8250
+ "learning_rate": 9.933720088576632e-07,
8251
+ "loss": 25.1655,
8252
+ "step": 11730
8253
+ },
8254
+ {
8255
+ "epoch": 0.8491094837718108,
8256
+ "grad_norm": 15.890625,
8257
+ "learning_rate": 9.933663583963312e-07,
8258
+ "loss": 25.1387,
8259
+ "step": 11740
8260
+ },
8261
+ {
8262
+ "epoch": 0.8498327456830305,
8263
+ "grad_norm": 16.765625,
8264
+ "learning_rate": 9.933607079349992e-07,
8265
+ "loss": 25.1674,
8266
+ "step": 11750
8267
+ },
8268
+ {
8269
+ "epoch": 0.8505560075942501,
8270
+ "grad_norm": 16.59375,
8271
+ "learning_rate": 9.933550574736674e-07,
8272
+ "loss": 25.0077,
8273
+ "step": 11760
8274
+ },
8275
+ {
8276
+ "epoch": 0.8512792695054696,
8277
+ "grad_norm": 15.296875,
8278
+ "learning_rate": 9.933494070123354e-07,
8279
+ "loss": 25.0831,
8280
+ "step": 11770
8281
+ },
8282
+ {
8283
+ "epoch": 0.8520025314166892,
8284
+ "grad_norm": 16.71875,
8285
+ "learning_rate": 9.933437565510036e-07,
8286
+ "loss": 24.7517,
8287
+ "step": 11780
8288
+ },
8289
+ {
8290
+ "epoch": 0.8527257933279089,
8291
+ "grad_norm": 16.140625,
8292
+ "learning_rate": 9.933381060896716e-07,
8293
+ "loss": 24.8048,
8294
+ "step": 11790
8295
+ },
8296
+ {
8297
+ "epoch": 0.8534490552391285,
8298
+ "grad_norm": 16.078125,
8299
+ "learning_rate": 9.933324556283398e-07,
8300
+ "loss": 24.8567,
8301
+ "step": 11800
8302
+ },
8303
+ {
8304
+ "epoch": 0.854172317150348,
8305
+ "grad_norm": 15.828125,
8306
+ "learning_rate": 9.933268051670078e-07,
8307
+ "loss": 24.9475,
8308
+ "step": 11810
8309
+ },
8310
+ {
8311
+ "epoch": 0.8548955790615677,
8312
+ "grad_norm": 16.84375,
8313
+ "learning_rate": 9.933211547056758e-07,
8314
+ "loss": 25.2285,
8315
+ "step": 11820
8316
+ },
8317
+ {
8318
+ "epoch": 0.8556188409727873,
8319
+ "grad_norm": 16.21875,
8320
+ "learning_rate": 9.93315504244344e-07,
8321
+ "loss": 24.8682,
8322
+ "step": 11830
8323
+ },
8324
+ {
8325
+ "epoch": 0.8563421028840069,
8326
+ "grad_norm": 17.140625,
8327
+ "learning_rate": 9.93309853783012e-07,
8328
+ "loss": 24.9313,
8329
+ "step": 11840
8330
+ },
8331
+ {
8332
+ "epoch": 0.8570653647952264,
8333
+ "grad_norm": 16.828125,
8334
+ "learning_rate": 9.933042033216802e-07,
8335
+ "loss": 25.4222,
8336
+ "step": 11850
8337
+ },
8338
+ {
8339
+ "epoch": 0.8577886267064461,
8340
+ "grad_norm": 15.9453125,
8341
+ "learning_rate": 9.932985528603482e-07,
8342
+ "loss": 24.9957,
8343
+ "step": 11860
8344
+ },
8345
+ {
8346
+ "epoch": 0.8585118886176657,
8347
+ "grad_norm": 15.5859375,
8348
+ "learning_rate": 9.932929023990162e-07,
8349
+ "loss": 24.7224,
8350
+ "step": 11870
8351
+ },
8352
+ {
8353
+ "epoch": 0.8592351505288853,
8354
+ "grad_norm": 16.71875,
8355
+ "learning_rate": 9.932872519376844e-07,
8356
+ "loss": 25.2214,
8357
+ "step": 11880
8358
+ },
8359
+ {
8360
+ "epoch": 0.8599584124401048,
8361
+ "grad_norm": 16.375,
8362
+ "learning_rate": 9.932816014763526e-07,
8363
+ "loss": 24.9741,
8364
+ "step": 11890
8365
+ },
8366
+ {
8367
+ "epoch": 0.8606816743513245,
8368
+ "grad_norm": 15.109375,
8369
+ "learning_rate": 9.932759510150206e-07,
8370
+ "loss": 24.9836,
8371
+ "step": 11900
8372
+ },
8373
+ {
8374
+ "epoch": 0.8614049362625441,
8375
+ "grad_norm": 16.6875,
8376
+ "learning_rate": 9.932703005536886e-07,
8377
+ "loss": 25.1719,
8378
+ "step": 11910
8379
+ },
8380
+ {
8381
+ "epoch": 0.8621281981737636,
8382
+ "grad_norm": 16.59375,
8383
+ "learning_rate": 9.932646500923566e-07,
8384
+ "loss": 25.1345,
8385
+ "step": 11920
8386
+ },
8387
+ {
8388
+ "epoch": 0.8628514600849833,
8389
+ "grad_norm": 15.8828125,
8390
+ "learning_rate": 9.932589996310248e-07,
8391
+ "loss": 24.922,
8392
+ "step": 11930
8393
+ },
8394
+ {
8395
+ "epoch": 0.8635747219962029,
8396
+ "grad_norm": 15.5390625,
8397
+ "learning_rate": 9.93253349169693e-07,
8398
+ "loss": 25.3009,
8399
+ "step": 11940
8400
+ },
8401
+ {
8402
+ "epoch": 0.8642979839074225,
8403
+ "grad_norm": 17.015625,
8404
+ "learning_rate": 9.93247698708361e-07,
8405
+ "loss": 24.8167,
8406
+ "step": 11950
8407
+ },
8408
+ {
8409
+ "epoch": 0.865021245818642,
8410
+ "grad_norm": 16.078125,
8411
+ "learning_rate": 9.93242048247029e-07,
8412
+ "loss": 25.0831,
8413
+ "step": 11960
8414
+ },
8415
+ {
8416
+ "epoch": 0.8657445077298617,
8417
+ "grad_norm": 16.84375,
8418
+ "learning_rate": 9.93236397785697e-07,
8419
+ "loss": 25.184,
8420
+ "step": 11970
8421
+ },
8422
+ {
8423
+ "epoch": 0.8664677696410813,
8424
+ "grad_norm": 16.859375,
8425
+ "learning_rate": 9.932307473243652e-07,
8426
+ "loss": 25.1576,
8427
+ "step": 11980
8428
+ },
8429
+ {
8430
+ "epoch": 0.8671910315523009,
8431
+ "grad_norm": 14.8046875,
8432
+ "learning_rate": 9.932250968630332e-07,
8433
+ "loss": 24.9739,
8434
+ "step": 11990
8435
+ },
8436
+ {
8437
+ "epoch": 0.8679142934635204,
8438
+ "grad_norm": 15.7109375,
8439
+ "learning_rate": 9.932194464017015e-07,
8440
+ "loss": 24.9096,
8441
+ "step": 12000
8442
+ },
8443
+ {
8444
+ "epoch": 0.8686375553747401,
8445
+ "grad_norm": 17.125,
8446
+ "learning_rate": 9.932137959403695e-07,
8447
+ "loss": 25.1216,
8448
+ "step": 12010
8449
+ },
8450
+ {
8451
+ "epoch": 0.8693608172859597,
8452
+ "grad_norm": 15.578125,
8453
+ "learning_rate": 9.932081454790377e-07,
8454
+ "loss": 24.8823,
8455
+ "step": 12020
8456
+ },
8457
+ {
8458
+ "epoch": 0.8700840791971792,
8459
+ "grad_norm": 16.734375,
8460
+ "learning_rate": 9.932024950177057e-07,
8461
+ "loss": 24.9776,
8462
+ "step": 12030
8463
+ },
8464
+ {
8465
+ "epoch": 0.8708073411083989,
8466
+ "grad_norm": 16.203125,
8467
+ "learning_rate": 9.931968445563737e-07,
8468
+ "loss": 25.0451,
8469
+ "step": 12040
8470
+ },
8471
+ {
8472
+ "epoch": 0.8715306030196185,
8473
+ "grad_norm": 18.21875,
8474
+ "learning_rate": 9.931911940950419e-07,
8475
+ "loss": 25.0495,
8476
+ "step": 12050
8477
+ },
8478
+ {
8479
+ "epoch": 0.8722538649308381,
8480
+ "grad_norm": 16.09375,
8481
+ "learning_rate": 9.931855436337099e-07,
8482
+ "loss": 25.0868,
8483
+ "step": 12060
8484
+ },
8485
+ {
8486
+ "epoch": 0.8729771268420576,
8487
+ "grad_norm": 16.21875,
8488
+ "learning_rate": 9.93179893172378e-07,
8489
+ "loss": 24.8577,
8490
+ "step": 12070
8491
+ },
8492
+ {
8493
+ "epoch": 0.8737003887532773,
8494
+ "grad_norm": 16.71875,
8495
+ "learning_rate": 9.93174242711046e-07,
8496
+ "loss": 25.1413,
8497
+ "step": 12080
8498
+ },
8499
+ {
8500
+ "epoch": 0.8744236506644969,
8501
+ "grad_norm": 16.984375,
8502
+ "learning_rate": 9.93168592249714e-07,
8503
+ "loss": 25.1001,
8504
+ "step": 12090
8505
+ },
8506
+ {
8507
+ "epoch": 0.8751469125757165,
8508
+ "grad_norm": 16.9375,
8509
+ "learning_rate": 9.931629417883823e-07,
8510
+ "loss": 25.1148,
8511
+ "step": 12100
8512
+ },
8513
+ {
8514
+ "epoch": 0.875870174486936,
8515
+ "grad_norm": 16.65625,
8516
+ "learning_rate": 9.931572913270505e-07,
8517
+ "loss": 25.1793,
8518
+ "step": 12110
8519
+ },
8520
+ {
8521
+ "epoch": 0.8765934363981557,
8522
+ "grad_norm": 16.8125,
8523
+ "learning_rate": 9.931516408657185e-07,
8524
+ "loss": 25.0,
8525
+ "step": 12120
8526
+ },
8527
+ {
8528
+ "epoch": 0.8773166983093753,
8529
+ "grad_norm": 16.375,
8530
+ "learning_rate": 9.931459904043865e-07,
8531
+ "loss": 24.9269,
8532
+ "step": 12130
8533
+ },
8534
+ {
8535
+ "epoch": 0.8780399602205948,
8536
+ "grad_norm": 16.0,
8537
+ "learning_rate": 9.931403399430545e-07,
8538
+ "loss": 25.2554,
8539
+ "step": 12140
8540
+ },
8541
+ {
8542
+ "epoch": 0.8787632221318145,
8543
+ "grad_norm": 17.328125,
8544
+ "learning_rate": 9.931346894817227e-07,
8545
+ "loss": 25.126,
8546
+ "step": 12150
8547
+ },
8548
+ {
8549
+ "epoch": 0.8794864840430341,
8550
+ "grad_norm": 16.453125,
8551
+ "learning_rate": 9.93129039020391e-07,
8552
+ "loss": 24.8815,
8553
+ "step": 12160
8554
+ },
8555
+ {
8556
+ "epoch": 0.8802097459542537,
8557
+ "grad_norm": 16.15625,
8558
+ "learning_rate": 9.931233885590589e-07,
8559
+ "loss": 25.2028,
8560
+ "step": 12170
8561
+ },
8562
+ {
8563
+ "epoch": 0.8809330078654732,
8564
+ "grad_norm": 15.859375,
8565
+ "learning_rate": 9.931177380977269e-07,
8566
+ "loss": 24.8288,
8567
+ "step": 12180
8568
+ },
8569
+ {
8570
+ "epoch": 0.8816562697766929,
8571
+ "grad_norm": 15.5078125,
8572
+ "learning_rate": 9.931120876363949e-07,
8573
+ "loss": 25.1377,
8574
+ "step": 12190
8575
+ },
8576
+ {
8577
+ "epoch": 0.8823795316879125,
8578
+ "grad_norm": 16.28125,
8579
+ "learning_rate": 9.93106437175063e-07,
8580
+ "loss": 25.1025,
8581
+ "step": 12200
8582
+ },
8583
+ {
8584
+ "epoch": 0.8831027935991321,
8585
+ "grad_norm": 15.609375,
8586
+ "learning_rate": 9.931007867137313e-07,
8587
+ "loss": 24.8902,
8588
+ "step": 12210
8589
+ },
8590
+ {
8591
+ "epoch": 0.8838260555103516,
8592
+ "grad_norm": 16.4375,
8593
+ "learning_rate": 9.930951362523993e-07,
8594
+ "loss": 24.8283,
8595
+ "step": 12220
8596
+ },
8597
+ {
8598
+ "epoch": 0.8845493174215713,
8599
+ "grad_norm": 15.7421875,
8600
+ "learning_rate": 9.930894857910673e-07,
8601
+ "loss": 25.4166,
8602
+ "step": 12230
8603
+ },
8604
+ {
8605
+ "epoch": 0.8852725793327909,
8606
+ "grad_norm": 15.8359375,
8607
+ "learning_rate": 9.930838353297355e-07,
8608
+ "loss": 25.3919,
8609
+ "step": 12240
8610
+ },
8611
+ {
8612
+ "epoch": 0.8859958412440105,
8613
+ "grad_norm": 15.6640625,
8614
+ "learning_rate": 9.930781848684035e-07,
8615
+ "loss": 24.895,
8616
+ "step": 12250
8617
+ },
8618
+ {
8619
+ "epoch": 0.8867191031552301,
8620
+ "grad_norm": 16.359375,
8621
+ "learning_rate": 9.930725344070717e-07,
8622
+ "loss": 24.8067,
8623
+ "step": 12260
8624
+ },
8625
+ {
8626
+ "epoch": 0.8874423650664497,
8627
+ "grad_norm": 15.640625,
8628
+ "learning_rate": 9.930668839457397e-07,
8629
+ "loss": 24.9466,
8630
+ "step": 12270
8631
+ },
8632
+ {
8633
+ "epoch": 0.8881656269776693,
8634
+ "grad_norm": 16.375,
8635
+ "learning_rate": 9.930612334844077e-07,
8636
+ "loss": 24.8336,
8637
+ "step": 12280
8638
+ },
8639
+ {
8640
+ "epoch": 0.8888888888888888,
8641
+ "grad_norm": 16.1875,
8642
+ "learning_rate": 9.93055583023076e-07,
8643
+ "loss": 25.0462,
8644
+ "step": 12290
8645
+ },
8646
+ {
8647
+ "epoch": 0.8896121508001085,
8648
+ "grad_norm": 15.640625,
8649
+ "learning_rate": 9.93049932561744e-07,
8650
+ "loss": 25.2034,
8651
+ "step": 12300
8652
+ },
8653
+ {
8654
+ "epoch": 0.8903354127113281,
8655
+ "grad_norm": 17.0,
8656
+ "learning_rate": 9.930442821004121e-07,
8657
+ "loss": 24.9438,
8658
+ "step": 12310
8659
+ },
8660
+ {
8661
+ "epoch": 0.8910586746225477,
8662
+ "grad_norm": 16.125,
8663
+ "learning_rate": 9.930386316390801e-07,
8664
+ "loss": 24.8273,
8665
+ "step": 12320
8666
+ },
8667
+ {
8668
+ "epoch": 0.8917819365337672,
8669
+ "grad_norm": 16.90625,
8670
+ "learning_rate": 9.930329811777483e-07,
8671
+ "loss": 24.6906,
8672
+ "step": 12330
8673
+ },
8674
+ {
8675
+ "epoch": 0.8925051984449869,
8676
+ "grad_norm": 17.375,
8677
+ "learning_rate": 9.930273307164163e-07,
8678
+ "loss": 25.2003,
8679
+ "step": 12340
8680
+ },
8681
+ {
8682
+ "epoch": 0.8932284603562065,
8683
+ "grad_norm": 15.5078125,
8684
+ "learning_rate": 9.930216802550843e-07,
8685
+ "loss": 25.079,
8686
+ "step": 12350
8687
+ },
8688
+ {
8689
+ "epoch": 0.8939517222674261,
8690
+ "grad_norm": 16.328125,
8691
+ "learning_rate": 9.930160297937523e-07,
8692
+ "loss": 25.1027,
8693
+ "step": 12360
8694
+ },
8695
+ {
8696
+ "epoch": 0.8946749841786457,
8697
+ "grad_norm": 16.21875,
8698
+ "learning_rate": 9.930103793324205e-07,
8699
+ "loss": 24.7906,
8700
+ "step": 12370
8701
+ },
8702
+ {
8703
+ "epoch": 0.8953982460898653,
8704
+ "grad_norm": 16.96875,
8705
+ "learning_rate": 9.930047288710887e-07,
8706
+ "loss": 24.6628,
8707
+ "step": 12380
8708
+ },
8709
+ {
8710
+ "epoch": 0.8961215080010849,
8711
+ "grad_norm": 16.671875,
8712
+ "learning_rate": 9.929990784097567e-07,
8713
+ "loss": 25.1926,
8714
+ "step": 12390
8715
+ },
8716
+ {
8717
+ "epoch": 0.8968447699123044,
8718
+ "grad_norm": 16.0,
8719
+ "learning_rate": 9.929934279484247e-07,
8720
+ "loss": 24.9436,
8721
+ "step": 12400
8722
+ },
8723
+ {
8724
+ "epoch": 0.8975680318235241,
8725
+ "grad_norm": 17.109375,
8726
+ "learning_rate": 9.929877774870927e-07,
8727
+ "loss": 24.924,
8728
+ "step": 12410
8729
+ },
8730
+ {
8731
+ "epoch": 0.8982912937347437,
8732
+ "grad_norm": 17.46875,
8733
+ "learning_rate": 9.92982127025761e-07,
8734
+ "loss": 25.2271,
8735
+ "step": 12420
8736
+ },
8737
+ {
8738
+ "epoch": 0.8990145556459633,
8739
+ "grad_norm": 15.984375,
8740
+ "learning_rate": 9.929764765644292e-07,
8741
+ "loss": 25.0281,
8742
+ "step": 12430
8743
+ },
8744
+ {
8745
+ "epoch": 0.8997378175571829,
8746
+ "grad_norm": 16.90625,
8747
+ "learning_rate": 9.929708261030972e-07,
8748
+ "loss": 25.0888,
8749
+ "step": 12440
8750
+ },
8751
+ {
8752
+ "epoch": 0.9004610794684025,
8753
+ "grad_norm": 17.25,
8754
+ "learning_rate": 9.929651756417652e-07,
8755
+ "loss": 24.999,
8756
+ "step": 12450
8757
+ },
8758
+ {
8759
+ "epoch": 0.9011843413796221,
8760
+ "grad_norm": 15.921875,
8761
+ "learning_rate": 9.929595251804334e-07,
8762
+ "loss": 25.1128,
8763
+ "step": 12460
8764
+ },
8765
+ {
8766
+ "epoch": 0.9019076032908417,
8767
+ "grad_norm": 15.65625,
8768
+ "learning_rate": 9.929538747191014e-07,
8769
+ "loss": 25.0573,
8770
+ "step": 12470
8771
+ },
8772
+ {
8773
+ "epoch": 0.9026308652020613,
8774
+ "grad_norm": 15.96875,
8775
+ "learning_rate": 9.929482242577696e-07,
8776
+ "loss": 25.2602,
8777
+ "step": 12480
8778
+ },
8779
+ {
8780
+ "epoch": 0.9033541271132809,
8781
+ "grad_norm": 16.640625,
8782
+ "learning_rate": 9.929425737964376e-07,
8783
+ "loss": 25.3239,
8784
+ "step": 12490
8785
+ },
8786
+ {
8787
+ "epoch": 0.9040773890245005,
8788
+ "grad_norm": 15.6796875,
8789
+ "learning_rate": 9.929369233351058e-07,
8790
+ "loss": 24.9313,
8791
+ "step": 12500
8792
+ },
8793
+ {
8794
+ "epoch": 0.9040773890245005,
8795
+ "eval_loss": 1.58199143409729,
8796
+ "eval_runtime": 386.6428,
8797
+ "eval_samples_per_second": 1877.855,
8798
+ "eval_steps_per_second": 29.342,
8799
+ "step": 12500
8800
  }
8801
  ],
8802
  "logging_steps": 10,
 
8816
  "attributes": {}
8817
  }
8818
  },
8819
+ "total_flos": 1.746160695902208e+19,
8820
  "train_batch_size": 16,
8821
  "trial_name": null,
8822
  "trial_params": null