ErrorAI committed (verified)
Commit 606562d · 1 Parent(s): 645ab3e

Training in progress, step 1591, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:ee06df85ee9ce9db81e7204019b8f0a7f19eb8c293590e81a8cd39d8d277063b
+ oid sha256:51cdaf5086b7b82427fc4011767f9e88401ef3f81e40ca7f58262c5ab7307034
 size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:43f5803f8cb85771411fcb82c2f6b17260fe120ffcc052e76a092ca5d6a9d9c2
+ oid sha256:c5266ba2a3b68d481a8a34332907a152a3bda628f2def4c81022f64a8ce3d7cf
 size 43123028
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:6c1fb310658ff1f50fc54115ae9d4e78e7e3ea093c3cf9829c139ae7e4b61332
+ oid sha256:45bbba6724fdde45bd9c9bb5002e3746eb184509ea9f4097df5f5db4bd1cfe9f
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:199a0947f39b569d38d96929faf35d1973a1e9fa755293411efcc0912ca27ced
+ oid sha256:162c1ad25761862d9b407277c8cf0896f3fe6d2e22cd237b2351358ca76e54a4
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
 "best_metric": null,
 "best_model_checkpoint": null,
- "epoch": 0.41726367289882926,
 "eval_steps": 500,
- "global_step": 1194,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -8365,6 +8365,2785 @@
 "learning_rate": 1.4679694691917662e-05,
 "loss": 9.2163,
 "step": 1194
 }
 ],
 "logging_steps": 1,
@@ -8379,12 +11158,12 @@
 "should_evaluate": false,
 "should_log": false,
 "should_save": true,
- "should_training_stop": false
 },
 "attributes": {}
 }
 },
- "total_flos": 8.370750897688412e+17,
 "train_batch_size": 4,
 "trial_name": null,
 "trial_params": null
 
 {
 "best_metric": null,
 "best_model_checkpoint": null,
+ "epoch": 0.5560020968023763,
 "eval_steps": 500,
+ "global_step": 1591,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,

 "learning_rate": 1.4679694691917662e-05,
 "loss": 9.2163,
 "step": 1194
8368
+ },
8369
+ {
8370
+ "epoch": 0.41761313996155863,
8371
+ "grad_norm": 9.734679222106934,
8372
+ "learning_rate": 1.4609661870167868e-05,
8373
+ "loss": 8.4237,
8374
+ "step": 1195
8375
+ },
8376
+ {
8377
+ "epoch": 0.41796260702428795,
8378
+ "grad_norm": 10.375848770141602,
8379
+ "learning_rate": 1.4539767908670204e-05,
8380
+ "loss": 8.725,
8381
+ "step": 1196
8382
+ },
8383
+ {
8384
+ "epoch": 0.4183120740870173,
8385
+ "grad_norm": 10.524779319763184,
8386
+ "learning_rate": 1.447001308166599e-05,
8387
+ "loss": 9.8794,
8388
+ "step": 1197
8389
+ },
8390
+ {
8391
+ "epoch": 0.41866154114974663,
8392
+ "grad_norm": 9.33531379699707,
8393
+ "learning_rate": 1.4400397662850617e-05,
8394
+ "loss": 9.3512,
8395
+ "step": 1198
8396
+ },
8397
+ {
8398
+ "epoch": 0.419011008212476,
8399
+ "grad_norm": 14.353048324584961,
8400
+ "learning_rate": 1.4330921925372515e-05,
8401
+ "loss": 9.8691,
8402
+ "step": 1199
8403
+ },
8404
+ {
8405
+ "epoch": 0.4193604752752053,
8406
+ "grad_norm": 14.870552062988281,
8407
+ "learning_rate": 1.4261586141832012e-05,
8408
+ "loss": 9.7871,
8409
+ "step": 1200
8410
+ },
8411
+ {
8412
+ "epoch": 0.4197099423379346,
8413
+ "grad_norm": 4.2067365646362305,
8414
+ "learning_rate": 1.4192390584280346e-05,
8415
+ "loss": 8.1007,
8416
+ "step": 1201
8417
+ },
8418
+ {
8419
+ "epoch": 0.420059409400664,
8420
+ "grad_norm": 3.868065357208252,
8421
+ "learning_rate": 1.4123335524218494e-05,
8422
+ "loss": 8.1654,
8423
+ "step": 1202
8424
+ },
8425
+ {
8426
+ "epoch": 0.4204088764633933,
8427
+ "grad_norm": 5.0450520515441895,
8428
+ "learning_rate": 1.4054421232596204e-05,
8429
+ "loss": 8.7779,
8430
+ "step": 1203
8431
+ },
8432
+ {
8433
+ "epoch": 0.4207583435261227,
8434
+ "grad_norm": 3.8185129165649414,
8435
+ "learning_rate": 1.3985647979810912e-05,
8436
+ "loss": 8.9213,
8437
+ "step": 1204
8438
+ },
8439
+ {
8440
+ "epoch": 0.421107810588852,
8441
+ "grad_norm": 4.133491516113281,
8442
+ "learning_rate": 1.3917016035706631e-05,
8443
+ "loss": 8.9105,
8444
+ "step": 1205
8445
+ },
8446
+ {
8447
+ "epoch": 0.42145727765158136,
8448
+ "grad_norm": 6.560299396514893,
8449
+ "learning_rate": 1.3848525669572938e-05,
8450
+ "loss": 8.4676,
8451
+ "step": 1206
8452
+ },
8453
+ {
8454
+ "epoch": 0.4218067447143107,
8455
+ "grad_norm": 3.7674949169158936,
8456
+ "learning_rate": 1.3780177150143908e-05,
8457
+ "loss": 9.4156,
8458
+ "step": 1207
8459
+ },
8460
+ {
8461
+ "epoch": 0.42215621177704,
8462
+ "grad_norm": 5.045324325561523,
8463
+ "learning_rate": 1.371197074559702e-05,
8464
+ "loss": 8.7071,
8465
+ "step": 1208
8466
+ },
8467
+ {
8468
+ "epoch": 0.42250567883976936,
8469
+ "grad_norm": 5.599214553833008,
8470
+ "learning_rate": 1.3643906723552186e-05,
8471
+ "loss": 8.5113,
8472
+ "step": 1209
8473
+ },
8474
+ {
8475
+ "epoch": 0.4228551459024987,
8476
+ "grad_norm": 5.555638313293457,
8477
+ "learning_rate": 1.3575985351070636e-05,
8478
+ "loss": 9.3735,
8479
+ "step": 1210
8480
+ },
8481
+ {
8482
+ "epoch": 0.42320461296522804,
8483
+ "grad_norm": 6.305885314941406,
8484
+ "learning_rate": 1.3508206894653885e-05,
8485
+ "loss": 9.019,
8486
+ "step": 1211
8487
+ },
8488
+ {
8489
+ "epoch": 0.42355408002795736,
8490
+ "grad_norm": 6.226932525634766,
8491
+ "learning_rate": 1.3440571620242703e-05,
8492
+ "loss": 7.9253,
8493
+ "step": 1212
8494
+ },
8495
+ {
8496
+ "epoch": 0.4239035470906867,
8497
+ "grad_norm": 4.416421890258789,
8498
+ "learning_rate": 1.3373079793216036e-05,
8499
+ "loss": 8.9577,
8500
+ "step": 1213
8501
+ },
8502
+ {
8503
+ "epoch": 0.42425301415341604,
8504
+ "grad_norm": 7.161321640014648,
8505
+ "learning_rate": 1.3305731678390048e-05,
8506
+ "loss": 8.2947,
8507
+ "step": 1214
8508
+ },
8509
+ {
8510
+ "epoch": 0.42460248121614536,
8511
+ "grad_norm": 5.642546653747559,
8512
+ "learning_rate": 1.3238527540016942e-05,
8513
+ "loss": 8.6785,
8514
+ "step": 1215
8515
+ },
8516
+ {
8517
+ "epoch": 0.4249519482788747,
8518
+ "grad_norm": 5.648965358734131,
8519
+ "learning_rate": 1.3171467641784058e-05,
8520
+ "loss": 8.1202,
8521
+ "step": 1216
8522
+ },
8523
+ {
8524
+ "epoch": 0.42530141534160404,
8525
+ "grad_norm": 7.209357738494873,
8526
+ "learning_rate": 1.3104552246812768e-05,
8527
+ "loss": 8.3829,
8528
+ "step": 1217
8529
+ },
8530
+ {
8531
+ "epoch": 0.4256508824043334,
8532
+ "grad_norm": 5.8442254066467285,
8533
+ "learning_rate": 1.3037781617657457e-05,
8534
+ "loss": 9.7688,
8535
+ "step": 1218
8536
+ },
8537
+ {
8538
+ "epoch": 0.4260003494670627,
8539
+ "grad_norm": 6.313738822937012,
8540
+ "learning_rate": 1.2971156016304508e-05,
8541
+ "loss": 9.383,
8542
+ "step": 1219
8543
+ },
8544
+ {
8545
+ "epoch": 0.4263498165297921,
8546
+ "grad_norm": 5.820607662200928,
8547
+ "learning_rate": 1.2904675704171242e-05,
8548
+ "loss": 8.4141,
8549
+ "step": 1220
8550
+ },
8551
+ {
8552
+ "epoch": 0.4266992835925214,
8553
+ "grad_norm": 6.617226600646973,
8554
+ "learning_rate": 1.2838340942104943e-05,
8555
+ "loss": 9.2134,
8556
+ "step": 1221
8557
+ },
8558
+ {
8559
+ "epoch": 0.4270487506552507,
8560
+ "grad_norm": 5.748777866363525,
8561
+ "learning_rate": 1.2772151990381765e-05,
8562
+ "loss": 9.4157,
8563
+ "step": 1222
8564
+ },
8565
+ {
8566
+ "epoch": 0.4273982177179801,
8567
+ "grad_norm": 7.220109939575195,
8568
+ "learning_rate": 1.2706109108705777e-05,
8569
+ "loss": 8.8743,
8570
+ "step": 1223
8571
+ },
8572
+ {
8573
+ "epoch": 0.4277476847807094,
8574
+ "grad_norm": 9.637005805969238,
8575
+ "learning_rate": 1.264021255620791e-05,
8576
+ "loss": 9.4095,
8577
+ "step": 1224
8578
+ },
8579
+ {
8580
+ "epoch": 0.4280971518434388,
8581
+ "grad_norm": 7.984780788421631,
8582
+ "learning_rate": 1.257446259144494e-05,
8583
+ "loss": 8.9011,
8584
+ "step": 1225
8585
+ },
8586
+ {
8587
+ "epoch": 0.4284466189061681,
8588
+ "grad_norm": 7.144513130187988,
8589
+ "learning_rate": 1.250885947239851e-05,
8590
+ "loss": 8.8924,
8591
+ "step": 1226
8592
+ },
8593
+ {
8594
+ "epoch": 0.42879608596889746,
8595
+ "grad_norm": 8.714081764221191,
8596
+ "learning_rate": 1.2443403456474017e-05,
8597
+ "loss": 7.866,
8598
+ "step": 1227
8599
+ },
8600
+ {
8601
+ "epoch": 0.42914555303162677,
8602
+ "grad_norm": 7.528012275695801,
8603
+ "learning_rate": 1.2378094800499745e-05,
8604
+ "loss": 7.8886,
8605
+ "step": 1228
8606
+ },
8607
+ {
8608
+ "epoch": 0.4294950200943561,
8609
+ "grad_norm": 6.640905857086182,
8610
+ "learning_rate": 1.2312933760725754e-05,
8611
+ "loss": 9.0671,
8612
+ "step": 1229
8613
+ },
8614
+ {
8615
+ "epoch": 0.42984448715708545,
8616
+ "grad_norm": 8.708352088928223,
8617
+ "learning_rate": 1.2247920592822898e-05,
8618
+ "loss": 8.422,
8619
+ "step": 1230
8620
+ },
8621
+ {
8622
+ "epoch": 0.43019395421981477,
8623
+ "grad_norm": 7.973822116851807,
8624
+ "learning_rate": 1.2183055551881866e-05,
8625
+ "loss": 8.9585,
8626
+ "step": 1231
8627
+ },
8628
+ {
8629
+ "epoch": 0.43054342128254414,
8630
+ "grad_norm": 10.847684860229492,
8631
+ "learning_rate": 1.2118338892412129e-05,
8632
+ "loss": 8.1578,
8633
+ "step": 1232
8634
+ },
8635
+ {
8636
+ "epoch": 0.43089288834527345,
8637
+ "grad_norm": 9.344101905822754,
8638
+ "learning_rate": 1.2053770868340914e-05,
8639
+ "loss": 7.5936,
8640
+ "step": 1233
8641
+ },
8642
+ {
8643
+ "epoch": 0.4312423554080028,
8644
+ "grad_norm": 15.424657821655273,
8645
+ "learning_rate": 1.1989351733012299e-05,
8646
+ "loss": 8.9074,
8647
+ "step": 1234
8648
+ },
8649
+ {
8650
+ "epoch": 0.43159182247073213,
8651
+ "grad_norm": 8.517824172973633,
8652
+ "learning_rate": 1.1925081739186161e-05,
8653
+ "loss": 8.2857,
8654
+ "step": 1235
8655
+ },
8656
+ {
8657
+ "epoch": 0.43194128953346145,
8658
+ "grad_norm": 7.919290065765381,
8659
+ "learning_rate": 1.1860961139037191e-05,
8660
+ "loss": 9.2476,
8661
+ "step": 1236
8662
+ },
8663
+ {
8664
+ "epoch": 0.4322907565961908,
8665
+ "grad_norm": 8.285776138305664,
8666
+ "learning_rate": 1.17969901841539e-05,
8667
+ "loss": 8.3369,
8668
+ "step": 1237
8669
+ },
8670
+ {
8671
+ "epoch": 0.43264022365892013,
8672
+ "grad_norm": 10.17358684539795,
8673
+ "learning_rate": 1.1733169125537652e-05,
8674
+ "loss": 8.0393,
8675
+ "step": 1238
8676
+ },
8677
+ {
8678
+ "epoch": 0.4329896907216495,
8679
+ "grad_norm": 9.858621597290039,
8680
+ "learning_rate": 1.1669498213601653e-05,
8681
+ "loss": 9.0913,
8682
+ "step": 1239
8683
+ },
8684
+ {
8685
+ "epoch": 0.4333391577843788,
8686
+ "grad_norm": 10.003377914428711,
8687
+ "learning_rate": 1.1605977698170001e-05,
8688
+ "loss": 8.055,
8689
+ "step": 1240
8690
+ },
8691
+ {
8692
+ "epoch": 0.4336886248471082,
8693
+ "grad_norm": 12.729085922241211,
8694
+ "learning_rate": 1.1542607828476666e-05,
8695
+ "loss": 8.5227,
8696
+ "step": 1241
8697
+ },
8698
+ {
8699
+ "epoch": 0.4340380919098375,
8700
+ "grad_norm": 9.930816650390625,
8701
+ "learning_rate": 1.147938885316454e-05,
8702
+ "loss": 8.6561,
8703
+ "step": 1242
8704
+ },
8705
+ {
8706
+ "epoch": 0.4343875589725668,
8707
+ "grad_norm": 10.807096481323242,
8708
+ "learning_rate": 1.1416321020284459e-05,
8709
+ "loss": 9.1103,
8710
+ "step": 1243
8711
+ },
8712
+ {
8713
+ "epoch": 0.4347370260352962,
8714
+ "grad_norm": 8.538290977478027,
8715
+ "learning_rate": 1.135340457729423e-05,
8716
+ "loss": 9.6528,
8717
+ "step": 1244
8718
+ },
8719
+ {
8720
+ "epoch": 0.4350864930980255,
8721
+ "grad_norm": 9.432942390441895,
8722
+ "learning_rate": 1.1290639771057648e-05,
8723
+ "loss": 8.4473,
8724
+ "step": 1245
8725
+ },
8726
+ {
8727
+ "epoch": 0.43543596016075486,
8728
+ "grad_norm": 13.870254516601562,
8729
+ "learning_rate": 1.122802684784351e-05,
8730
+ "loss": 9.2547,
8731
+ "step": 1246
8732
+ },
8733
+ {
8734
+ "epoch": 0.4357854272234842,
8735
+ "grad_norm": 9.150087356567383,
8736
+ "learning_rate": 1.1165566053324699e-05,
8737
+ "loss": 8.9713,
8738
+ "step": 1247
8739
+ },
8740
+ {
8741
+ "epoch": 0.43613489428621355,
8742
+ "grad_norm": 11.067329406738281,
8743
+ "learning_rate": 1.1103257632577214e-05,
8744
+ "loss": 9.865,
8745
+ "step": 1248
8746
+ },
8747
+ {
8748
+ "epoch": 0.43648436134894286,
8749
+ "grad_norm": 11.416949272155762,
8750
+ "learning_rate": 1.1041101830079159e-05,
8751
+ "loss": 10.1585,
8752
+ "step": 1249
8753
+ },
8754
+ {
8755
+ "epoch": 0.4368338284116722,
8756
+ "grad_norm": 16.7679443359375,
8757
+ "learning_rate": 1.0979098889709815e-05,
8758
+ "loss": 11.3673,
8759
+ "step": 1250
8760
+ },
8761
+ {
8762
+ "epoch": 0.43718329547440155,
8763
+ "grad_norm": 3.188689708709717,
8764
+ "learning_rate": 1.0917249054748701e-05,
8765
+ "loss": 8.7176,
8766
+ "step": 1251
8767
+ },
8768
+ {
8769
+ "epoch": 0.43753276253713086,
8770
+ "grad_norm": 3.094052314758301,
8771
+ "learning_rate": 1.0855552567874561e-05,
8772
+ "loss": 8.8491,
8773
+ "step": 1252
8774
+ },
8775
+ {
8776
+ "epoch": 0.43788222959986023,
8777
+ "grad_norm": 3.567803382873535,
8778
+ "learning_rate": 1.0794009671164484e-05,
8779
+ "loss": 9.0748,
8780
+ "step": 1253
8781
+ },
8782
+ {
8783
+ "epoch": 0.43823169666258954,
8784
+ "grad_norm": 5.487453937530518,
8785
+ "learning_rate": 1.0732620606092919e-05,
8786
+ "loss": 8.4438,
8787
+ "step": 1254
8788
+ },
8789
+ {
8790
+ "epoch": 0.4385811637253189,
8791
+ "grad_norm": 4.8010945320129395,
8792
+ "learning_rate": 1.067138561353072e-05,
8793
+ "loss": 8.4275,
8794
+ "step": 1255
8795
+ },
8796
+ {
8797
+ "epoch": 0.4389306307880482,
8798
+ "grad_norm": 4.5531086921691895,
8799
+ "learning_rate": 1.0610304933744208e-05,
8800
+ "loss": 9.2627,
8801
+ "step": 1256
8802
+ },
8803
+ {
8804
+ "epoch": 0.43928009785077754,
8805
+ "grad_norm": 6.944797515869141,
8806
+ "learning_rate": 1.0549378806394239e-05,
8807
+ "loss": 7.2999,
8808
+ "step": 1257
8809
+ },
8810
+ {
8811
+ "epoch": 0.4396295649135069,
8812
+ "grad_norm": 4.913161277770996,
8813
+ "learning_rate": 1.048860747053525e-05,
8814
+ "loss": 8.9859,
8815
+ "step": 1258
8816
+ },
8817
+ {
8818
+ "epoch": 0.4399790319762362,
8819
+ "grad_norm": 7.656060218811035,
8820
+ "learning_rate": 1.042799116461433e-05,
8821
+ "loss": 6.9834,
8822
+ "step": 1259
8823
+ },
8824
+ {
8825
+ "epoch": 0.4403284990389656,
8826
+ "grad_norm": 18.099225997924805,
8827
+ "learning_rate": 1.0367530126470277e-05,
8828
+ "loss": 7.7927,
8829
+ "step": 1260
8830
+ },
8831
+ {
8832
+ "epoch": 0.4406779661016949,
8833
+ "grad_norm": 6.995333194732666,
8834
+ "learning_rate": 1.030722459333267e-05,
8835
+ "loss": 7.3382,
8836
+ "step": 1261
8837
+ },
8838
+ {
8839
+ "epoch": 0.4410274331644243,
8840
+ "grad_norm": 6.343902111053467,
8841
+ "learning_rate": 1.0247074801820933e-05,
8842
+ "loss": 7.6751,
8843
+ "step": 1262
8844
+ },
8845
+ {
8846
+ "epoch": 0.4413769002271536,
8847
+ "grad_norm": 8.965407371520996,
8848
+ "learning_rate": 1.0187080987943415e-05,
8849
+ "loss": 9.0194,
8850
+ "step": 1263
8851
+ },
8852
+ {
8853
+ "epoch": 0.4417263672898829,
8854
+ "grad_norm": 7.482065677642822,
8855
+ "learning_rate": 1.0127243387096459e-05,
8856
+ "loss": 7.8801,
8857
+ "step": 1264
8858
+ },
8859
+ {
8860
+ "epoch": 0.4420758343526123,
8861
+ "grad_norm": 5.747515678405762,
8862
+ "learning_rate": 1.0067562234063477e-05,
8863
+ "loss": 7.9927,
8864
+ "step": 1265
8865
+ },
8866
+ {
8867
+ "epoch": 0.4424253014153416,
8868
+ "grad_norm": 5.930813312530518,
8869
+ "learning_rate": 1.0008037763014032e-05,
8870
+ "loss": 8.7441,
8871
+ "step": 1266
8872
+ },
8873
+ {
8874
+ "epoch": 0.44277476847807096,
8875
+ "grad_norm": 5.099485397338867,
8876
+ "learning_rate": 9.948670207502907e-06,
8877
+ "loss": 9.0423,
8878
+ "step": 1267
8879
+ },
8880
+ {
8881
+ "epoch": 0.44312423554080027,
8882
+ "grad_norm": 8.179178237915039,
8883
+ "learning_rate": 9.889459800469219e-06,
8884
+ "loss": 9.2681,
8885
+ "step": 1268
8886
+ },
8887
+ {
8888
+ "epoch": 0.44347370260352964,
8889
+ "grad_norm": 8.752791404724121,
8890
+ "learning_rate": 9.830406774235473e-06,
8891
+ "loss": 8.8095,
8892
+ "step": 1269
8893
+ },
8894
+ {
8895
+ "epoch": 0.44382316966625895,
8896
+ "grad_norm": 9.359688758850098,
8897
+ "learning_rate": 9.771511360506668e-06,
8898
+ "loss": 7.6182,
8899
+ "step": 1270
8900
+ },
8901
+ {
8902
+ "epoch": 0.44417263672898827,
8903
+ "grad_norm": 6.538898468017578,
8904
+ "learning_rate": 9.712773790369361e-06,
8905
+ "loss": 8.9901,
8906
+ "step": 1271
8907
+ },
8908
+ {
8909
+ "epoch": 0.44452210379171764,
8910
+ "grad_norm": 11.179584503173828,
8911
+ "learning_rate": 9.654194294290808e-06,
8912
+ "loss": 8.3577,
8913
+ "step": 1272
8914
+ },
8915
+ {
8916
+ "epoch": 0.44487157085444695,
8917
+ "grad_norm": 8.987554550170898,
8918
+ "learning_rate": 9.595773102118023e-06,
8919
+ "loss": 8.6319,
8920
+ "step": 1273
8921
+ },
8922
+ {
8923
+ "epoch": 0.4452210379171763,
8924
+ "grad_norm": 8.752106666564941,
8925
+ "learning_rate": 9.53751044307688e-06,
8926
+ "loss": 8.6063,
8927
+ "step": 1274
8928
+ },
8929
+ {
8930
+ "epoch": 0.44557050497990563,
8931
+ "grad_norm": 9.07926082611084,
8932
+ "learning_rate": 9.479406545771252e-06,
8933
+ "loss": 9.227,
8934
+ "step": 1275
8935
+ },
8936
+ {
8937
+ "epoch": 0.445919972042635,
8938
+ "grad_norm": 6.2154974937438965,
8939
+ "learning_rate": 9.421461638182056e-06,
8940
+ "loss": 8.815,
8941
+ "step": 1276
8942
+ },
8943
+ {
8944
+ "epoch": 0.4462694391053643,
8945
+ "grad_norm": 7.014590740203857,
8946
+ "learning_rate": 9.363675947666362e-06,
8947
+ "loss": 8.4861,
8948
+ "step": 1277
8949
+ },
8950
+ {
8951
+ "epoch": 0.44661890616809363,
8952
+ "grad_norm": 6.4154462814331055,
8953
+ "learning_rate": 9.306049700956565e-06,
8954
+ "loss": 8.7522,
8955
+ "step": 1278
8956
+ },
8957
+ {
8958
+ "epoch": 0.446968373230823,
8959
+ "grad_norm": 10.309046745300293,
8960
+ "learning_rate": 9.248583124159438e-06,
8961
+ "loss": 6.7594,
8962
+ "step": 1279
8963
+ },
8964
+ {
8965
+ "epoch": 0.4473178402935523,
8966
+ "grad_norm": 5.91942834854126,
8967
+ "learning_rate": 9.191276442755259e-06,
8968
+ "loss": 8.7557,
8969
+ "step": 1280
8970
+ },
8971
+ {
8972
+ "epoch": 0.4476673073562817,
8973
+ "grad_norm": 7.594449043273926,
8974
+ "learning_rate": 9.134129881596925e-06,
8975
+ "loss": 9.1916,
8976
+ "step": 1281
8977
+ },
8978
+ {
8979
+ "epoch": 0.448016774419011,
8980
+ "grad_norm": 8.402937889099121,
8981
+ "learning_rate": 9.077143664909087e-06,
8982
+ "loss": 9.1086,
8983
+ "step": 1282
8984
+ },
8985
+ {
8986
+ "epoch": 0.44836624148174037,
8987
+ "grad_norm": 10.502628326416016,
8988
+ "learning_rate": 9.020318016287244e-06,
8989
+ "loss": 7.7505,
8990
+ "step": 1283
8991
+ },
8992
+ {
8993
+ "epoch": 0.4487157085444697,
8994
+ "grad_norm": 7.38737154006958,
8995
+ "learning_rate": 8.963653158696878e-06,
8996
+ "loss": 9.7786,
8997
+ "step": 1284
8998
+ },
8999
+ {
9000
+ "epoch": 0.449065175607199,
9001
+ "grad_norm": 7.058697700500488,
9002
+ "learning_rate": 8.90714931447258e-06,
9003
+ "loss": 9.1793,
9004
+ "step": 1285
9005
+ },
9006
+ {
9007
+ "epoch": 0.44941464266992837,
9008
+ "grad_norm": 7.430933475494385,
9009
+ "learning_rate": 8.850806705317183e-06,
9010
+ "loss": 8.3519,
9011
+ "step": 1286
9012
+ },
9013
+ {
9014
+ "epoch": 0.4497641097326577,
9015
+ "grad_norm": 11.711349487304688,
9016
+ "learning_rate": 8.794625552300878e-06,
9017
+ "loss": 8.2734,
9018
+ "step": 1287
9019
+ },
9020
+ {
9021
+ "epoch": 0.45011357679538705,
9022
+ "grad_norm": 12.177581787109375,
9023
+ "learning_rate": 8.738606075860357e-06,
9024
+ "loss": 7.9435,
9025
+ "step": 1288
9026
+ },
9027
+ {
9028
+ "epoch": 0.45046304385811636,
9029
+ "grad_norm": 8.499977111816406,
9030
+ "learning_rate": 8.682748495797966e-06,
9031
+ "loss": 9.8977,
9032
+ "step": 1289
9033
+ },
9034
+ {
9035
+ "epoch": 0.45081251092084573,
9036
+ "grad_norm": 11.1962308883667,
9037
+ "learning_rate": 8.627053031280751e-06,
9038
+ "loss": 8.8722,
9039
+ "step": 1290
9040
+ },
9041
+ {
9042
+ "epoch": 0.45116197798357505,
9043
+ "grad_norm": 7.799564361572266,
9044
+ "learning_rate": 8.571519900839764e-06,
9045
+ "loss": 9.0422,
9046
+ "step": 1291
9047
+ },
9048
+ {
9049
+ "epoch": 0.45151144504630436,
9050
+ "grad_norm": 8.08339786529541,
9051
+ "learning_rate": 8.516149322369054e-06,
9052
+ "loss": 9.1788,
9053
+ "step": 1292
9054
+ },
9055
+ {
9056
+ "epoch": 0.45186091210903373,
9057
+ "grad_norm": 10.770648002624512,
9058
+ "learning_rate": 8.460941513124876e-06,
9059
+ "loss": 9.7641,
9060
+ "step": 1293
9061
+ },
9062
+ {
9063
+ "epoch": 0.45221037917176304,
9064
+ "grad_norm": 9.788839340209961,
9065
+ "learning_rate": 8.405896689724845e-06,
9066
+ "loss": 8.537,
9067
+ "step": 1294
9068
+ },
9069
+ {
9070
+ "epoch": 0.4525598462344924,
9071
+ "grad_norm": 10.453418731689453,
9072
+ "learning_rate": 8.351015068147066e-06,
9073
+ "loss": 8.1029,
9074
+ "step": 1295
9075
+ },
9076
+ {
9077
+ "epoch": 0.4529093132972217,
9078
+ "grad_norm": 14.541789054870605,
9079
+ "learning_rate": 8.296296863729264e-06,
9080
+ "loss": 8.582,
9081
+ "step": 1296
9082
+ },
9083
+ {
9084
+ "epoch": 0.4532587803599511,
9085
+ "grad_norm": 9.695807456970215,
9086
+ "learning_rate": 8.241742291168014e-06,
9087
+ "loss": 10.2847,
9088
+ "step": 1297
9089
+ },
9090
+ {
9091
+ "epoch": 0.4536082474226804,
9092
+ "grad_norm": 14.214749336242676,
9093
+ "learning_rate": 8.187351564517832e-06,
9094
+ "loss": 9.8919,
9095
+ "step": 1298
9096
+ },
9097
+ {
9098
+ "epoch": 0.4539577144854097,
9099
+ "grad_norm": 16.4647274017334,
9100
+ "learning_rate": 8.133124897190348e-06,
9101
+ "loss": 8.346,
9102
+ "step": 1299
9103
+ },
9104
+ {
9105
+ "epoch": 0.4543071815481391,
9106
+ "grad_norm": 11.298583984375,
9107
+ "learning_rate": 8.07906250195347e-06,
9108
+ "loss": 10.9873,
9109
+ "step": 1300
9110
+ },
9111
+ {
9112
+ "epoch": 0.4546566486108684,
9113
+ "grad_norm": 3.8676533699035645,
9114
+ "learning_rate": 8.025164590930624e-06,
9115
+ "loss": 8.5542,
9116
+ "step": 1301
9117
+ },
9118
+ {
9119
+ "epoch": 0.4550061156735978,
9120
+ "grad_norm": 4.152469635009766,
9121
+ "learning_rate": 7.971431375599736e-06,
9122
+ "loss": 8.1374,
9123
+ "step": 1302
9124
+ },
9125
+ {
9126
+ "epoch": 0.4553555827363271,
9127
+ "grad_norm": 3.3269169330596924,
9128
+ "learning_rate": 7.917863066792613e-06,
9129
+ "loss": 9.6506,
9130
+ "step": 1303
9131
+ },
9132
+ {
9133
+ "epoch": 0.45570504979905646,
9134
+ "grad_norm": 4.010972499847412,
9135
+ "learning_rate": 7.864459874693986e-06,
9136
+ "loss": 8.7469,
9137
+ "step": 1304
9138
+ },
9139
+ {
9140
+ "epoch": 0.4560545168617858,
9141
+ "grad_norm": 6.283317565917969,
9142
+ "learning_rate": 7.81122200884072e-06,
9143
+ "loss": 8.4387,
9144
+ "step": 1305
9145
+ },
9146
+ {
9147
+ "epoch": 0.4564039839245151,
9148
+ "grad_norm": 4.048277378082275,
9149
+ "learning_rate": 7.758149678121007e-06,
9150
+ "loss": 9.4361,
9151
+ "step": 1306
9152
+ },
9153
+ {
9154
+ "epoch": 0.45675345098724446,
9155
+ "grad_norm": 3.8338565826416016,
9156
+ "learning_rate": 7.705243090773522e-06,
9157
+ "loss": 9.2712,
9158
+ "step": 1307
9159
+ },
9160
+ {
9161
+ "epoch": 0.45710291804997377,
9162
+ "grad_norm": 5.3223090171813965,
9163
+ "learning_rate": 7.652502454386617e-06,
9164
+ "loss": 8.9995,
9165
+ "step": 1308
9166
+ },
9167
+ {
9168
+ "epoch": 0.45745238511270314,
9169
+ "grad_norm": 11.755619049072266,
9170
+ "learning_rate": 7.599927975897514e-06,
9171
+ "loss": 8.4042,
9172
+ "step": 1309
9173
+ },
9174
+ {
9175
+ "epoch": 0.45780185217543246,
9176
+ "grad_norm": 7.2003912925720215,
9177
+ "learning_rate": 7.547519861591473e-06,
9178
+ "loss": 7.6755,
9179
+ "step": 1310
9180
+ },
9181
+ {
9182
+ "epoch": 0.4581513192381618,
9183
+ "grad_norm": 7.062569618225098,
9184
+ "learning_rate": 7.495278317101006e-06,
9185
+ "loss": 8.8,
9186
+ "step": 1311
9187
+ },
9188
+ {
9189
+ "epoch": 0.45850078630089114,
9190
+ "grad_norm": 6.048025608062744,
9191
+ "learning_rate": 7.443203547405053e-06,
9192
+ "loss": 8.3188,
9193
+ "step": 1312
9194
+ },
9195
+ {
9196
+ "epoch": 0.45885025336362045,
9197
+ "grad_norm": 6.554856777191162,
9198
+ "learning_rate": 7.3912957568281874e-06,
9199
+ "loss": 9.1905,
9200
+ "step": 1313
9201
+ },
9202
+ {
9203
+ "epoch": 0.4591997204263498,
9204
+ "grad_norm": 5.91583776473999,
9205
+ "learning_rate": 7.339555149039817e-06,
9206
+ "loss": 8.7268,
9207
+ "step": 1314
9208
+ },
9209
+ {
9210
+ "epoch": 0.45954918748907914,
9211
+ "grad_norm": 6.909355640411377,
9212
+ "learning_rate": 7.287981927053344e-06,
9213
+ "loss": 8.4991,
9214
+ "step": 1315
9215
+ },
9216
+ {
9217
+ "epoch": 0.4598986545518085,
9218
+ "grad_norm": 6.877124309539795,
9219
+ "learning_rate": 7.236576293225444e-06,
9220
+ "loss": 7.8285,
9221
+ "step": 1316
9222
+ },
9223
+ {
9224
+ "epoch": 0.4602481216145378,
9225
+ "grad_norm": 5.941048622131348,
9226
+ "learning_rate": 7.185338449255197e-06,
9227
+ "loss": 8.5166,
9228
+ "step": 1317
9229
+ },
9230
+ {
9231
+ "epoch": 0.4605975886772672,
9232
+ "grad_norm": 5.189998149871826,
9233
+ "learning_rate": 7.13426859618338e-06,
9234
+ "loss": 9.9478,
9235
+ "step": 1318
9236
+ },
9237
+ {
9238
+ "epoch": 0.4609470557399965,
9239
+ "grad_norm": 5.6405253410339355,
9240
+ "learning_rate": 7.083366934391578e-06,
9241
+ "loss": 9.1174,
9242
+ "step": 1319
9243
+ },
9244
+ {
9245
+ "epoch": 0.4612965228027258,
9246
+ "grad_norm": 11.765382766723633,
9247
+ "learning_rate": 7.032633663601473e-06,
9248
+ "loss": 8.1464,
9249
+ "step": 1320
9250
+ },
9251
+ {
9252
+ "epoch": 0.4616459898654552,
9253
+ "grad_norm": 7.946734428405762,
9254
+ "learning_rate": 6.9820689828740185e-06,
9255
+ "loss": 8.3851,
9256
+ "step": 1321
9257
+ },
9258
+ {
9259
+ "epoch": 0.4619954569281845,
9260
+ "grad_norm": 10.29844856262207,
9261
+ "learning_rate": 6.931673090608681e-06,
9262
+ "loss": 7.3518,
9263
+ "step": 1322
9264
+ },
9265
+ {
9266
+ "epoch": 0.46234492399091387,
9267
+ "grad_norm": 5.46825647354126,
9268
+ "learning_rate": 6.8814461845426615e-06,
9269
+ "loss": 10.252,
9270
+ "step": 1323
9271
+ },
9272
+ {
9273
+ "epoch": 0.4626943910536432,
9274
+ "grad_norm": 8.910709381103516,
9275
+ "learning_rate": 6.831388461750115e-06,
9276
+ "loss": 9.0851,
9277
+ "step": 1324
9278
+ },
9279
+ {
9280
+ "epoch": 0.46304385811637255,
9281
+ "grad_norm": 6.423018455505371,
9282
+ "learning_rate": 6.781500118641376e-06,
9283
+ "loss": 9.6204,
9284
+ "step": 1325
9285
+ },
9286
+ {
9287
+ "epoch": 0.46339332517910187,
9288
+ "grad_norm": 8.594428062438965,
9289
+ "learning_rate": 6.731781350962174e-06,
9290
+ "loss": 8.1211,
9291
+ "step": 1326
9292
+ },
9293
+ {
9294
+ "epoch": 0.4637427922418312,
9295
+ "grad_norm": 6.105467319488525,
9296
+ "learning_rate": 6.682232353792894e-06,
9297
+ "loss": 8.6828,
9298
+ "step": 1327
9299
+ },
9300
+ {
9301
+ "epoch": 0.46409225930456055,
9302
+ "grad_norm": 9.352825164794922,
9303
+ "learning_rate": 6.632853321547794e-06,
9304
+ "loss": 9.3629,
9305
+ "step": 1328
9306
+ },
9307
+ {
9308
+ "epoch": 0.46444172636728986,
9309
+ "grad_norm": 8.1890869140625,
9310
+ "learning_rate": 6.583644447974241e-06,
9311
+ "loss": 8.6678,
9312
+ "step": 1329
9313
+ },
9314
+ {
9315
+ "epoch": 0.46479119343001923,
9316
+ "grad_norm": 9.602250099182129,
9317
+ "learning_rate": 6.534605926151949e-06,
9318
+ "loss": 8.0324,
9319
+ "step": 1330
9320
+ },
9321
+ {
9322
+ "epoch": 0.46514066049274855,
9323
+ "grad_norm": 9.674601554870605,
9324
+ "learning_rate": 6.4857379484922375e-06,
9325
+ "loss": 7.4394,
9326
+ "step": 1331
9327
+ },
9328
+ {
9329
+ "epoch": 0.4654901275554779,
9330
+ "grad_norm": 8.674582481384277,
9331
+ "learning_rate": 6.43704070673728e-06,
9332
+ "loss": 8.3468,
9333
+ "step": 1332
9334
+ },
9335
+ {
9336
+ "epoch": 0.46583959461820723,
9337
+ "grad_norm": 8.77232837677002,
9338
+ "learning_rate": 6.388514391959283e-06,
9339
+ "loss": 8.2759,
9340
+ "step": 1333
9341
+ },
9342
+ {
9343
+ "epoch": 0.46618906168093655,
9344
+ "grad_norm": 8.102982521057129,
9345
+ "learning_rate": 6.340159194559836e-06,
9346
+ "loss": 8.3209,
9347
+ "step": 1334
9348
+ },
9349
+ {
9350
+ "epoch": 0.4665385287436659,
9351
+ "grad_norm": 7.643329620361328,
9352
+ "learning_rate": 6.291975304269126e-06,
9353
+ "loss": 8.6541,
9354
+ "step": 1335
9355
+ },
9356
+ {
9357
+ "epoch": 0.46688799580639523,
9358
+ "grad_norm": 9.140750885009766,
9359
+ "learning_rate": 6.243962910145168e-06,
9360
+ "loss": 9.0378,
9361
+ "step": 1336
9362
+ },
9363
+ {
9364
+ "epoch": 0.4672374628691246,
9365
+ "grad_norm": 9.416555404663086,
9366
+ "learning_rate": 6.196122200573074e-06,
9367
+ "loss": 8.7321,
9368
+ "step": 1337
9369
+ },
9370
+ {
9371
+ "epoch": 0.4675869299318539,
9372
+ "grad_norm": 9.755743980407715,
9373
+ "learning_rate": 6.1484533632643395e-06,
9374
+ "loss": 8.9093,
9375
+ "step": 1338
9376
+ },
9377
+ {
9378
+ "epoch": 0.4679363969945833,
9379
+ "grad_norm": 13.36638355255127,
9380
+ "learning_rate": 6.100956585256084e-06,
9381
+ "loss": 8.6675,
9382
+ "step": 1339
9383
+ },
9384
+ {
9385
+ "epoch": 0.4682858640573126,
9386
+ "grad_norm": 9.211478233337402,
9387
+ "learning_rate": 6.053632052910297e-06,
9388
+ "loss": 8.8965,
9389
+ "step": 1340
9390
+ },
9391
+ {
9392
+ "epoch": 0.4686353311200419,
9393
+ "grad_norm": 10.262747764587402,
9394
+ "learning_rate": 6.006479951913169e-06,
9395
+ "loss": 7.8928,
9396
+ "step": 1341
9397
+ },
9398
+ {
9399
+ "epoch": 0.4689847981827713,
9400
+ "grad_norm": 9.098506927490234,
9401
+ "learning_rate": 5.9595004672743035e-06,
9402
+ "loss": 8.1507,
9403
+ "step": 1342
9404
+ },
9405
+ {
9406
+ "epoch": 0.4693342652455006,
9407
+ "grad_norm": 10.686274528503418,
9408
+ "learning_rate": 5.912693783326029e-06,
9409
+ "loss": 9.1163,
9410
+ "step": 1343
9411
+ },
9412
+ {
9413
+ "epoch": 0.46968373230822996,
9414
+ "grad_norm": 9.65219783782959,
9415
+ "learning_rate": 5.866060083722624e-06,
9416
+ "loss": 8.686,
9417
+ "step": 1344
9418
+ },
9419
+ {
9420
+ "epoch": 0.4700331993709593,
9421
+ "grad_norm": 11.175179481506348,
9422
+ "learning_rate": 5.819599551439703e-06,
9423
+ "loss": 8.4236,
9424
+ "step": 1345
9425
+ },
9426
+ {
9427
+ "epoch": 0.47038266643368865,
9428
+ "grad_norm": 11.540955543518066,
9429
+ "learning_rate": 5.773312368773342e-06,
9430
+ "loss": 9.2144,
9431
+ "step": 1346
9432
+ },
9433
+ {
9434
+ "epoch": 0.47073213349641796,
9435
+ "grad_norm": 9.967883110046387,
9436
+ "learning_rate": 5.727198717339511e-06,
9437
+ "loss": 10.2552,
9438
+ "step": 1347
9439
+ },
9440
+ {
9441
+ "epoch": 0.4710816005591473,
9442
+ "grad_norm": 13.308283805847168,
9443
+ "learning_rate": 5.681258778073267e-06,
9444
+ "loss": 9.5853,
9445
+ "step": 1348
9446
+ },
9447
+ {
9448
+ "epoch": 0.47143106762187664,
9449
+ "grad_norm": 12.092696189880371,
9450
+ "learning_rate": 5.635492731228098e-06,
9451
+ "loss": 8.1454,
9452
+ "step": 1349
9453
+ },
9454
+ {
9455
+ "epoch": 0.47178053468460596,
9456
+ "grad_norm": 14.039538383483887,
9457
+ "learning_rate": 5.5899007563751825e-06,
9458
+ "loss": 11.0537,
9459
+ "step": 1350
9460
+ },
9461
+ {
9462
+ "epoch": 0.4721300017473353,
9463
+ "grad_norm": 3.305018424987793,
9464
+ "learning_rate": 5.5444830324026976e-06,
9465
+ "loss": 9.0159,
9466
+ "step": 1351
9467
+ },
9468
+ {
9469
+ "epoch": 0.47247946881006464,
9470
+ "grad_norm": 3.534560441970825,
9471
+ "learning_rate": 5.499239737515116e-06,
9472
+ "loss": 8.608,
9473
+ "step": 1352
9474
+ },
9475
+ {
9476
+ "epoch": 0.472828935872794,
9477
+ "grad_norm": 4.541440486907959,
9478
+ "learning_rate": 5.454171049232509e-06,
9479
+ "loss": 9.2173,
9480
+ "step": 1353
9481
+ },
9482
+ {
9483
+ "epoch": 0.4731784029355233,
9484
+ "grad_norm": 4.481523513793945,
9485
+ "learning_rate": 5.409277144389851e-06,
9486
+ "loss": 7.4883,
9487
+ "step": 1354
9488
+ },
9489
+ {
9490
+ "epoch": 0.47352786999825264,
9491
+ "grad_norm": 3.991349220275879,
9492
+ "learning_rate": 5.3645581991363155e-06,
9493
+ "loss": 8.9868,
9494
+ "step": 1355
9495
+ },
9496
+ {
9497
+ "epoch": 0.473877337060982,
9498
+ "grad_norm": 5.91257381439209,
9499
+ "learning_rate": 5.320014388934602e-06,
9500
+ "loss": 8.7072,
9501
+ "step": 1356
9502
+ },
9503
+ {
9504
+ "epoch": 0.4742268041237113,
9505
+ "grad_norm": 9.132447242736816,
9506
+ "learning_rate": 5.275645888560232e-06,
9507
+ "loss": 6.1554,
9508
+ "step": 1357
9509
+ },
9510
+ {
9511
+ "epoch": 0.4745762711864407,
9512
+ "grad_norm": 5.942850589752197,
9513
+ "learning_rate": 5.231452872100845e-06,
9514
+ "loss": 7.9525,
9515
+ "step": 1358
9516
+ },
9517
+ {
9518
+ "epoch": 0.47492573824917,
9519
+ "grad_norm": 5.887539386749268,
9520
+ "learning_rate": 5.187435512955574e-06,
9521
+ "loss": 8.2633,
9522
+ "step": 1359
9523
+ },
9524
+ {
9525
+ "epoch": 0.4752752053118994,
9526
+ "grad_norm": 5.124833106994629,
9527
+ "learning_rate": 5.143593983834311e-06,
9528
+ "loss": 9.4413,
9529
+ "step": 1360
9530
+ },
9531
+ {
9532
+ "epoch": 0.4756246723746287,
9533
+ "grad_norm": 6.190767288208008,
9534
+ "learning_rate": 5.099928456757036e-06,
9535
+ "loss": 8.0085,
9536
+ "step": 1361
9537
+ },
9538
+ {
9539
+ "epoch": 0.475974139437358,
9540
+ "grad_norm": 6.943732738494873,
9541
+ "learning_rate": 5.056439103053201e-06,
9542
+ "loss": 8.2279,
9543
+ "step": 1362
9544
+ },
9545
+ {
9546
+ "epoch": 0.47632360650008737,
9547
+ "grad_norm": 10.603092193603516,
9548
+ "learning_rate": 5.013126093360959e-06,
9549
+ "loss": 8.0032,
9550
+ "step": 1363
9551
+ },
9552
+ {
9553
+ "epoch": 0.4766730735628167,
9554
+ "grad_norm": 7.044005393981934,
9555
+ "learning_rate": 4.969989597626578e-06,
9556
+ "loss": 7.8796,
9557
+ "step": 1364
9558
+ },
9559
+ {
9560
+ "epoch": 0.47702254062554605,
9561
+ "grad_norm": 4.6816534996032715,
9562
+ "learning_rate": 4.927029785103715e-06,
9563
+ "loss": 8.9511,
9564
+ "step": 1365
9565
+ },
9566
+ {
9567
+ "epoch": 0.47737200768827537,
9568
+ "grad_norm": 7.391674041748047,
9569
+ "learning_rate": 4.884246824352795e-06,
9570
+ "loss": 8.9995,
9571
+ "step": 1366
9572
+ },
9573
+ {
9574
+ "epoch": 0.47772147475100474,
9575
+ "grad_norm": 6.969574928283691,
9576
+ "learning_rate": 4.8416408832403334e-06,
9577
+ "loss": 7.7912,
9578
+ "step": 1367
9579
+ },
9580
+ {
9581
+ "epoch": 0.47807094181373405,
9582
+ "grad_norm": 8.105119705200195,
9583
+ "learning_rate": 4.799212128938285e-06,
9584
+ "loss": 7.9909,
9585
+ "step": 1368
9586
+ },
9587
+ {
9588
+ "epoch": 0.4784204088764634,
9589
+ "grad_norm": 7.973011493682861,
9590
+ "learning_rate": 4.756960727923371e-06,
9591
+ "loss": 8.7417,
9592
+ "step": 1369
9593
+ },
9594
+ {
9595
+ "epoch": 0.47876987593919274,
9596
+ "grad_norm": 8.436671257019043,
9597
+ "learning_rate": 4.714886845976429e-06,
9598
+ "loss": 8.9856,
9599
+ "step": 1370
9600
+ },
9601
+ {
9602
+ "epoch": 0.47911934300192205,
9603
+ "grad_norm": 6.047431468963623,
9604
+ "learning_rate": 4.672990648181785e-06,
9605
+ "loss": 8.1396,
9606
+ "step": 1371
9607
+ },
9608
+ {
9609
+ "epoch": 0.4794688100646514,
9610
+ "grad_norm": 6.784114837646484,
9611
+ "learning_rate": 4.631272298926581e-06,
9612
+ "loss": 8.3812,
9613
+ "step": 1372
9614
+ },
9615
+ {
9616
+ "epoch": 0.47981827712738073,
9617
+ "grad_norm": 8.626285552978516,
9618
+ "learning_rate": 4.58973196190014e-06,
9619
+ "loss": 7.5672,
9620
+ "step": 1373
9621
+ },
9622
+ {
9623
+ "epoch": 0.4801677441901101,
9624
+ "grad_norm": 10.32469654083252,
9625
+ "learning_rate": 4.548369800093322e-06,
9626
+ "loss": 8.0342,
9627
+ "step": 1374
9628
+ },
9629
+ {
9630
+ "epoch": 0.4805172112528394,
9631
+ "grad_norm": 7.146700859069824,
9632
+ "learning_rate": 4.507185975797884e-06,
9633
+ "loss": 8.1178,
9634
+ "step": 1375
9635
+ },
9636
+ {
9637
+ "epoch": 0.4808666783155688,
9638
+ "grad_norm": 16.573612213134766,
9639
+ "learning_rate": 4.466180650605861e-06,
9640
+ "loss": 8.338,
9641
+ "step": 1376
9642
+ },
9643
+ {
9644
+ "epoch": 0.4812161453782981,
9645
+ "grad_norm": 6.66664457321167,
9646
+ "learning_rate": 4.42535398540887e-06,
9647
+ "loss": 8.9789,
9648
+ "step": 1377
9649
+ },
9650
+ {
9651
+ "epoch": 0.4815656124410274,
9652
+ "grad_norm": 6.5671844482421875,
9653
+ "learning_rate": 4.3847061403975765e-06,
9654
+ "loss": 8.5679,
9655
+ "step": 1378
9656
+ },
9657
+ {
9658
+ "epoch": 0.4819150795037568,
9659
+ "grad_norm": 9.924352645874023,
9660
+ "learning_rate": 4.344237275060986e-06,
9661
+ "loss": 8.5768,
9662
+ "step": 1379
9663
+ },
9664
+ {
9665
+ "epoch": 0.4822645465664861,
9666
+ "grad_norm": 10.146461486816406,
9667
+ "learning_rate": 4.303947548185855e-06,
9668
+ "loss": 8.3134,
9669
+ "step": 1380
9670
+ },
9671
+ {
9672
+ "epoch": 0.48261401362921547,
9673
+ "grad_norm": 7.913432598114014,
9674
+ "learning_rate": 4.263837117856046e-06,
9675
+ "loss": 8.8842,
9676
+ "step": 1381
9677
+ },
9678
+ {
9679
+ "epoch": 0.4829634806919448,
9680
+ "grad_norm": 6.116405963897705,
9681
+ "learning_rate": 4.223906141451939e-06,
9682
+ "loss": 9.1651,
9683
+ "step": 1382
9684
+ },
9685
+ {
9686
+ "epoch": 0.48331294775467415,
9687
+ "grad_norm": 8.878636360168457,
9688
+ "learning_rate": 4.184154775649768e-06,
9689
+ "loss": 9.009,
9690
+ "step": 1383
9691
+ },
9692
+ {
9693
+ "epoch": 0.48366241481740346,
9694
+ "grad_norm": 18.735830307006836,
9695
+ "learning_rate": 4.144583176421058e-06,
9696
+ "loss": 7.9376,
9697
+ "step": 1384
9698
+ },
9699
+ {
9700
+ "epoch": 0.4840118818801328,
9701
+ "grad_norm": 7.983646392822266,
9702
+ "learning_rate": 4.10519149903198e-06,
9703
+ "loss": 8.4509,
9704
+ "step": 1385
9705
+ },
9706
+ {
9707
+ "epoch": 0.48436134894286215,
9708
+ "grad_norm": 8.13884449005127,
9709
+ "learning_rate": 4.065979898042737e-06,
9710
+ "loss": 9.2192,
9711
+ "step": 1386
9712
+ },
9713
+ {
9714
+ "epoch": 0.48471081600559146,
9715
+ "grad_norm": 9.294929504394531,
9716
+ "learning_rate": 4.026948527306989e-06,
9717
+ "loss": 8.5775,
9718
+ "step": 1387
9719
+ },
9720
+ {
9721
+ "epoch": 0.48506028306832083,
9722
+ "grad_norm": 11.089651107788086,
9723
+ "learning_rate": 3.9880975399712305e-06,
9724
+ "loss": 8.164,
9725
+ "step": 1388
9726
+ },
9727
+ {
9728
+ "epoch": 0.48540975013105014,
9729
+ "grad_norm": 11.714179039001465,
9730
+ "learning_rate": 3.949427088474189e-06,
9731
+ "loss": 8.8365,
9732
+ "step": 1389
9733
+ },
9734
+ {
9735
+ "epoch": 0.4857592171937795,
9736
+ "grad_norm": 7.638397216796875,
9737
+ "learning_rate": 3.910937324546199e-06,
9738
+ "loss": 8.6612,
9739
+ "step": 1390
9740
+ },
9741
+ {
9742
+ "epoch": 0.4861086842565088,
9743
+ "grad_norm": 9.247742652893066,
9744
+ "learning_rate": 3.872628399208672e-06,
9745
+ "loss": 8.9195,
9746
+ "step": 1391
9747
+ },
9748
+ {
9749
+ "epoch": 0.48645815131923814,
9750
+ "grad_norm": 7.685249328613281,
9751
+ "learning_rate": 3.834500462773449e-06,
9752
+ "loss": 8.7991,
9753
+ "step": 1392
9754
+ },
9755
+ {
9756
+ "epoch": 0.4868076183819675,
9757
+ "grad_norm": 12.701216697692871,
9758
+ "learning_rate": 3.7965536648422394e-06,
9759
+ "loss": 8.4588,
9760
+ "step": 1393
9761
+ },
9762
+ {
9763
+ "epoch": 0.4871570854446968,
9764
+ "grad_norm": 10.06445598602295,
9765
+ "learning_rate": 3.7587881543060088e-06,
9766
+ "loss": 9.83,
9767
+ "step": 1394
9768
+ },
9769
+ {
9770
+ "epoch": 0.4875065525074262,
9771
+ "grad_norm": 8.480798721313477,
9772
+ "learning_rate": 3.7212040793444257e-06,
9773
+ "loss": 9.2898,
9774
+ "step": 1395
9775
+ },
9776
+ {
9777
+ "epoch": 0.4878560195701555,
9778
+ "grad_norm": 11.033528327941895,
9779
+ "learning_rate": 3.683801587425251e-06,
9780
+ "loss": 9.8745,
9781
+ "step": 1396
9782
+ },
9783
+ {
9784
+ "epoch": 0.4882054866328849,
9785
+ "grad_norm": 8.70356559753418,
9786
+ "learning_rate": 3.646580825303786e-06,
9787
+ "loss": 8.6746,
9788
+ "step": 1397
9789
+ },
9790
+ {
9791
+ "epoch": 0.4885549536956142,
9792
+ "grad_norm": 9.125457763671875,
9793
+ "learning_rate": 3.6095419390222652e-06,
9794
+ "loss": 9.8072,
9795
+ "step": 1398
9796
+ },
9797
+ {
9798
+ "epoch": 0.4889044207583435,
9799
+ "grad_norm": 10.77108383178711,
9800
+ "learning_rate": 3.572685073909321e-06,
9801
+ "loss": 9.3096,
9802
+ "step": 1399
9803
+ },
9804
+ {
9805
+ "epoch": 0.4892538878210729,
9806
+ "grad_norm": 12.485724449157715,
9807
+ "learning_rate": 3.5360103745793738e-06,
9808
+ "loss": 9.9269,
9809
+ "step": 1400
9810
+ },
9811
+ {
9812
+ "epoch": 0.4896033548838022,
9813
+ "grad_norm": 3.6794941425323486,
9814
+ "learning_rate": 3.4995179849321103e-06,
9815
+ "loss": 8.7652,
9816
+ "step": 1401
9817
+ },
9818
+ {
9819
+ "epoch": 0.48995282194653156,
9820
+ "grad_norm": 3.6535227298736572,
9821
+ "learning_rate": 3.4632080481518558e-06,
9822
+ "loss": 8.6102,
9823
+ "step": 1402
9824
+ },
9825
+ {
9826
+ "epoch": 0.49030228900926087,
9827
+ "grad_norm": 4.039484024047852,
9828
+ "learning_rate": 3.427080706707081e-06,
9829
+ "loss": 8.947,
9830
+ "step": 1403
9831
+ },
9832
+ {
9833
+ "epoch": 0.49065175607199024,
9834
+ "grad_norm": 3.9633991718292236,
9835
+ "learning_rate": 3.39113610234979e-06,
9836
+ "loss": 8.7209,
9837
+ "step": 1404
9838
+ },
9839
+ {
9840
+ "epoch": 0.49100122313471956,
9841
+ "grad_norm": 4.306646823883057,
9842
+ "learning_rate": 3.355374376115017e-06,
9843
+ "loss": 8.957,
9844
+ "step": 1405
9845
+ },
9846
+ {
9847
+ "epoch": 0.49135069019744887,
9848
+ "grad_norm": 5.453439235687256,
9849
+ "learning_rate": 3.319795668320208e-06,
9850
+ "loss": 8.5753,
9851
+ "step": 1406
9852
+ },
9853
+ {
9854
+ "epoch": 0.49170015726017824,
9855
+ "grad_norm": 4.913475036621094,
9856
+ "learning_rate": 3.2844001185647288e-06,
9857
+ "loss": 9.3093,
9858
+ "step": 1407
9859
+ },
9860
+ {
9861
+ "epoch": 0.49204962432290755,
9862
+ "grad_norm": 4.832673072814941,
9863
+ "learning_rate": 3.249187865729264e-06,
9864
+ "loss": 8.0143,
9865
+ "step": 1408
9866
+ },
9867
+ {
9868
+ "epoch": 0.4923990913856369,
9869
+ "grad_norm": 4.572703838348389,
9870
+ "learning_rate": 3.2141590479753236e-06,
9871
+ "loss": 8.6319,
9872
+ "step": 1409
9873
+ },
9874
+ {
9875
+ "epoch": 0.49274855844836624,
9876
+ "grad_norm": 5.14667272567749,
9877
+ "learning_rate": 3.1793138027446657e-06,
9878
+ "loss": 8.6348,
9879
+ "step": 1410
9880
+ },
9881
+ {
9882
+ "epoch": 0.4930980255110956,
9883
+ "grad_norm": 4.821927547454834,
9884
+ "learning_rate": 3.1446522667587852e-06,
9885
+ "loss": 9.1985,
9886
+ "step": 1411
9887
+ },
9888
+ {
9889
+ "epoch": 0.4934474925738249,
9890
+ "grad_norm": 5.0558576583862305,
9891
+ "learning_rate": 3.110174576018338e-06,
9892
+ "loss": 9.8678,
9893
+ "step": 1412
9894
+ },
9895
+ {
9896
+ "epoch": 0.49379695963655423,
9897
+ "grad_norm": 8.692586898803711,
9898
+ "learning_rate": 3.0758808658026493e-06,
9899
+ "loss": 7.3579,
9900
+ "step": 1413
9901
+ },
9902
+ {
9903
+ "epoch": 0.4941464266992836,
9904
+ "grad_norm": 5.595980644226074,
9905
+ "learning_rate": 3.041771270669158e-06,
9906
+ "loss": 8.7682,
9907
+ "step": 1414
9908
+ },
9909
+ {
9910
+ "epoch": 0.4944958937620129,
9911
+ "grad_norm": 7.489475727081299,
9912
+ "learning_rate": 3.0078459244528958e-06,
9913
+ "loss": 8.0538,
9914
+ "step": 1415
9915
+ },
9916
+ {
9917
+ "epoch": 0.4948453608247423,
9918
+ "grad_norm": 5.671663284301758,
9919
+ "learning_rate": 2.97410496026595e-06,
9920
+ "loss": 8.691,
9921
+ "step": 1416
9922
+ },
9923
+ {
9924
+ "epoch": 0.4951948278874716,
9925
+ "grad_norm": 9.251367568969727,
9926
+ "learning_rate": 2.9405485104969767e-06,
9927
+ "loss": 8.0049,
9928
+ "step": 1417
9929
+ },
9930
+ {
9931
+ "epoch": 0.49554429495020097,
9932
+ "grad_norm": 6.323045253753662,
9933
+ "learning_rate": 2.9071767068106324e-06,
9934
+ "loss": 8.1312,
9935
+ "step": 1418
9936
+ },
9937
+ {
9938
+ "epoch": 0.4958937620129303,
9939
+ "grad_norm": 9.267412185668945,
9940
+ "learning_rate": 2.8739896801470934e-06,
9941
+ "loss": 7.6144,
9942
+ "step": 1419
9943
+ },
9944
+ {
9945
+ "epoch": 0.4962432290756596,
9946
+ "grad_norm": 5.848172187805176,
9947
+ "learning_rate": 2.840987560721542e-06,
9948
+ "loss": 9.3971,
9949
+ "step": 1420
9950
+ },
9951
+ {
9952
+ "epoch": 0.49659269613838897,
9953
+ "grad_norm": 7.514741897583008,
9954
+ "learning_rate": 2.8081704780236084e-06,
9955
+ "loss": 7.7746,
9956
+ "step": 1421
9957
+ },
9958
+ {
9959
+ "epoch": 0.4969421632011183,
9960
+ "grad_norm": 6.808731555938721,
9961
+ "learning_rate": 2.775538560816937e-06,
9962
+ "loss": 8.105,
9963
+ "step": 1422
9964
+ },
9965
+ {
9966
+ "epoch": 0.49729163026384765,
9967
+ "grad_norm": 7.612759113311768,
9968
+ "learning_rate": 2.7430919371386253e-06,
9969
+ "loss": 8.5522,
9970
+ "step": 1423
9971
+ },
9972
+ {
9973
+ "epoch": 0.49764109732657696,
9974
+ "grad_norm": 8.545870780944824,
9975
+ "learning_rate": 2.7108307342987416e-06,
9976
+ "loss": 8.4657,
9977
+ "step": 1424
9978
+ },
9979
+ {
9980
+ "epoch": 0.49799056438930633,
9981
+ "grad_norm": 6.26425313949585,
9982
+ "learning_rate": 2.6787550788798245e-06,
9983
+ "loss": 9.1488,
9984
+ "step": 1425
9985
+ },
9986
+ {
9987
+ "epoch": 0.49834003145203565,
9988
+ "grad_norm": 5.831702709197998,
9989
+ "learning_rate": 2.646865096736384e-06,
9990
+ "loss": 9.7212,
9991
+ "step": 1426
9992
+ },
9993
+ {
9994
+ "epoch": 0.49868949851476496,
9995
+ "grad_norm": 8.172411918640137,
9996
+ "learning_rate": 2.6151609129943964e-06,
9997
+ "loss": 8.3926,
9998
+ "step": 1427
9999
+ },
10000
+ {
10001
+ "epoch": 0.49903896557749433,
10002
+ "grad_norm": 6.818192958831787,
10003
+ "learning_rate": 2.5836426520508383e-06,
10004
+ "loss": 9.4195,
10005
+ "step": 1428
10006
+ },
10007
+ {
10008
+ "epoch": 0.49938843264022365,
10009
+ "grad_norm": 7.837162494659424,
10010
+ "learning_rate": 2.5523104375731797e-06,
10011
+ "loss": 9.3985,
10012
+ "step": 1429
10013
+ },
10014
+ {
10015
+ "epoch": 0.499737899702953,
10016
+ "grad_norm": 6.568439960479736,
10017
+ "learning_rate": 2.521164392498915e-06,
10018
+ "loss": 9.7159,
10019
+ "step": 1430
10020
+ },
10021
+ {
10022
+ "epoch": 0.5000873667656823,
10023
+ "grad_norm": 11.689252853393555,
10024
+ "learning_rate": 2.490204639035049e-06,
10025
+ "loss": 8.789,
10026
+ "step": 1431
10027
+ },
10028
+ {
10029
+ "epoch": 0.5004368338284116,
10030
+ "grad_norm": 9.881168365478516,
10031
+ "learning_rate": 2.4594312986576774e-06,
10032
+ "loss": 7.6069,
10033
+ "step": 1432
10034
+ },
10035
+ {
10036
+ "epoch": 0.500786300891141,
10037
+ "grad_norm": 7.408852577209473,
10038
+ "learning_rate": 2.428844492111415e-06,
10039
+ "loss": 9.4805,
10040
+ "step": 1433
10041
+ },
10042
+ {
10043
+ "epoch": 0.5011357679538704,
10044
+ "grad_norm": 7.010851860046387,
10045
+ "learning_rate": 2.3984443394090227e-06,
10046
+ "loss": 8.8805,
10047
+ "step": 1434
10048
+ },
10049
+ {
10050
+ "epoch": 0.5014852350165997,
10051
+ "grad_norm": 9.13546371459961,
10052
+ "learning_rate": 2.3682309598308747e-06,
10053
+ "loss": 7.908,
10054
+ "step": 1435
10055
+ },
10056
+ {
10057
+ "epoch": 0.501834702079329,
10058
+ "grad_norm": 7.809762001037598,
10059
+ "learning_rate": 2.33820447192451e-06,
10060
+ "loss": 9.4567,
10061
+ "step": 1436
10062
+ },
10063
+ {
10064
+ "epoch": 0.5021841691420583,
10065
+ "grad_norm": 9.83913803100586,
10066
+ "learning_rate": 2.308364993504164e-06,
10067
+ "loss": 8.3121,
10068
+ "step": 1437
10069
+ },
10070
+ {
10071
+ "epoch": 0.5025336362047877,
10072
+ "grad_norm": 8.217439651489258,
10073
+ "learning_rate": 2.278712641650316e-06,
10074
+ "loss": 9.1075,
10075
+ "step": 1438
10076
+ },
10077
+ {
10078
+ "epoch": 0.5028831032675171,
10079
+ "grad_norm": 10.243683815002441,
10080
+ "learning_rate": 2.2492475327092034e-06,
10081
+ "loss": 7.9943,
10082
+ "step": 1439
10083
+ },
10084
+ {
10085
+ "epoch": 0.5032325703302464,
10086
+ "grad_norm": 9.931056022644043,
10087
+ "learning_rate": 2.2199697822924028e-06,
10088
+ "loss": 7.9189,
10089
+ "step": 1440
10090
+ },
10091
+ {
10092
+ "epoch": 0.5035820373929757,
10093
+ "grad_norm": 12.423880577087402,
10094
+ "learning_rate": 2.1908795052763387e-06,
10095
+ "loss": 9.0813,
10096
+ "step": 1441
10097
+ },
10098
+ {
10099
+ "epoch": 0.503931504455705,
10100
+ "grad_norm": 8.046937942504883,
10101
+ "learning_rate": 2.1619768158018692e-06,
10102
+ "loss": 9.0369,
10103
+ "step": 1442
10104
+ },
10105
+ {
10106
+ "epoch": 0.5042809715184344,
10107
+ "grad_norm": 9.706149101257324,
10108
+ "learning_rate": 2.133261827273797e-06,
10109
+ "loss": 9.4839,
10110
+ "step": 1443
10111
+ },
10112
+ {
10113
+ "epoch": 0.5046304385811637,
10114
+ "grad_norm": 10.146185874938965,
10115
+ "learning_rate": 2.1047346523604738e-06,
10116
+ "loss": 9.1929,
10117
+ "step": 1444
10118
+ },
10119
+ {
10120
+ "epoch": 0.5049799056438931,
10121
+ "grad_norm": 9.447968482971191,
10122
+ "learning_rate": 2.0763954029933085e-06,
10123
+ "loss": 8.9379,
10124
+ "step": 1445
10125
+ },
10126
+ {
10127
+ "epoch": 0.5053293727066224,
10128
+ "grad_norm": 14.284878730773926,
10129
+ "learning_rate": 2.048244190366355e-06,
10130
+ "loss": 8.6403,
10131
+ "step": 1446
10132
+ },
10133
+ {
10134
+ "epoch": 0.5056788397693517,
10135
+ "grad_norm": 10.964797973632812,
10136
+ "learning_rate": 2.02028112493588e-06,
10137
+ "loss": 9.0099,
10138
+ "step": 1447
10139
+ },
10140
+ {
10141
+ "epoch": 0.5060283068320811,
10142
+ "grad_norm": 11.85975170135498,
10143
+ "learning_rate": 1.992506316419912e-06,
10144
+ "loss": 9.7333,
10145
+ "step": 1448
10146
+ },
10147
+ {
10148
+ "epoch": 0.5063777738948104,
10149
+ "grad_norm": 12.616867065429688,
10150
+ "learning_rate": 1.9649198737978326e-06,
10151
+ "loss": 9.0127,
10152
+ "step": 1449
10153
+ },
10154
+ {
10155
+ "epoch": 0.5067272409575397,
10156
+ "grad_norm": 12.144278526306152,
10157
+ "learning_rate": 1.937521905309936e-06,
10158
+ "loss": 9.9148,
10159
+ "step": 1450
10160
+ },
10161
+ {
10162
+ "epoch": 0.507076708020269,
10163
+ "grad_norm": 3.3601937294006348,
10164
+ "learning_rate": 1.9103125184569913e-06,
10165
+ "loss": 8.6474,
10166
+ "step": 1451
10167
+ },
10168
+ {
10169
+ "epoch": 0.5074261750829985,
10170
+ "grad_norm": 3.4812333583831787,
10171
+ "learning_rate": 1.883291819999833e-06,
10172
+ "loss": 8.5375,
10173
+ "step": 1452
10174
+ },
10175
+ {
10176
+ "epoch": 0.5077756421457278,
10177
+ "grad_norm": 3.0042903423309326,
10178
+ "learning_rate": 1.8564599159589579e-06,
10179
+ "loss": 8.9881,
10180
+ "step": 1453
10181
+ },
10182
+ {
10183
+ "epoch": 0.5081251092084571,
10184
+ "grad_norm": 3.5207672119140625,
10185
+ "learning_rate": 1.8298169116140907e-06,
10186
+ "loss": 8.3814,
10187
+ "step": 1454
10188
+ },
10189
+ {
10190
+ "epoch": 0.5084745762711864,
10191
+ "grad_norm": 3.828885316848755,
10192
+ "learning_rate": 1.8033629115037699e-06,
10193
+ "loss": 8.5979,
10194
+ "step": 1455
10195
+ },
10196
+ {
10197
+ "epoch": 0.5088240433339157,
10198
+ "grad_norm": 4.460960388183594,
10199
+ "learning_rate": 1.7770980194249442e-06,
10200
+ "loss": 8.5224,
10201
+ "step": 1456
10202
+ },
10203
+ {
10204
+ "epoch": 0.5091735103966452,
10205
+ "grad_norm": 3.7834277153015137,
10206
+ "learning_rate": 1.7510223384325674e-06,
10207
+ "loss": 8.6374,
10208
+ "step": 1457
10209
+ },
10210
+ {
10211
+ "epoch": 0.5095229774593745,
10212
+ "grad_norm": 4.917012691497803,
10213
+ "learning_rate": 1.7251359708391757e-06,
10214
+ "loss": 8.0025,
10215
+ "step": 1458
10216
+ },
10217
+ {
10218
+ "epoch": 0.5098724445221038,
10219
+ "grad_norm": 4.838057994842529,
10220
+ "learning_rate": 1.699439018214527e-06,
10221
+ "loss": 9.2526,
10222
+ "step": 1459
10223
+ },
10224
+ {
10225
+ "epoch": 0.5102219115848331,
10226
+ "grad_norm": 4.332263946533203,
10227
+ "learning_rate": 1.6739315813851464e-06,
10228
+ "loss": 8.5394,
10229
+ "step": 1460
10230
+ },
10231
+ {
10232
+ "epoch": 0.5105713786475624,
10233
+ "grad_norm": 4.917897701263428,
10234
+ "learning_rate": 1.6486137604339813e-06,
10235
+ "loss": 8.6178,
10236
+ "step": 1461
10237
+ },
10238
+ {
10239
+ "epoch": 0.5109208457102918,
10240
+ "grad_norm": 4.234597206115723,
10241
+ "learning_rate": 1.6234856546999855e-06,
10242
+ "loss": 9.4087,
10243
+ "step": 1462
10244
+ },
10245
+ {
10246
+ "epoch": 0.5112703127730212,
10247
+ "grad_norm": 7.1815385818481445,
10248
+ "learning_rate": 1.5985473627777248e-06,
10249
+ "loss": 7.9261,
10250
+ "step": 1463
10251
+ },
10252
+ {
10253
+ "epoch": 0.5116197798357505,
10254
+ "grad_norm": 5.788399696350098,
10255
+ "learning_rate": 1.5737989825169884e-06,
10256
+ "loss": 8.4137,
10257
+ "step": 1464
10258
+ },
10259
+ {
10260
+ "epoch": 0.5119692468984798,
10261
+ "grad_norm": 7.015099048614502,
10262
+ "learning_rate": 1.549240611022429e-06,
10263
+ "loss": 7.5312,
10264
+ "step": 1465
10265
+ },
10266
+ {
10267
+ "epoch": 0.5123187139612092,
10268
+ "grad_norm": 6.456794261932373,
10269
+ "learning_rate": 1.5248723446531721e-06,
10270
+ "loss": 8.62,
10271
+ "step": 1466
10272
+ },
10273
+ {
10274
+ "epoch": 0.5126681810239385,
10275
+ "grad_norm": 5.014076232910156,
10276
+ "learning_rate": 1.5006942790224133e-06,
10277
+ "loss": 10.0075,
10278
+ "step": 1467
10279
+ },
10280
+ {
10281
+ "epoch": 0.5130176480866678,
10282
+ "grad_norm": 7.319535255432129,
10283
+ "learning_rate": 1.4767065089970667e-06,
10284
+ "loss": 7.9803,
10285
+ "step": 1468
10286
+ },
10287
+ {
10288
+ "epoch": 0.5133671151493971,
10289
+ "grad_norm": 9.385567665100098,
10290
+ "learning_rate": 1.4529091286973995e-06,
10291
+ "loss": 8.4043,
10292
+ "step": 1469
10293
+ },
10294
+ {
10295
+ "epoch": 0.5137165822121265,
10296
+ "grad_norm": 6.6454620361328125,
10297
+ "learning_rate": 1.4293022314966431e-06,
10298
+ "loss": 8.6581,
10299
+ "step": 1470
10300
+ },
10301
+ {
10302
+ "epoch": 0.5140660492748559,
10303
+ "grad_norm": 7.449554443359375,
10304
+ "learning_rate": 1.4058859100206211e-06,
10305
+ "loss": 8.2712,
10306
+ "step": 1471
10307
+ },
10308
+ {
10309
+ "epoch": 0.5144155163375852,
10310
+ "grad_norm": 6.720264434814453,
10311
+ "learning_rate": 1.3826602561474277e-06,
10312
+ "loss": 9.1598,
10313
+ "step": 1472
10314
+ },
10315
+ {
10316
+ "epoch": 0.5147649834003145,
10317
+ "grad_norm": 6.422667503356934,
10318
+ "learning_rate": 1.3596253610070165e-06,
10319
+ "loss": 8.7228,
10320
+ "step": 1473
10321
+ },
10322
+ {
10323
+ "epoch": 0.5151144504630438,
10324
+ "grad_norm": 8.237399101257324,
10325
+ "learning_rate": 1.3367813149808729e-06,
10326
+ "loss": 8.0249,
10327
+ "step": 1474
10328
+ },
10329
+ {
10330
+ "epoch": 0.5154639175257731,
10331
+ "grad_norm": 8.319735527038574,
10332
+ "learning_rate": 1.3141282077016592e-06,
10333
+ "loss": 7.7051,
10334
+ "step": 1475
10335
+ },
10336
+ {
10337
+ "epoch": 0.5158133845885026,
10338
+ "grad_norm": 8.198334693908691,
10339
+ "learning_rate": 1.2916661280528542e-06,
10340
+ "loss": 9.7327,
10341
+ "step": 1476
10342
+ },
10343
+ {
10344
+ "epoch": 0.5161628516512319,
10345
+ "grad_norm": 9.559484481811523,
10346
+ "learning_rate": 1.2693951641683854e-06,
10347
+ "loss": 8.1461,
10348
+ "step": 1477
10349
+ },
10350
+ {
10351
+ "epoch": 0.5165123187139612,
10352
+ "grad_norm": 7.050119400024414,
10353
+ "learning_rate": 1.2473154034323254e-06,
10354
+ "loss": 9.848,
10355
+ "step": 1478
10356
+ },
10357
+ {
10358
+ "epoch": 0.5168617857766905,
10359
+ "grad_norm": 8.804779052734375,
10360
+ "learning_rate": 1.2254269324785239e-06,
10361
+ "loss": 8.0188,
10362
+ "step": 1479
10363
+ },
10364
+ {
10365
+ "epoch": 0.5172112528394199,
10366
+ "grad_norm": 7.277517795562744,
10367
+ "learning_rate": 1.203729837190265e-06,
10368
+ "loss": 9.9284,
10369
+ "step": 1480
10370
+ },
10371
+ {
10372
+ "epoch": 0.5175607199021492,
10373
+ "grad_norm": 8.914274215698242,
10374
+ "learning_rate": 1.1822242026999442e-06,
10375
+ "loss": 7.9877,
10376
+ "step": 1481
10377
+ },
10378
+ {
10379
+ "epoch": 0.5179101869648786,
10380
+ "grad_norm": 12.721630096435547,
10381
+ "learning_rate": 1.160910113388719e-06,
10382
+ "loss": 7.6372,
10383
+ "step": 1482
10384
+ },
10385
+ {
10386
+ "epoch": 0.5182596540276079,
10387
+ "grad_norm": 6.860194206237793,
10388
+ "learning_rate": 1.1397876528861927e-06,
10389
+ "loss": 10.591,
10390
+ "step": 1483
10391
+ },
10392
+ {
10393
+ "epoch": 0.5186091210903372,
10394
+ "grad_norm": 9.920988082885742,
10395
+ "learning_rate": 1.1188569040700703e-06,
10396
+ "loss": 8.1528,
10397
+ "step": 1484
10398
+ },
10399
+ {
10400
+ "epoch": 0.5189585881530666,
10401
+ "grad_norm": 7.779238224029541,
10402
+ "learning_rate": 1.0981179490658523e-06,
10403
+ "loss": 8.6199,
10404
+ "step": 1485
10405
+ },
10406
+ {
10407
+ "epoch": 0.5193080552157959,
10408
+ "grad_norm": 8.248661994934082,
10409
+ "learning_rate": 1.077570869246497e-06,
10410
+ "loss": 9.0911,
10411
+ "step": 1486
10412
+ },
10413
+ {
10414
+ "epoch": 0.5196575222785252,
10415
+ "grad_norm": 10.340319633483887,
10416
+ "learning_rate": 1.0572157452321097e-06,
10417
+ "loss": 7.3411,
10418
+ "step": 1487
10419
+ },
10420
+ {
10421
+ "epoch": 0.5200069893412546,
10422
+ "grad_norm": 11.339948654174805,
10423
+ "learning_rate": 1.0370526568896143e-06,
10424
+ "loss": 9.6583,
10425
+ "step": 1488
10426
+ },
10427
+ {
10428
+ "epoch": 0.5203564564039839,
10429
+ "grad_norm": 13.352083206176758,
10430
+ "learning_rate": 1.0170816833324548e-06,
10431
+ "loss": 7.1735,
10432
+ "step": 1489
10433
+ },
10434
+ {
10435
+ "epoch": 0.5207059234667133,
10436
+ "grad_norm": 11.769144058227539,
10437
+ "learning_rate": 9.97302902920283e-07,
10438
+ "loss": 7.3443,
10439
+ "step": 1490
10440
+ },
10441
+ {
10442
+ "epoch": 0.5210553905294426,
10443
+ "grad_norm": 15.129111289978027,
10444
+ "learning_rate": 9.77716393258632e-07,
10445
+ "loss": 6.973,
10446
+ "step": 1491
10447
+ },
10448
+ {
10449
+ "epoch": 0.5214048575921719,
10450
+ "grad_norm": 10.826708793640137,
10451
+ "learning_rate": 9.583222311986551e-07,
10452
+ "loss": 8.8058,
10453
+ "step": 1492
10454
+ },
10455
+ {
10456
+ "epoch": 0.5217543246549012,
10457
+ "grad_norm": 8.656180381774902,
10458
+ "learning_rate": 9.391204928367647e-07,
10459
+ "loss": 9.5484,
10460
+ "step": 1493
10461
+ },
10462
+ {
10463
+ "epoch": 0.5221037917176307,
10464
+ "grad_norm": 8.651819229125977,
10465
+ "learning_rate": 9.20111253514383e-07,
10466
+ "loss": 9.1537,
10467
+ "step": 1494
10468
+ },
10469
+ {
10470
+ "epoch": 0.52245325878036,
10471
+ "grad_norm": 18.103578567504883,
10472
+ "learning_rate": 9.012945878176249e-07,
10473
+ "loss": 8.3238,
10474
+ "step": 1495
10475
+ },
10476
+ {
10477
+ "epoch": 0.5228027258430893,
10478
+ "grad_norm": 10.191046714782715,
10479
+ "learning_rate": 8.826705695770044e-07,
10480
+ "loss": 9.2153,
10481
+ "step": 1496
10482
+ },
10483
+ {
10484
+ "epoch": 0.5231521929058186,
10485
+ "grad_norm": 8.137618064880371,
10486
+ "learning_rate": 8.642392718671455e-07,
10487
+ "loss": 10.0557,
10488
+ "step": 1497
10489
+ },
10490
+ {
10491
+ "epoch": 0.5235016599685479,
10492
+ "grad_norm": 10.658462524414062,
10493
+ "learning_rate": 8.460007670065107e-07,
10494
+ "loss": 9.3731,
10495
+ "step": 1498
10496
+ },
10497
+ {
10498
+ "epoch": 0.5238511270312773,
10499
+ "grad_norm": 14.869357109069824,
10500
+ "learning_rate": 8.279551265571062e-07,
10501
+ "loss": 10.0013,
10502
+ "step": 1499
10503
+ },
10504
+ {
10505
+ "epoch": 0.5242005940940067,
10506
+ "grad_norm": 11.538647651672363,
10507
+ "learning_rate": 8.101024213241826e-07,
10508
+ "loss": 10.5022,
10509
+ "step": 1500
10510
+ },
10511
+ {
10512
+ "epoch": 0.524550061156736,
10513
+ "grad_norm": 3.0151584148406982,
10514
+ "learning_rate": 7.92442721355996e-07,
10515
+ "loss": 9.0469,
10516
+ "step": 1501
10517
+ },
10518
+ {
10519
+ "epoch": 0.5248995282194653,
10520
+ "grad_norm": 3.306246280670166,
10521
+ "learning_rate": 7.749760959435026e-07,
10522
+ "loss": 9.1467,
10523
+ "step": 1502
10524
+ },
10525
+ {
10526
+ "epoch": 0.5252489952821947,
10527
+ "grad_norm": 4.392486572265625,
10528
+ "learning_rate": 7.577026136200926e-07,
10529
+ "loss": 7.8523,
10530
+ "step": 1503
10531
+ },
10532
+ {
10533
+ "epoch": 0.525598462344924,
10534
+ "grad_norm": 4.497766494750977,
10535
+ "learning_rate": 7.40622342161329e-07,
10536
+ "loss": 8.1323,
10537
+ "step": 1504
10538
+ },
10539
+ {
10540
+ "epoch": 0.5259479294076533,
10541
+ "grad_norm": 4.739800453186035,
10542
+ "learning_rate": 7.237353485846865e-07,
10543
+ "loss": 8.2431,
10544
+ "step": 1505
10545
+ },
10546
+ {
10547
+ "epoch": 0.5262973964703827,
10548
+ "grad_norm": 5.242592811584473,
10549
+ "learning_rate": 7.070416991492634e-07,
10550
+ "loss": 9.1518,
10551
+ "step": 1506
10552
+ },
10553
+ {
10554
+ "epoch": 0.526646863533112,
10555
+ "grad_norm": 5.052678108215332,
10556
+ "learning_rate": 6.905414593555482e-07,
10557
+ "loss": 9.0434,
10558
+ "step": 1507
10559
+ },
10560
+ {
10561
+ "epoch": 0.5269963305958414,
10562
+ "grad_norm": 5.126960754394531,
10563
+ "learning_rate": 6.742346939451471e-07,
10564
+ "loss": 8.4479,
10565
+ "step": 1508
10566
+ },
10567
+ {
10568
+ "epoch": 0.5273457976585707,
10569
+ "grad_norm": 3.9989051818847656,
10570
+ "learning_rate": 6.58121466900552e-07,
10571
+ "loss": 8.9057,
10572
+ "step": 1509
10573
+ },
10574
+ {
10575
+ "epoch": 0.5276952647213,
10576
+ "grad_norm": 4.889562129974365,
10577
+ "learning_rate": 6.42201841444845e-07,
10578
+ "loss": 9.1085,
10579
+ "step": 1510
10580
+ },
10581
+ {
10582
+ "epoch": 0.5280447317840293,
10583
+ "grad_norm": 5.401238918304443,
10584
+ "learning_rate": 6.264758800414993e-07,
10585
+ "loss": 8.591,
10586
+ "step": 1511
10587
+ },
10588
+ {
10589
+ "epoch": 0.5283941988467586,
10590
+ "grad_norm": 5.017122745513916,
10591
+ "learning_rate": 6.109436443941075e-07,
10592
+ "loss": 8.6129,
10593
+ "step": 1512
10594
+ },
10595
+ {
10596
+ "epoch": 0.5287436659094881,
10597
+ "grad_norm": 7.77358865737915,
10598
+ "learning_rate": 5.956051954461472e-07,
10599
+ "loss": 8.8282,
10600
+ "step": 1513
10601
+ },
10602
+ {
10603
+ "epoch": 0.5290931329722174,
10604
+ "grad_norm": 7.385234832763672,
10605
+ "learning_rate": 5.804605933807217e-07,
10606
+ "loss": 8.3611,
10607
+ "step": 1514
10608
+ },
10609
+ {
10610
+ "epoch": 0.5294426000349467,
10611
+ "grad_norm": 10.704131126403809,
10612
+ "learning_rate": 5.655098976203698e-07,
10613
+ "loss": 9.2547,
10614
+ "step": 1515
10615
+ },
10616
+ {
10617
+ "epoch": 0.529792067097676,
10618
+ "grad_norm": 6.75469446182251,
10619
+ "learning_rate": 5.50753166826784e-07,
10620
+ "loss": 8.4131,
10621
+ "step": 1516
10622
+ },
10623
+ {
10624
+ "epoch": 0.5301415341604054,
10625
+ "grad_norm": 6.179068088531494,
10626
+ "learning_rate": 5.36190458900615e-07,
10627
+ "loss": 8.3555,
10628
+ "step": 1517
10629
+ },
10630
+ {
10631
+ "epoch": 0.5304910012231348,
10632
+ "grad_norm": 6.363292694091797,
10633
+ "learning_rate": 5.218218309812118e-07,
10634
+ "loss": 9.4896,
10635
+ "step": 1518
10636
+ },
10637
+ {
10638
+ "epoch": 0.5308404682858641,
10639
+ "grad_norm": 5.677951812744141,
10640
+ "learning_rate": 5.076473394464376e-07,
10641
+ "loss": 7.9966,
10642
+ "step": 1519
10643
+ },
10644
+ {
10645
+ "epoch": 0.5311899353485934,
10646
+ "grad_norm": 8.330147743225098,
10647
+ "learning_rate": 4.93667039912421e-07,
10648
+ "loss": 8.5833,
10649
+ "step": 1520
10650
+ },
10651
+ {
10652
+ "epoch": 0.5315394024113227,
10653
+ "grad_norm": 8.095162391662598,
10654
+ "learning_rate": 4.798809872333276e-07,
10655
+ "loss": 7.2753,
10656
+ "step": 1521
10657
+ },
10658
+ {
10659
+ "epoch": 0.5318888694740521,
10660
+ "grad_norm": 7.9382429122924805,
10661
+ "learning_rate": 4.6628923550118273e-07,
10662
+ "loss": 7.8619,
10663
+ "step": 1522
10664
+ },
10665
+ {
10666
+ "epoch": 0.5322383365367814,
10667
+ "grad_norm": 5.361581802368164,
10668
+ "learning_rate": 4.528918380456271e-07,
10669
+ "loss": 9.1448,
10670
+ "step": 1523
10671
+ },
10672
+ {
10673
+ "epoch": 0.5325878035995107,
10674
+ "grad_norm": 6.494990825653076,
10675
+ "learning_rate": 4.3968884743371686e-07,
10676
+ "loss": 8.9046,
10677
+ "step": 1524
10678
+ },
10679
+ {
10680
+ "epoch": 0.5329372706622401,
10681
+ "grad_norm": 6.325202941894531,
10682
+ "learning_rate": 4.2668031546972406e-07,
10683
+ "loss": 9.416,
10684
+ "step": 1525
10685
+ },
10686
+ {
10687
+ "epoch": 0.5332867377249694,
10688
+ "grad_norm": 11.252113342285156,
10689
+ "learning_rate": 4.1386629319492556e-07,
10690
+ "loss": 7.6609,
10691
+ "step": 1526
10692
+ },
10693
+ {
10694
+ "epoch": 0.5336362047876988,
10695
+ "grad_norm": 9.190266609191895,
10696
+ "learning_rate": 4.0124683088740287e-07,
10697
+ "loss": 9.085,
10698
+ "step": 1527
10699
+ },
10700
+ {
10701
+ "epoch": 0.5339856718504281,
10702
+ "grad_norm": 7.811284065246582,
10703
+ "learning_rate": 3.888219780618485e-07,
10704
+ "loss": 9.1781,
10705
+ "step": 1528
10706
+ },
10707
+ {
10708
+ "epoch": 0.5343351389131574,
10709
+ "grad_norm": 7.097583293914795,
10710
+ "learning_rate": 3.7659178346937105e-07,
10711
+ "loss": 9.0826,
10712
+ "step": 1529
10713
+ },
10714
+ {
10715
+ "epoch": 0.5346846059758867,
10716
+ "grad_norm": 7.878072738647461,
10717
+ "learning_rate": 3.6455629509730136e-07,
10718
+ "loss": 8.4619,
10719
+ "step": 1530
10720
+ },
10721
+ {
10722
+ "epoch": 0.5350340730386162,
10723
+ "grad_norm": 9.73438549041748,
10724
+ "learning_rate": 3.527155601690091e-07,
10725
+ "loss": 9.5765,
10726
+ "step": 1531
10727
+ },
10728
+ {
10729
+ "epoch": 0.5353835401013455,
10730
+ "grad_norm": 8.027961730957031,
10731
+ "learning_rate": 3.41069625143714e-07,
10732
+ "loss": 7.9055,
10733
+ "step": 1532
10734
+ },
10735
+ {
10736
+ "epoch": 0.5357330071640748,
10737
+ "grad_norm": 8.409984588623047,
10738
+ "learning_rate": 3.296185357162973e-07,
10739
+ "loss": 8.6043,
10740
+ "step": 1533
10741
+ },
10742
+ {
10743
+ "epoch": 0.5360824742268041,
10744
+ "grad_norm": 7.9217305183410645,
10745
+ "learning_rate": 3.183623368171351e-07,
10746
+ "loss": 7.3746,
10747
+ "step": 1534
10748
+ },
10749
+ {
10750
+ "epoch": 0.5364319412895334,
10751
+ "grad_norm": 10.69045352935791,
10752
+ "learning_rate": 3.073010726119152e-07,
10753
+ "loss": 8.5955,
10754
+ "step": 1535
10755
+ },
10756
+ {
10757
+ "epoch": 0.5367814083522628,
10758
+ "grad_norm": 8.639704704284668,
10759
+ "learning_rate": 2.9643478650146496e-07,
10760
+ "loss": 8.8309,
10761
+ "step": 1536
10762
+ },
10763
+ {
10764
+ "epoch": 0.5371308754149922,
10765
+ "grad_norm": 13.088117599487305,
10766
+ "learning_rate": 2.8576352112158475e-07,
10767
+ "loss": 8.5921,
10768
+ "step": 1537
10769
+ },
10770
+ {
10771
+ "epoch": 0.5374803424777215,
10772
+ "grad_norm": 7.819159030914307,
10773
+ "learning_rate": 2.7528731834287056e-07,
10774
+ "loss": 8.4251,
10775
+ "step": 1538
10776
+ },
10777
+ {
10778
+ "epoch": 0.5378298095404508,
10779
+ "grad_norm": 10.49492359161377,
10780
+ "learning_rate": 2.6500621927054715e-07,
10781
+ "loss": 6.746,
10782
+ "step": 1539
10783
+ },
10784
+ {
10785
+ "epoch": 0.5381792766031801,
10786
+ "grad_norm": 10.173042297363281,
10787
+ "learning_rate": 2.5492026424434043e-07,
10788
+ "loss": 8.3439,
10789
+ "step": 1540
10790
+ },
10791
+ {
10792
+ "epoch": 0.5385287436659095,
10793
+ "grad_norm": 13.773662567138672,
10794
+ "learning_rate": 2.450294928382668e-07,
10795
+ "loss": 7.4032,
10796
+ "step": 1541
10797
+ },
10798
+ {
10799
+ "epoch": 0.5388782107286388,
10800
+ "grad_norm": 9.790109634399414,
10801
+ "learning_rate": 2.3533394386052177e-07,
10802
+ "loss": 8.5924,
10803
+ "step": 1542
10804
+ },
10805
+ {
10806
+ "epoch": 0.5392276777913682,
10807
+ "grad_norm": 12.852411270141602,
10808
+ "learning_rate": 2.2583365535330803e-07,
10809
+ "loss": 8.9465,
10810
+ "step": 1543
10811
+ },
10812
+ {
10813
+ "epoch": 0.5395771448540975,
10814
+ "grad_norm": 8.171817779541016,
10815
+ "learning_rate": 2.1652866459268562e-07,
10816
+ "loss": 9.4262,
10817
+ "step": 1544
10818
+ },
10819
+ {
10820
+ "epoch": 0.5399266119168269,
10821
+ "grad_norm": 10.939376831054688,
10822
+ "learning_rate": 2.0741900808843862e-07,
10823
+ "loss": 8.8726,
10824
+ "step": 1545
10825
+ },
10826
+ {
10827
+ "epoch": 0.5402760789795562,
10828
+ "grad_norm": 8.553930282592773,
10829
+ "learning_rate": 1.9850472158390866e-07,
10830
+ "loss": 9.6799,
10831
+ "step": 1546
10832
+ },
10833
+ {
10834
+ "epoch": 0.5406255460422855,
10835
+ "grad_norm": 10.361742973327637,
10836
+ "learning_rate": 1.897858400558783e-07,
10837
+ "loss": 8.9363,
10838
+ "step": 1547
10839
+ },
10840
+ {
10841
+ "epoch": 0.5409750131050148,
10842
+ "grad_norm": 11.963432312011719,
10843
+ "learning_rate": 1.8126239771442121e-07,
10844
+ "loss": 9.533,
10845
+ "step": 1548
10846
+ },
10847
+ {
10848
+ "epoch": 0.5413244801677441,
10849
+ "grad_norm": 11.194351196289062,
10850
+ "learning_rate": 1.7293442800277448e-07,
10851
+ "loss": 9.6057,
10852
+ "step": 1549
10853
+ },
10854
+ {
10855
+ "epoch": 0.5416739472304736,
10856
+ "grad_norm": 15.605978965759277,
10857
+ "learning_rate": 1.6480196359719425e-07,
10858
+ "loss": 9.9181,
10859
+ "step": 1550
10860
+ },
10861
+ {
10862
+ "epoch": 0.5420234142932029,
10863
+ "grad_norm": 3.3464150428771973,
10864
+ "learning_rate": 1.568650364068447e-07,
10865
+ "loss": 8.0512,
10866
+ "step": 1551
10867
+ },
10868
+ {
10869
+ "epoch": 0.5423728813559322,
10870
+ "grad_norm": 3.7256124019622803,
10871
+ "learning_rate": 1.4912367757366487e-07,
10872
+ "loss": 8.3264,
10873
+ "step": 1552
10874
+ },
10875
+ {
10876
+ "epoch": 0.5427223484186615,
10877
+ "grad_norm": 3.722212553024292,
10878
+ "learning_rate": 1.41577917472252e-07,
10879
+ "loss": 8.6556,
10880
+ "step": 1553
10881
+ },
10882
+ {
10883
+ "epoch": 0.5430718154813908,
10884
+ "grad_norm": 3.712941884994507,
10885
+ "learning_rate": 1.3422778570971738e-07,
10886
+ "loss": 8.3789,
10887
+ "step": 1554
10888
+ },
10889
+ {
10890
+ "epoch": 0.5434212825441203,
10891
+ "grad_norm": 4.432479381561279,
10892
+ "learning_rate": 1.2707331112561393e-07,
10893
+ "loss": 9.1147,
10894
+ "step": 1555
10895
+ },
10896
+ {
10897
+ "epoch": 0.5437707496068496,
10898
+ "grad_norm": 4.570591449737549,
10899
+ "learning_rate": 1.2011452179178652e-07,
10900
+ "loss": 8.7669,
10901
+ "step": 1556
10902
+ },
10903
+ {
10904
+ "epoch": 0.5441202166695789,
10905
+ "grad_norm": 4.474040508270264,
10906
+ "learning_rate": 1.1335144501227191e-07,
10907
+ "loss": 8.9163,
10908
+ "step": 1557
10909
+ },
10910
+ {
10911
+ "epoch": 0.5444696837323082,
10912
+ "grad_norm": 5.401274681091309,
10913
+ "learning_rate": 1.0678410732319899e-07,
10914
+ "loss": 8.6825,
10915
+ "step": 1558
10916
+ },
10917
+ {
10918
+ "epoch": 0.5448191507950376,
10919
+ "grad_norm": 4.803933620452881,
10920
+ "learning_rate": 1.0041253449268318e-07,
10921
+ "loss": 8.7886,
10922
+ "step": 1559
10923
+ },
10924
+ {
10925
+ "epoch": 0.5451686178577669,
10926
+ "grad_norm": 5.4671759605407715,
10927
+ "learning_rate": 9.423675152071543e-08,
10928
+ "loss": 9.0206,
10929
+ "step": 1560
10930
+ },
10931
+ {
10932
+ "epoch": 0.5455180849204962,
10933
+ "grad_norm": 4.31005859375,
10934
+ "learning_rate": 8.82567826390679e-08,
10935
+ "loss": 9.3175,
10936
+ "step": 1561
10937
+ },
10938
+ {
10939
+ "epoch": 0.5458675519832256,
10940
+ "grad_norm": 7.8316969871521,
10941
+ "learning_rate": 8.247265131121618e-08,
10942
+ "loss": 6.3089,
10943
+ "step": 1562
10944
+ },
10945
+ {
10946
+ "epoch": 0.5462170190459549,
10947
+ "grad_norm": 8.249174118041992,
10948
+ "learning_rate": 7.688438023221722e-08,
10949
+ "loss": 7.9301,
10950
+ "step": 1563
10951
+ },
10952
+ {
10953
+ "epoch": 0.5465664861086843,
10954
+ "grad_norm": 4.6625776290893555,
10955
+ "learning_rate": 7.149199132863716e-08,
10956
+ "loss": 9.0896,
10957
+ "step": 1564
10958
+ },
10959
+ {
10960
+ "epoch": 0.5469159531714136,
10961
+ "grad_norm": 9.107648849487305,
10962
+ "learning_rate": 6.629550575847354e-08,
10963
+ "loss": 7.1158,
10964
+ "step": 1565
10965
+ },
10966
+ {
10967
+ "epoch": 0.5472654202341429,
10968
+ "grad_norm": 6.1903605461120605,
10969
+ "learning_rate": 6.129494391105551e-08,
10970
+ "loss": 8.3276,
10971
+ "step": 1566
10972
+ },
10973
+ {
10974
+ "epoch": 0.5476148872968722,
10975
+ "grad_norm": 8.7449312210083,
10976
+ "learning_rate": 5.6490325406971524e-08,
10977
+ "loss": 8.4082,
10978
+ "step": 1567
10979
+ },
10980
+ {
10981
+ "epoch": 0.5479643543596016,
10982
+ "grad_norm": 6.001641273498535,
10983
+ "learning_rate": 5.188166909799175e-08,
10984
+ "loss": 9.3425,
10985
+ "step": 1568
10986
+ },
10987
+ {
10988
+ "epoch": 0.548313821422331,
10989
+ "grad_norm": 8.049176216125488,
10990
+ "learning_rate": 4.7468993066995814e-08,
10991
+ "loss": 8.3586,
10992
+ "step": 1569
10993
+ },
10994
+ {
10995
+ "epoch": 0.5486632884850603,
10996
+ "grad_norm": 6.374185562133789,
10997
+ "learning_rate": 4.325231462790069e-08,
10998
+ "loss": 8.5172,
10999
+ "step": 1570
11000
+ },
11001
+ {
11002
+ "epoch": 0.5490127555477896,
11003
+ "grad_norm": 7.79896879196167,
11004
+ "learning_rate": 3.923165032558296e-08,
11005
+ "loss": 7.4129,
11006
+ "step": 1571
11007
+ },
11008
+ {
11009
+ "epoch": 0.5493622226105189,
11010
+ "grad_norm": 7.130568027496338,
11011
+ "learning_rate": 3.540701593583995e-08,
11012
+ "loss": 8.9294,
11013
+ "step": 1572
11014
+ },
11015
+ {
11016
+ "epoch": 0.5497116896732483,
11017
+ "grad_norm": 10.37874698638916,
11018
+ "learning_rate": 3.1778426465289834e-08,
11019
+ "loss": 8.2947,
11020
+ "step": 1573
11021
+ },
11022
+ {
11023
+ "epoch": 0.5500611567359777,
11024
+ "grad_norm": 9.546464920043945,
11025
+ "learning_rate": 2.834589615135497e-08,
11026
+ "loss": 7.66,
11027
+ "step": 1574
11028
+ },
11029
+ {
11030
+ "epoch": 0.550410623798707,
11031
+ "grad_norm": 6.831233501434326,
11032
+ "learning_rate": 2.510943846216751e-08,
11033
+ "loss": 9.2387,
11034
+ "step": 1575
11035
+ },
11036
+ {
11037
+ "epoch": 0.5507600908614363,
11038
+ "grad_norm": 7.624803066253662,
11039
+ "learning_rate": 2.2069066096552794e-08,
11040
+ "loss": 8.6521,
11041
+ "step": 1576
11042
+ },
11043
+ {
11044
+ "epoch": 0.5511095579241656,
11045
+ "grad_norm": 8.880688667297363,
11046
+ "learning_rate": 1.9224790983940478e-08,
11047
+ "loss": 8.6401,
11048
+ "step": 1577
11049
+ },
11050
+ {
11051
+ "epoch": 0.551459024986895,
11052
+ "grad_norm": 7.104653835296631,
11053
+ "learning_rate": 1.6576624284347918e-08,
11054
+ "loss": 8.5249,
11055
+ "step": 1578
11056
+ },
11057
+ {
11058
+ "epoch": 0.5518084920496243,
11059
+ "grad_norm": 11.853543281555176,
11060
+ "learning_rate": 1.4124576388324651e-08,
11061
+ "loss": 8.1629,
11062
+ "step": 1579
11063
+ },
11064
+ {
11065
+ "epoch": 0.5521579591123537,
11066
+ "grad_norm": 12.255044937133789,
11067
+ "learning_rate": 1.1868656916919075e-08,
11068
+ "loss": 9.0506,
11069
+ "step": 1580
11070
+ },
11071
+ {
11072
+ "epoch": 0.552507426175083,
11073
+ "grad_norm": 6.950803756713867,
11074
+ "learning_rate": 9.808874721628503e-09,
11075
+ "loss": 8.6808,
11076
+ "step": 1581
11077
+ },
11078
+ {
11079
+ "epoch": 0.5528568932378123,
11080
+ "grad_norm": 7.39617395401001,
11081
+ "learning_rate": 7.945237884371404e-09,
11082
+ "loss": 9.157,
11083
+ "step": 1582
11084
+ },
11085
+ {
11086
+ "epoch": 0.5532063603005417,
11087
+ "grad_norm": 7.782493591308594,
11088
+ "learning_rate": 6.2777537174596446e-09,
11089
+ "loss": 8.477,
11090
+ "step": 1583
11091
+ },
11092
+ {
11093
+ "epoch": 0.553555827363271,
11094
+ "grad_norm": 9.773700714111328,
11095
+ "learning_rate": 4.806428763559634e-09,
11096
+ "loss": 8.7344,
11097
+ "step": 1584
11098
+ },
11099
+ {
11100
+ "epoch": 0.5539052944260003,
11101
+ "grad_norm": 12.035295486450195,
11102
+ "learning_rate": 3.5312687956756685e-09,
11103
+ "loss": 9.5495,
11104
+ "step": 1585
11105
+ },
11106
+ {
11107
+ "epoch": 0.5542547614887297,
11108
+ "grad_norm": 8.741121292114258,
11109
+ "learning_rate": 2.4522788171221777e-09,
11110
+ "loss": 8.2597,
11111
+ "step": 1586
11112
+ },
11113
+ {
11114
+ "epoch": 0.5546042285514591,
11115
+ "grad_norm": 12.165633201599121,
11116
+ "learning_rate": 1.5694630615070704e-09,
11117
+ "loss": 9.357,
11118
+ "step": 1587
11119
+ },
11120
+ {
11121
+ "epoch": 0.5549536956141884,
11122
+ "grad_norm": 6.861973762512207,
11123
+ "learning_rate": 8.828249927206322e-10,
11124
+ "loss": 8.7628,
11125
+ "step": 1588
11126
+ },
11127
+ {
11128
+ "epoch": 0.5553031626769177,
11129
+ "grad_norm": 7.887958526611328,
11130
+ "learning_rate": 3.923673048966681e-10,
11131
+ "loss": 9.156,
11132
+ "step": 1589
11133
+ },
11134
+ {
11135
+ "epoch": 0.555652629739647,
11136
+ "grad_norm": 7.190601348876953,
11137
+ "learning_rate": 9.809192244580878e-11,
11138
+ "loss": 9.0351,
11139
+ "step": 1590
11140
+ },
11141
+ {
11142
+ "epoch": 0.5560020968023763,
11143
+ "grad_norm": 10.366061210632324,
11144
+ "learning_rate": 0.0,
11145
+ "loss": 8.0351,
11146
+ "step": 1591
11147
  }
11148
  ],
11149
  "logging_steps": 1,
 
11158
  "should_evaluate": false,
11159
  "should_log": false,
11160
  "should_save": true,
11161
+ "should_training_stop": true
11162
  },
11163
  "attributes": {}
11164
  }
11165
  },
11166
+ "total_flos": 1.115399051777409e+18,
11167
  "train_batch_size": 4,
11168
  "trial_name": null,
11169
  "trial_params": null