Upload 11 files

Browse files

Files changed (4) hide show

optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +252 -0

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:872d7b9701d8d7b2a9df16eb5c7940013ae4c0ab32d96039b3651ba65e7c70ba
+size 150346986

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:97e92e08cd1c210745787fb53c3aec8b820b463c1f5d67263a0c4fecb69f69ca
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b2e7f09e07aa3c7be669ba27bfff15efcc0eca70c4fe2d063cc724177dea8a7
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,252 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.986425339366516,
+  "eval_steps": 500,
+  "global_step": 330,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09,
+      "grad_norm": 14.758803367614746,
+      "learning_rate": 0.0002,
+      "loss": 23.3472,
+      "step": 10
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 59.627952575683594,
+      "learning_rate": 0.0002,
+      "loss": 15.2238,
+      "step": 20
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 4.7137131690979,
+      "learning_rate": 0.0002,
+      "loss": 2.488,
+      "step": 30
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.8212027549743652,
+      "learning_rate": 0.0002,
+      "loss": 1.0811,
+      "step": 40
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.546755313873291,
+      "learning_rate": 0.0002,
+      "loss": 0.8055,
+      "step": 50
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.0168941020965576,
+      "learning_rate": 0.0002,
+      "loss": 0.705,
+      "step": 60
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.9273145794868469,
+      "learning_rate": 0.0002,
+      "loss": 0.6457,
+      "step": 70
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.7880547642707825,
+      "learning_rate": 0.0002,
+      "loss": 0.5931,
+      "step": 80
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.8284961581230164,
+      "learning_rate": 0.0002,
+      "loss": 0.5887,
+      "step": 90
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.7326173782348633,
+      "learning_rate": 0.0002,
+      "loss": 0.5627,
+      "step": 100
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.3316080570220947,
+      "learning_rate": 0.0002,
+      "loss": 0.5699,
+      "step": 110
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.7548896074295044,
+      "learning_rate": 0.0002,
+      "loss": 0.5371,
+      "step": 120
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.612474262714386,
+      "learning_rate": 0.0002,
+      "loss": 0.479,
+      "step": 130
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.8262041807174683,
+      "learning_rate": 0.0002,
+      "loss": 0.5065,
+      "step": 140
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.9287819862365723,
+      "learning_rate": 0.0002,
+      "loss": 0.4441,
+      "step": 150
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.7204004526138306,
+      "learning_rate": 0.0002,
+      "loss": 0.5144,
+      "step": 160
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.5320543646812439,
+      "learning_rate": 0.0002,
+      "loss": 0.4624,
+      "step": 170
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.5815476775169373,
+      "learning_rate": 0.0002,
+      "loss": 0.4536,
+      "step": 180
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.4539355933666229,
+      "learning_rate": 0.0002,
+      "loss": 0.5108,
+      "step": 190
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.8424472212791443,
+      "learning_rate": 0.0002,
+      "loss": 0.4603,
+      "step": 200
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.485895574092865,
+      "learning_rate": 0.0002,
+      "loss": 0.4843,
+      "step": 210
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.4039005935192108,
+      "learning_rate": 0.0002,
+      "loss": 0.4872,
+      "step": 220
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 1.0310947895050049,
+      "learning_rate": 0.0002,
+      "loss": 0.3785,
+      "step": 230
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 0.635901153087616,
+      "learning_rate": 0.0002,
+      "loss": 0.3905,
+      "step": 240
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.38871026039123535,
+      "learning_rate": 0.0002,
+      "loss": 0.3745,
+      "step": 250
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 0.5501377582550049,
+      "learning_rate": 0.0002,
+      "loss": 0.3759,
+      "step": 260
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.6457089781761169,
+      "learning_rate": 0.0002,
+      "loss": 0.394,
+      "step": 270
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 0.8598196506500244,
+      "learning_rate": 0.0002,
+      "loss": 0.3909,
+      "step": 280
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 0.5458590984344482,
+      "learning_rate": 0.0002,
+      "loss": 0.3725,
+      "step": 290
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 0.6310967803001404,
+      "learning_rate": 0.0002,
+      "loss": 0.3884,
+      "step": 300
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 1.0128086805343628,
+      "learning_rate": 0.0002,
+      "loss": 0.3829,
+      "step": 310
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 0.5322939157485962,
+      "learning_rate": 0.0002,
+      "loss": 0.3866,
+      "step": 320
+    },
+    {
+      "epoch": 2.99,
+      "grad_norm": 0.5951926708221436,
+      "learning_rate": 0.0002,
+      "loss": 0.3494,
+      "step": 330
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 330,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "total_flos": 9.290508651144806e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}