winnieyangwannan committed · verified
Commit 99d8b29 · Parent(s): 67af3a0

Training in progress, step 200, checkpoint

checkpoint-200/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "v_proj",
-    "q_proj",
-    "gate_proj",
     "up_proj",
     "o_proj",
+    "v_proj",
+    "q_proj",
+    "down_proj",
     "k_proj",
-    "down_proj"
+    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
checkpoint-200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2407c24f84696be3b8ddd326bb5a9a04e3f0d7c30f356f5ba249e3715c4f4aaf
+oid sha256:a68d2a917dab0749fe171b3ed511ad1fe4731403f257556d544e622af266ca6f
 size 167832240
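
These entries are Git LFS pointer files: only the object id (a SHA-256 of the file contents) and the byte size are versioned, so a changed oid with an unchanged size means the adapter weights were overwritten in place by this checkpoint. A minimal sketch, assuming the artifact has been downloaded to the path shown, for checking that it matches the new pointer oid:

import hashlib

def lfs_sha256(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash the file in chunks so large checkpoints need not fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Expected to print the new oid from this commit:
# a68d2a917dab0749fe171b3ed511ad1fe4731403f257556d544e622af266ca6f
print(lfs_sha256("checkpoint-200/adapter_model.safetensors"))
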
checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5e7d77c7b3a5a16fa9815515d5f44a619b4cf9328df84d68891ea2b7d4260c51
+oid sha256:629cc631973f3bbc4f03185bbc70261d9d258133eecf4047d816642b703057cc
 size 335922386
checkpoint-200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c680c82dfa64ac09856e13da41e1d9bbb32846404937891b317baed35a32980c
+oid sha256:e39b169f743575a657d82b0bb4e1ba932e8cd4a88f11c5f8a87f46aaaeeed639
 size 14244
checkpoint-200/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1ff3645a8ea38fb48d27fe4a4bdad8f5b91993325afa3cd2e0307ada985e6716
+oid sha256:df94ae7fb9621b11e5c9bdcc14e69064bab6aec53757509e04f89e6c7812516d
 size 1064
checkpoint-200/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.21008403361344538,
+  "epoch": 0.2107481559536354,
   "eval_steps": 50,
   "global_step": 200,
   "is_hyper_param_search": false,
@@ -9,180 +9,180 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.01050420168067227,
-      "grad_norm": 2.5582146644592285,
-      "learning_rate": 4.98249299719888e-05,
-      "loss": 1.6787,
+      "epoch": 0.01053740779768177,
+      "grad_norm": 3.070249080657959,
+      "learning_rate": 4.982437653670531e-05,
+      "loss": 1.7879,
       "step": 10
     },
     {
-      "epoch": 0.02100840336134454,
-      "grad_norm": 0.9345070719718933,
-      "learning_rate": 4.96498599439776e-05,
-      "loss": 0.518,
+      "epoch": 0.02107481559536354,
+      "grad_norm": 1.702326774597168,
+      "learning_rate": 4.964875307341061e-05,
+      "loss": 0.5567,
       "step": 20
     },
     {
-      "epoch": 0.031512605042016806,
-      "grad_norm": 1.6358414888381958,
-      "learning_rate": 4.947478991596639e-05,
-      "loss": 0.4604,
+      "epoch": 0.03161222339304531,
+      "grad_norm": 1.1947294473648071,
+      "learning_rate": 4.947312961011591e-05,
+      "loss": 0.4493,
       "step": 30
     },
     {
-      "epoch": 0.04201680672268908,
-      "grad_norm": 0.7778844237327576,
-      "learning_rate": 4.9299719887955186e-05,
-      "loss": 0.3771,
+      "epoch": 0.04214963119072708,
+      "grad_norm": 0.9556658267974854,
+      "learning_rate": 4.929750614682122e-05,
+      "loss": 0.3728,
       "step": 40
     },
     {
-      "epoch": 0.052521008403361345,
-      "grad_norm": 0.7006077766418457,
-      "learning_rate": 4.912464985994398e-05,
-      "loss": 0.3842,
+      "epoch": 0.05268703898840885,
+      "grad_norm": 0.7952510714530945,
+      "learning_rate": 4.9121882683526524e-05,
+      "loss": 0.3535,
       "step": 50
     },
     {
-      "epoch": 0.052521008403361345,
-      "eval_loss": 0.42603224515914917,
-      "eval_runtime": 13.5673,
-      "eval_samples_per_second": 35.379,
-      "eval_steps_per_second": 2.211,
+      "epoch": 0.05268703898840885,
+      "eval_loss": 0.4311191439628601,
+      "eval_runtime": 13.6539,
+      "eval_samples_per_second": 35.155,
+      "eval_steps_per_second": 2.197,
       "step": 50
     },
     {
-      "epoch": 0.06302521008403361,
-      "grad_norm": 0.6415153741836548,
-      "learning_rate": 4.8949579831932775e-05,
-      "loss": 0.3399,
+      "epoch": 0.06322444678609063,
+      "grad_norm": 0.6962826251983643,
+      "learning_rate": 4.894625922023183e-05,
+      "loss": 0.3507,
       "step": 60
     },
     {
-      "epoch": 0.07352941176470588,
-      "grad_norm": 0.6030780076980591,
-      "learning_rate": 4.877450980392157e-05,
-      "loss": 0.3447,
+      "epoch": 0.0737618545837724,
+      "grad_norm": 0.6941961646080017,
+      "learning_rate": 4.877063575693713e-05,
+      "loss": 0.3585,
       "step": 70
     },
     {
-      "epoch": 0.08403361344537816,
-      "grad_norm": 0.688852071762085,
-      "learning_rate": 4.859943977591036e-05,
-      "loss": 0.3219,
+      "epoch": 0.08429926238145416,
+      "grad_norm": 0.6864392757415771,
+      "learning_rate": 4.8595012293642434e-05,
+      "loss": 0.3496,
       "step": 80
     },
     {
-      "epoch": 0.09453781512605042,
-      "grad_norm": 0.6371557712554932,
-      "learning_rate": 4.8424369747899164e-05,
-      "loss": 0.3379,
+      "epoch": 0.09483667017913593,
+      "grad_norm": 0.7322937846183777,
+      "learning_rate": 4.841938883034774e-05,
+      "loss": 0.3295,
       "step": 90
     },
     {
-      "epoch": 0.10504201680672269,
-      "grad_norm": 0.7739270329475403,
-      "learning_rate": 4.824929971988796e-05,
-      "loss": 0.3177,
+      "epoch": 0.1053740779768177,
+      "grad_norm": 0.6921488046646118,
+      "learning_rate": 4.824376536705304e-05,
+      "loss": 0.3357,
       "step": 100
     },
     {
-      "epoch": 0.10504201680672269,
-      "eval_loss": 0.3801896274089813,
-      "eval_runtime": 13.6107,
-      "eval_samples_per_second": 35.266,
-      "eval_steps_per_second": 2.204,
+      "epoch": 0.1053740779768177,
+      "eval_loss": 0.39120009541511536,
+      "eval_runtime": 13.7031,
+      "eval_samples_per_second": 35.029,
+      "eval_steps_per_second": 2.189,
       "step": 100
     },
     {
-      "epoch": 0.11554621848739496,
-      "grad_norm": 0.649507462978363,
-      "learning_rate": 4.807422969187675e-05,
-      "loss": 0.3415,
+      "epoch": 0.11591148577449947,
+      "grad_norm": 0.6553240418434143,
+      "learning_rate": 4.8068141903758344e-05,
+      "loss": 0.3105,
       "step": 110
     },
     {
-      "epoch": 0.12605042016806722,
-      "grad_norm": 0.594717264175415,
-      "learning_rate": 4.7899159663865554e-05,
-      "loss": 0.3325,
+      "epoch": 0.12644889357218125,
+      "grad_norm": 0.5637819170951843,
+      "learning_rate": 4.789251844046364e-05,
+      "loss": 0.3164,
       "step": 120
     },
     {
-      "epoch": 0.13655462184873948,
-      "grad_norm": 0.627918541431427,
-      "learning_rate": 4.772408963585435e-05,
-      "loss": 0.3222,
+      "epoch": 0.136986301369863,
+      "grad_norm": 0.6341928839683533,
+      "learning_rate": 4.7716894977168955e-05,
+      "loss": 0.304,
       "step": 130
     },
     {
-      "epoch": 0.14705882352941177,
-      "grad_norm": 0.5384674668312073,
-      "learning_rate": 4.7549019607843135e-05,
-      "loss": 0.3426,
+      "epoch": 0.1475237091675448,
+      "grad_norm": 0.5917785167694092,
+      "learning_rate": 4.754127151387426e-05,
+      "loss": 0.3234,
       "step": 140
     },
     {
-      "epoch": 0.15756302521008403,
-      "grad_norm": 0.5673420429229736,
-      "learning_rate": 4.7373949579831936e-05,
-      "loss": 0.3061,
+      "epoch": 0.15806111696522657,
+      "grad_norm": 0.5884453654289246,
+      "learning_rate": 4.736564805057956e-05,
+      "loss": 0.317,
       "step": 150
     },
     {
-      "epoch": 0.15756302521008403,
-      "eval_loss": 0.3653399348258972,
-      "eval_runtime": 13.5947,
-      "eval_samples_per_second": 35.308,
-      "eval_steps_per_second": 2.207,
+      "epoch": 0.15806111696522657,
+      "eval_loss": 0.37688738107681274,
+      "eval_runtime": 13.6535,
+      "eval_samples_per_second": 35.156,
+      "eval_steps_per_second": 2.197,
       "step": 150
     },
     {
-      "epoch": 0.16806722689075632,
-      "grad_norm": 0.6111018657684326,
-      "learning_rate": 4.719887955182073e-05,
-      "loss": 0.3271,
+      "epoch": 0.16859852476290832,
+      "grad_norm": 0.5819964408874512,
+      "learning_rate": 4.7190024587284866e-05,
+      "loss": 0.2992,
       "step": 160
     },
     {
-      "epoch": 0.17857142857142858,
-      "grad_norm": 0.7422594428062439,
-      "learning_rate": 4.7023809523809525e-05,
-      "loss": 0.315,
+      "epoch": 0.1791359325605901,
+      "grad_norm": 0.689468264579773,
+      "learning_rate": 4.7014401123990165e-05,
+      "loss": 0.3168,
       "step": 170
     },
     {
-      "epoch": 0.18907563025210083,
-      "grad_norm": 0.7226534485816956,
-      "learning_rate": 4.684873949579832e-05,
-      "loss": 0.3031,
+      "epoch": 0.18967334035827185,
+      "grad_norm": 0.6950872540473938,
+      "learning_rate": 4.683877766069547e-05,
+      "loss": 0.3041,
       "step": 180
     },
     {
-      "epoch": 0.19957983193277312,
-      "grad_norm": 0.6302976012229919,
-      "learning_rate": 4.667366946778712e-05,
-      "loss": 0.3161,
+      "epoch": 0.20021074815595363,
+      "grad_norm": 0.8322122097015381,
+      "learning_rate": 4.6663154197400776e-05,
+      "loss": 0.3028,
       "step": 190
     },
     {
-      "epoch": 0.21008403361344538,
-      "grad_norm": 0.6225076913833618,
-      "learning_rate": 4.6498599439775914e-05,
-      "loss": 0.3038,
+      "epoch": 0.2107481559536354,
+      "grad_norm": 0.5850774645805359,
+      "learning_rate": 4.6487530734106075e-05,
+      "loss": 0.2992,
       "step": 200
     },
     {
-      "epoch": 0.21008403361344538,
-      "eval_loss": 0.35061606764793396,
-      "eval_runtime": 13.5616,
-      "eval_samples_per_second": 35.394,
-      "eval_steps_per_second": 2.212,
+      "epoch": 0.2107481559536354,
+      "eval_loss": 0.36230018734931946,
+      "eval_runtime": 13.6165,
+      "eval_samples_per_second": 35.251,
+      "eval_steps_per_second": 2.203,
       "step": 200
     }
   ],
   "logging_steps": 10,
-  "max_steps": 2856,
+  "max_steps": 2847,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 100,
@@ -198,7 +198,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.383530248228045e+16,
+  "total_flos": 2.3812843003969536e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null
checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5894a90f0aacad19de132730666f8b4647a0c4aa14309866a5f87d3723ce6a7
+oid sha256:b5fa9d8b62d1ebe6967a504a7decdb5eeee2bb4aac96e7414f3930f9adcff095
 size 5880