Update README.md
Browse files
README.md
CHANGED
@@ -10,7 +10,7 @@ language:
|
|
10 |
# Mistral-Nemo-Instruct-2407-FP8
|
11 |
|
12 |
## Model Overview
|
13 |
-
- **Model Architecture:**
|
14 |
- **Input:** Text
|
15 |
- **Output:** Text
|
16 |
- **Model Optimizations:**
|
@@ -24,7 +24,7 @@ language:
|
|
24 |
- **Model Developers:** Neural Magic
|
25 |
|
26 |
Quantized version of [Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407).
|
27 |
-
It achieves an average score of
|
28 |
|
29 |
### Model Optimizations
|
30 |
|
@@ -78,7 +78,7 @@ from transformers import AutoTokenizer
|
|
78 |
|
79 |
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
|
80 |
|
81 |
-
pretrained_model_dir = "
|
82 |
quantized_model_dir = "Mistral-Nemo-Instruct-2407-FP8"
|
83 |
|
84 |
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, model_max_length=4096)
|
@@ -130,71 +130,71 @@ lm_eval \
|
|
130 |
<tr>
|
131 |
<td>MMLU (5-shot)
|
132 |
</td>
|
133 |
-
<td>
|
134 |
</td>
|
135 |
-
<td>
|
136 |
</td>
|
137 |
-
<td>
|
138 |
</td>
|
139 |
</tr>
|
140 |
<tr>
|
141 |
<td>ARC Challenge (25-shot)
|
142 |
</td>
|
143 |
-
<td>
|
144 |
</td>
|
145 |
-
<td>
|
146 |
</td>
|
147 |
-
<td>98.
|
148 |
</td>
|
149 |
</tr>
|
150 |
<tr>
|
151 |
<td>GSM-8K (5-shot, strict-match)
|
152 |
</td>
|
153 |
-
<td>
|
154 |
</td>
|
155 |
-
<td>73.
|
156 |
</td>
|
157 |
-
<td>
|
158 |
</td>
|
159 |
</tr>
|
160 |
<tr>
|
161 |
<td>Hellaswag (10-shot)
|
162 |
</td>
|
163 |
-
<td>
|
164 |
</td>
|
165 |
-
<td>
|
166 |
</td>
|
167 |
-
<td>99.
|
168 |
</td>
|
169 |
</tr>
|
170 |
<tr>
|
171 |
<td>Winogrande (5-shot)
|
172 |
</td>
|
173 |
-
<td>
|
174 |
</td>
|
175 |
-
<td>
|
176 |
</td>
|
177 |
-
<td>100.
|
178 |
</td>
|
179 |
</tr>
|
180 |
<tr>
|
181 |
<td>TruthfulQA (0-shot)
|
182 |
</td>
|
183 |
-
<td>
|
184 |
</td>
|
185 |
-
<td>
|
186 |
</td>
|
187 |
-
<td>
|
188 |
</td>
|
189 |
</tr>
|
190 |
<tr>
|
191 |
<td><strong>Average</strong>
|
192 |
</td>
|
193 |
-
<td><strong>
|
194 |
</td>
|
195 |
-
<td><strong>
|
196 |
</td>
|
197 |
-
<td><strong>99.
|
198 |
</td>
|
199 |
</tr>
|
200 |
</table>
|
|
|
10 |
# Mistral-Nemo-Instruct-2407-FP8
|
11 |
|
12 |
## Model Overview
|
13 |
+
- **Model Architecture:** Mistral-Nemo
|
14 |
- **Input:** Text
|
15 |
- **Output:** Text
|
16 |
- **Model Optimizations:**
|
|
|
24 |
- **Model Developers:** Neural Magic
|
25 |
|
26 |
Quantized version of [Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407).
|
27 |
+
The quantized model achieves an average score of 71.28 on the [OpenLLM](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) benchmark (version 1), whereas the unquantized model achieves 71.61.
|
28 |
|
29 |
### Model Optimizations
|
30 |
|
|
|
78 |
|
79 |
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
|
80 |
|
81 |
+
pretrained_model_dir = "mistralai/Mistral-Nemo-Instruct-2407"
|
82 |
quantized_model_dir = "Mistral-Nemo-Instruct-2407-FP8"
|
83 |
|
84 |
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, model_max_length=4096)
|
|
|
130 |
<tr>
|
131 |
<td>MMLU (5-shot)
|
132 |
</td>
|
133 |
+
<td>68.35
|
134 |
</td>
|
135 |
+
<td>68.50
|
136 |
</td>
|
137 |
+
<td>100.2%
|
138 |
</td>
|
139 |
</tr>
|
140 |
<tr>
|
141 |
<td>ARC Challenge (25-shot)
|
142 |
</td>
|
143 |
+
<td>65.53
|
144 |
</td>
|
145 |
+
<td>64.68
|
146 |
</td>
|
147 |
+
<td>98.70%
|
148 |
</td>
|
149 |
</tr>
|
150 |
<tr>
|
151 |
<td>GSM-8K (5-shot, strict-match)
|
152 |
</td>
|
153 |
+
<td>74.45
|
154 |
</td>
|
155 |
+
<td>73.01
|
156 |
</td>
|
157 |
+
<td>98.06%
|
158 |
</td>
|
159 |
</tr>
|
160 |
<tr>
|
161 |
<td>Hellaswag (10-shot)
|
162 |
</td>
|
163 |
+
<td>84.32
|
164 |
</td>
|
165 |
+
<td>84.18
|
166 |
</td>
|
167 |
+
<td>99.83%
|
168 |
</td>
|
169 |
</tr>
|
170 |
<tr>
|
171 |
<td>Winogrande (5-shot)
|
172 |
</td>
|
173 |
+
<td>82.16
|
174 |
</td>
|
175 |
+
<td>82.32
|
176 |
</td>
|
177 |
+
<td>100.1%
|
178 |
</td>
|
179 |
</tr>
|
180 |
<tr>
|
181 |
<td>TruthfulQA (0-shot)
|
182 |
</td>
|
183 |
+
<td>54.85
|
184 |
</td>
|
185 |
+
<td>54.96
|
186 |
</td>
|
187 |
+
<td>100.2%
|
188 |
</td>
|
189 |
</tr>
|
190 |
<tr>
|
191 |
<td><strong>Average</strong>
|
192 |
</td>
|
193 |
+
<td><strong>71.61</strong>
|
194 |
</td>
|
195 |
+
<td><strong>71.28</strong>
|
196 |
</td>
|
197 |
+
<td><strong>99.53%</strong>
|
198 |
</td>
|
199 |
</tr>
|
200 |
</table>
|