Vasily Alexeev committed on
Commit f7750ae · 1 Parent(s): 1a27dec

add metrics and examples in readme

Files changed (1)
  1. README.md +224 -0
README.md CHANGED

---
base_model: NousResearch/Meta-Llama-3-8B-Instruct
model_type: llama
pipeline_tag: text-generation
quantized_by: Compressa
license: other
license_name: llama3
license_link: https://llama.meta.com/llama3/license
tags:
- llama3
- omniquant
- gptq
- triton
---


# Llama 3 8B Instruct – OmniQuant

Based on [Llama 3 8B Instruct](https://huggingface.co/NousResearch/Meta-Llama-3-8B-Instruct).

Quantized with [OmniQuant](https://github.com/OpenGVLab/OmniQuant).

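The checkpoint stores its quantization settings in the `quantization_config` section of `config.json`; the loader in the Examples section below reads the bit width and group size from there. For example:

```python
from transformers import AutoConfig

# Inspect the quantization settings shipped with the checkpoint
# (field names beyond 'bits' and 'group_size' may vary).
config = AutoConfig.from_pretrained("compressa-ai/Llama-3-8B-Instruct-OmniQuant")
print(config.quantization_config["bits"], config.quantization_config["group_size"])
```
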

## Evaluation

### PPL (↓)

|               | wiki |
| ------------- | ---- |
| FP            | 8.29 |
| **Quantized** | 9.15 |

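The evaluation script behind this table is not included here; the sketch below shows one common way to measure perplexity on a WikiText-style corpus with a fixed context window, assuming the `load_model` helper from the Examples section and the Hugging Face `datasets` package. Results will not necessarily match the table exactly.

```python
import torch
from datasets import load_dataset


@torch.no_grad()
def wikitext_ppl(model, tokenizer, seq_len=2048):
    # Concatenate the raw test split into a single token stream.
    test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    ids = tokenizer("\n\n".join(test["text"]), return_tensors="pt").input_ids

    nlls = []
    for start in range(0, ids.size(1) - seq_len, seq_len):
        chunk = ids[:, start:start + seq_len].to(model.device)
        # Labels equal the inputs; the model applies the causal shift internally.
        nlls.append(model(chunk, labels=chunk).loss.float())

    return torch.exp(torch.stack(nlls).mean()).item()

# Usage (illustrative): wikitext_ppl(load_model(model_path).cuda(), tokenizer)
```
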
### Accuracy on English Benchmarks, % (↑)

|               | piqa | arc_easy | arc_challenge | boolq | hellaswag | winogrande | mmlu_humanities | mmlu_social_sciences | mmlu_stem | mmlu_other |
| ------------- | ---- | -------- | ------------- | ----- | --------- | ---------- | --------------- | -------------------- | --------- | ---------- |
| FP            | 78.7 | 81.6     | 53.0          | 83.1  | 57.7      | 72.1       | 67.0            | 70.9                 | 54.5      | 68.2       |
| **Quantized** | 77.3 | 80.1     | 47.7          | 82.4  | 56.7      | 70.5       | 63.5            | 70.1                 | 50.5      | 64.2       |

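The task names above match those used by [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness), although the exact harness and settings behind these numbers are not stated. As a rough sketch, a comparable zero-shot run for the FP baseline could look like the following (the quantized checkpoint would need the custom loader from the Examples section rather than the stock `hf` loader):

```python
from lm_eval import simple_evaluate

# Hedged example: evaluates the full-precision baseline, zero-shot, with default settings.
results = simple_evaluate(
    model="hf",
    model_args="pretrained=NousResearch/Meta-Llama-3-8B-Instruct,dtype=float16",
    tasks=["piqa", "arc_easy", "arc_challenge", "boolq", "hellaswag", "winogrande"],
)
print(results["results"])
```
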
### Accuracy on Russian Benchmarks, % (↑)

|               | danetqa | terra | rwsd | muserc | rucos | lidirus | parus | rcb  | russe | rucola |
| ------------- | ------- | ----- | ---- | ------ | ----- | ------- | ----- | ---- | ----- | ------ |
| FP            | 78.6    | 60.9  | 65.7 | 56.1   | 64.9  | 63.2    | 71.0  | 34.1 | 60.8  | 64.1   |
| **Quantized** | 67.8    | 52.4  | 53.9 | 55.7   | 59.5  | 58.2    | 73.0  | 35.0 | 62.0  | 64.0   |


### Summary

|               | Avg Acc Diff on Eng, % (↑) | Avg Acc Diff on Rus, % (↑) | Occupied Memory, % (↓) |
| ------------- | -------------------------- | -------------------------- | ---------------------- |
| FP            | 0                          | 0                          | 100                    |
| **Quantized** | -2.11                      | -1.60                      | 35.7                   |

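Memory numbers are relative, with the FP model as the 100% baseline. The exact methodology is not given; one simple way to obtain a comparable figure is to compare weight footprints via `get_memory_footprint()` from `transformers`, using the `load_model` helper defined below:

```python
import torch
from transformers import AutoModelForCausalLM

# Rough comparison of in-memory weight size (not peak GPU usage during inference).
fp_model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Meta-Llama-3-8B-Instruct", torch_dtype=torch.float16
)
quant_model = load_model("compressa-ai/Llama-3-8B-Instruct-OmniQuant")

ratio = 100 * quant_model.get_memory_footprint() / fp_model.get_memory_footprint()
print(f"Quantized model occupies {ratio:.1f}% of the FP footprint")
```
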
## Examples

### Imports and Model Loading

<details>
<summary>Expand</summary>

```python
import gc

import auto_gptq.nn_modules.qlinear.qlinear_cuda as qlinear_cuda
import auto_gptq.nn_modules.qlinear.qlinear_triton as qlinear_triton
import torch

from accelerate import (
    init_empty_weights,
    infer_auto_device_map,
    load_checkpoint_in_model,
)
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)


def get_named_linears(model):
    return {
        name: module for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    }


def set_module(model, name, module):
    parent = model
    levels = name.split('.')

    for i in range(len(levels) - 1):
        cur_name = levels[i]

        if cur_name.isdigit():
            parent = parent[int(cur_name)]
        else:
            parent = getattr(parent, cur_name)

    setattr(parent, levels[-1], module)


def load_model(model_path):
    # Based on: https://github.com/OpenGVLab/OmniQuant/blob/main/runing_quantized_mixtral_7bx8.ipynb

    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    if not hasattr(config, 'quantization_config'):
        raise AttributeError(
            f'No quantization info found in model config "{model_path}"'
            f' (`quantization_config` section is missing).'
        )

    wbits = config.quantization_config['bits']
    group_size = config.quantization_config['group_size']

    # We are going to init an ordinary model and then manually replace all Linears with QuantLinears
    del config.quantization_config

    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config=config, torch_dtype=torch.float16, trust_remote_code=True)

    layers = model.model.layers

    for i in tqdm(range(len(layers))):
        layer = layers[i]
        named_linears = get_named_linears(layer)

        for name, module in named_linears.items():
            params = (
                wbits, group_size,
                module.in_features, module.out_features,
                module.bias is not None
            )

            if wbits in [2, 4]:
                q_linear = qlinear_triton.QuantLinear(*params)
            elif wbits == 3:
                q_linear = qlinear_cuda.QuantLinear(*params)
            else:
                raise NotImplementedError("Only 2, 3 and 4 bits are supported.")

            q_linear.to(next(layer.parameters()).device)
            set_module(layer, name, q_linear)

    torch.cuda.empty_cache()
    gc.collect()

    model.tie_weights()
    device_map = infer_auto_device_map(model)

    print("Loading pre-computed quantized weights...")

    load_checkpoint_in_model(
        model, checkpoint=model_path,
        device_map=device_map, offload_state_dict=True,
    )

    print("Model loaded successfully!")

    return model
```
</details>


### Inference

```python
model_path = "compressa-ai/Llama-3-8B-Instruct-OmniQuant"

model = load_model(model_path).cuda()
tokenizer = AutoTokenizer.from_pretrained(
    model_path, use_fast=False, trust_remote_code=True
)

system_message = "You are a friendly chatbot who always responds in the style of a pirate."
user_message = "Where are we going, Captain?"
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.cuda() for k, v in inputs.items()}

outputs = model.generate(
    **inputs, max_new_tokens=512,
    do_sample=True, temperature=0.7, top_p=0.95,
)

response = tokenizer.decode(outputs[0])
continuation = response.removeprefix(prompt).removesuffix(tokenizer.eos_token)

print(f'Prompt:\n{prompt}')
print(f'Continuation:\n{continuation}\n')
```


### Inference Using Pipeline

```python
pipe = pipeline(
    "text-generation",
    model=model, tokenizer=tokenizer,
    max_new_tokens=512, do_sample=True,
    temperature=0.7, top_p=0.95,
    device=0,
)

prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt)

response = outputs[0]["generated_text"]
continuation = response.removeprefix(prompt)

print(f'Prompt:\n{prompt}')
print(f'Continuation:\n{continuation}\n')
```