app.py CHANGED

@@ -1,10 +1,9 @@
 import os
-from typing import …
+from typing import Tuple, List
 import gradio as gr
-import torch
 import spaces
 from dataclasses import dataclass
-from huggingface_hub import HfApi, …
+from huggingface_hub import HfApi, CommitOperationAdd
 from transformers import AutoProcessor
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.transformers import oneshot, wrap_hf_model_class
@@ -28,19 +27,18 @@ def parse_ignore_list(ignore_str: str) -> List[str]:
 def create_quantized_model(
     model_id: str,
     work_dir: str,
-    api: HfApi,
     ignore_list: List[str],
     model_class_name: str
 ) -> Tuple[str, List[Tuple[str, Exception]]]:
     """Quantize model to FP8 and save to disk"""
-
+
     errors = []
     try:
         # Get the appropriate model class
-        exec(f"from transformers import {…
-        model_class = eval(…
+        exec(f"from transformers import {model_class_name}")
+        model_class = eval(model_class_name)
         wrapped_model_class = wrap_hf_model_class(model_class)
-
+
         # Load model with ZeroGPU
         model = wrapped_model_class.from_pretrained(
             model_id,
@@ -162,7 +160,6 @@ processor = AutoProcessor.from_pretrained("{target_repo}")
 @spaces.GPU(duration=300)  # 5 minutes timeout for large models
 def run(
     model_id: str,
-    is_private: bool,
     token: str,
     ignore_str: str,
     model_class_name: str
@@ -195,7 +192,6 @@ def run(
     quantized_path, errors = create_quantized_model(
         model_id,
         work_dir,
-        api,
         ignore_list,
         model_class_name
     )
@@ -279,9 +275,6 @@ with gr.Blocks(title=title) as demo:
         label="model_id",
         placeholder="huggingface/model-name"
     )
-    is_private = gr.Checkbox(
-        label="Private model (requires read access to original model)"
-    )
     token = gr.Text(
         max_lines=1,
         label="your_hf_token (requires write access)",
@@ -309,7 +302,7 @@ with gr.Blocks(title=title) as demo:
 
     submit.click(
         run,
-        inputs=[model_id, is_private, token, ignore_str, model_class_name],
+        inputs=[model_id, token, ignore_str, model_class_name],
         outputs=output,
         concurrency_limit=1
     )
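A note on the dynamic class lookup added in create_quantized_model: importing via exec and resolving via eval does work, but it executes an arbitrary string and is fragile inside a function scope. A minimal alternative sketch using getattr (not part of this diff; the helper name resolve_model_class is hypothetical):

    import transformers

    def resolve_model_class(model_class_name: str):
        # Look the class up as an attribute of the transformers module;
        # an unknown name raises AttributeError instead of executing code.
        return getattr(transformers, model_class_name)

    # e.g. model_class = resolve_model_class("AutoModelForCausalLM")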
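For context on the llmcompressor imports at the top of the file: oneshot applies a quantization recipe to a loaded model in a single pass, and wrap_hf_model_class adapts a transformers model class so the compressed checkpoint saves correctly. A minimal FP8 sketch of how these pieces typically combine; the scheme, targets, and ignore list below are illustrative assumptions, not values taken from this diff:

    from llmcompressor.modifiers.quantization import QuantizationModifier
    from llmcompressor.transformers import oneshot

    # Assumed recipe: dynamic FP8 on Linear layers, leaving lm_head unquantized.
    recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
    oneshot(model=model, recipe=recipe, output_dir=work_dir)

Here model and work_dir stand for the loaded model and working directory already present in create_quantized_model.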
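The submit.click rewiring is the counterpart of the run() signature change: Gradio passes the components listed in inputs to the handler positionally, so after dropping is_private the list has to match run(model_id, token, ignore_str, model_class_name) exactly, in order. A self-contained sketch of the pattern with placeholder components:

    import gradio as gr

    def run(model_id: str, token: str, ignore_str: str, model_class_name: str) -> str:
        # Values arrive in the same order as the components in `inputs`.
        return f"Would quantize {model_id} with {model_class_name}"

    with gr.Blocks() as demo:
        model_id = gr.Text(label="model_id")
        token = gr.Text(label="your_hf_token (requires write access)")
        ignore_str = gr.Text(label="ignore_list")
        model_class_name = gr.Text(label="model_class_name")
        output = gr.Markdown()
        submit = gr.Button("Submit")
        submit.click(run, inputs=[model_id, token, ignore_str, model_class_name],
                     outputs=output, concurrency_limit=1)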