Update app.py
app.py CHANGED
@@ -73,11 +73,11 @@ h3 {
 def initialize_model():
     """Initialize the model with appropriate configurations"""
     quantization_config = BitsAndBytesConfig(
-
-
-
-
-        llm_int8_enable_fp32_cpu_offload=True
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        #llm_int8_enable_fp32_cpu_offload=True
     )
 
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID , trust_remote_code=True)
@@ -90,7 +90,7 @@ def initialize_model():
         device_map="cuda",
         # attn_implementation="flash_attention_2",
         trust_remote_code=True,
-
+        quantization_config=quantization_config
 
     )
 
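Put together, initialize_model() after this commit reads roughly like the sketch below. Only the quantization arguments and the quantization_config= keyword come from the diff; the imports, the AutoModelForCausalLM class, the MODEL_ID placeholder, and the return statement are assumptions added here to make the snippet self-contained.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "org/model-name"  # placeholder; the real ID is defined elsewhere in app.py

def initialize_model():
    """Initialize the model with appropriate configurations"""
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,                       # store weights in 4-bit
        bnb_4bit_compute_dtype=torch.bfloat16,   # do matmuls in bfloat16
        bnb_4bit_quant_type="nf4",               # NormalFloat4 data type
        bnb_4bit_use_double_quant=True,          # also quantize the quantization constants
        # llm_int8_enable_fp32_cpu_offload=True  # old 8-bit offload flag, now disabled
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(  # model class assumed; not visible in the diff
        MODEL_ID,
        device_map="cuda",
        # attn_implementation="flash_attention_2",
        trust_remote_code=True,
        quantization_config=quantization_config,  # the key addition: the config is now actually used
    )
    return model, tokenizer  # assumed return shape

The net effect is a switch from the int8-style fp32 CPU offload setting to 4-bit NF4 loading with bfloat16 compute and double quantization, and the config is now passed to from_pretrained, so the quantized weights are what actually get loaded onto the GPU.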