Coding_Assistant

Running on Zero

Daemontatox committed 21 days ago

Commit

ef19097

verified ·

1 Parent(s): 87d44f9

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -77,7 +77,8 @@ def initialize_model():
         load_in_8bit=True,
         bnb_8bit_compute_dtype=torch.bfloat16,
         bnb_8bit_quant_type="nf4",
-        bnb_8bit_use_double_quant=True
     )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID , trust_remote_code=True)
@@ -86,7 +87,7 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.float16,
         device_map="cuda",
         # attn_implementation="flash_attention_2",
         trust_remote_code=True,

         load_in_8bit=True,
         bnb_8bit_compute_dtype=torch.bfloat16,
         bnb_8bit_quant_type="nf4",
+        bnb_8bit_use_double_quant=True,
+        llm_int8_enable_fp32_cpu_offload=True
     )
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID , trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
+        torch_dtype="auto",
         device_map="cuda",
         # attn_implementation="flash_attention_2",
         trust_remote_code=True,