The code from the model card produces errors when executed on Google Colab:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator
#Load the model
###################################################
#model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version
compute_dtype = torch.float16 #bfloat16 for torchao, float16 for bitblas
cache_dir = '.'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
patch_linearlayers(model, patch_add_quant_config, quant_config)
#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
#prepare_for_inference(model, backend="torchao_int4")
prepare_for_inference(model, backend="bitblas") #takes a while to init...
WARNING:bitblas.utils.target_detector:TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where <target> is one of the available targets that can be found in the output of tools/get_available_targets.py.
Warning: failed to import the Marlin backend. Check if marlin is correctly installed if you want to use the Marlin backend (https://github.com/IST-DASLab/marlin).
Warning: failed to import the BitBlas backend. Check if BitBlas is correctly installed if you want to use the bitblas backend (https://github.com/microsoft/BitBLAS).
Fetching 7 files: 100% 7/7 [08:45<00:00, 207.15s/it]
.gitattributes: 100% 1.60k/1.60k [00:00<00:00, 23.6kB/s]
README.md: 100% 4.68k/4.68k [00:00<00:00, 54.7kB/s]
config.json: 100% 915/915 [00:00<00:00, 20.5kB/s]
special_tokens_map.json: 100% 296/296 [00:00<00:00, 8.46kB/s]
tokenizer_config.json: 100% 55.3k/55.3k [00:00<00:00, 2.10MB/s]
tokenizer.json: 100% 9.09M/9.09M [00:00<00:00, 36.7MB/s]
qmodel.pt: 100% 6.03G/6.03G [08:44<00:00, 16.2MB/s]
ValueError                                Traceback (most recent call last)
in <cell line: 15>()
     13 compute_dtype = torch.float16 #bfloat16 for torchao, float16 for bitblas
     14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
     16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
     17

5 frames
/usr/local/lib/python3.10/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
    180
    181         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 182             raise ValueError(
    183                 "rope_scaling must be a dictionary with two fields, type and factor, " f"got {self.rope_scaling}"
    184             )

ValueError: rope_scaling must be a dictionary with two fields, type and factor, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
What GPU are you using? You need at least a 3090 with CUDA 12.1.
Regarding the rope_scaling error, you need to use the latest transformers version. Also make sure you use the latest PyTorch version.
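On Colab that usually means something along these lines (a minimal sketch; restart the runtime afterwards so the upgraded packages are actually loaded):
!pip install -U torch transformers
#or, for the very latest transformers from source:
!pip install git+https://github.com/huggingface/transformers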
import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator
#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)
patch_linearlayers(model, patch_add_quant_config,
BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))
model.eval();
cleanup()
#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...
#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial")
# The warmup() method doesn't take a device argument.
# The model and inputs are expected to be on the correct device already.
gen.warmup() #Faster generation, but warm-up takes a while
# Ensure generate calls use the same device as the model
gen.generate("Write an essay about large language models", print_tokens=True, device=model.device)
gen.generate("Tell me a funny joke!", print_tokens=True, device=model.device)
gen.generate("How to make a yummy chocolate cake?", print_tokens=True, device=model.device)
Warning: failed to import the Marlin backend. Check if marlin is correctly installed if you want to use the Marlin backend (https://github.com/IST-DASLab/marlin).
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 9 files: 100% 9/9 [00:00<00:00, 621.43it/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:251: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
100%|██████████| 32/32 [00:00<00:00, 3491.53it/s]
100%|██████████| 32/32 [00:00<00:00, 747.69it/s]
/usr/local/lib/python3.11/dist-packages/hqq/core/peft.py:513: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  lora_data = torch.load(filename, map_location="cpu")
100%|██████████| 32/32 [00:00<00:00, 67.89it/s]
100%|██████████| 32/32 [00:00<00:00, 697.02it/s]
100%|██████████| 32/32 [00:00<00:00, 3127.38it/s]
2025-02-04 00:54:21 [BitBLAS:WARNING]: TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where <target> is one of the available targets that can be found in the output of tools/get_available_targets.py.
2025-02-04 00:56:01 [BitBLAS:WARNING]: TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where <target> is one of the available targets that can be found in the output of tools/get_available_targets.py.
2025-02-04 00:56:29 [BitBLAS:WARNING]: TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where <target> is one of the available targets that can be found in the output of tools/get_available_targets.py.
2025-02-04 00:57:10 [BitBLAS:WARNING]: TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where <target> is one of the available targets that can be found in the output of tools/get_available_targets.py.
The 'batch_size' argument of StaticCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of StaticCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
RuntimeError Traceback (most recent call last)
in <cell line: 0>()
30 # The warmup() method doesn't take a device argument.
31 # The model and inputs are expected to be on the correct device already.
---> 32 gen.warmup() #Faster generation, but warm-up takes a while
33
34 # Ensure generate calls use the same device as the model
13 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py in forward(self, x, position_ids)
157 device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
158 with torch.autocast(device_type=device_type, enabled=False):
--> 159 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
160 emb = torch.cat((freqs, freqs), dim=-1)
161 cos = emb.cos()
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
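For the device mismatch, the traceback suggests the rotary-embedding inputs stayed on the CPU while the weights are on cuda:0. A quick way to see which devices the model is split across before calling warmup() (a diagnostic sketch only, not part of the HQQ API):
#List every device the model's parameters and buffers currently live on (debugging only)
devices = {p.device for p in model.parameters()} | {b.device for b in model.buffers()}
print(devices)  #more than one entry means the model is split; try model.to('cuda') before building the generator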
import torch
from transformers import AutoModelForCausalLM
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import BaseQuantizeConfig

model_id      = 'meta-llama/Meta-Llama-3.1-8B-Instruct' #base (unquantized) model - placeholder, adjust to your setup
compute_dtype = torch.float16
device        = 'cuda'
#Load the model on CPU
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype)
#Quantize
quant_config = BaseQuantizeConfig(nbits=4, group_size=64)
AutoHQQHFModel.quantize_model(model, quant_config=quant_config, compute_dtype=compute_dtype, device=device)
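To sanity-check the freshly quantized model, a plain transformers generate call should be enough (a minimal sketch, assuming model_id, device, and the quantized model from the snippet above; the prompt and max_new_tokens are arbitrary):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Tell me a funny joke!", return_tensors="pt").to(device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64)  #short test generation
print(tokenizer.decode(out[0], skip_special_tokens=True))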
!pip install git+https://github.com/huggingface/transformers
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 9 files: 100% 9/9 [00:00<00:00, 474.30it/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:237: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
0%| | 0/32 [00:00<?, ?it/s]
AttributeError Traceback (most recent call last)
in <cell line: 0>()
8 ###################################################
9 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
---> 10 model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
11 tokenizer = AutoTokenizer.from_pretrained(model_id)
12
4 frames
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in __getattr__(self, name)
   1727         if name in modules:
   1728             return modules[name]
--> 1729         raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
   1730
   1731     def __setattr__(self, name: str, value: Union[Tensor, 'Module']) -> None:
AttributeError: 'LlamaAttention' object has no attribute 'rotary_emb'
Make sure you have an updated transformers version and a compatible GPU; this is not going to run on the Colab T4, I think.
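If in doubt about the GPU, checking the compute capability from the notebook is a quick way to tell (just a sketch; the advice above asks for at least a 3090, which reports compute capability 8.6, while the Colab T4 reports 7.5):
import torch
print(torch.cuda.get_device_name(0))
major, minor = torch.cuda.get_device_capability(0)
print(f"compute capability: {major}.{minor}")  #3090 -> 8.6, Colab T4 -> 7.5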