
The code from the model card raises errors when executed on Google Colab

#1
by vasilee - opened
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
#model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version

compute_dtype = torch.float16 #bfloat16 for torchao, float16 for bitblas
cache_dir = '.'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
patch_linearlayers(model, patch_add_quant_config, quant_config)

#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
#prepare_for_inference(model, backend="torchao_int4") 
prepare_for_inference(model, backend="bitblas") #takes a while to init...

WARNING:bitblas.utils.target_detector:TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where is one of the available targets can be found in the output of tools/get_available_targets.py.
Warning: failed to import the Marlin backend. Check if marlin is correctly installed if you want to use the Marlin backend (https://github.com/IST-DASLab/marlin).
Warning: failed to import the BitBlas backend. Check if BitBlas is correctly installed if you want to use the bitblas backend (https://github.com/microsoft/BitBLAS).
Fetching 7 files: 100% 7/7 [08:45<00:00, 207.15s/it]
.gitattributes: 100% 1.60k/1.60k [00:00<00:00, 23.6kB/s]
README.md: 100% 4.68k/4.68k [00:00<00:00, 54.7kB/s]
config.json: 100% 915/915 [00:00<00:00, 20.5kB/s]
special_tokens_map.json: 100% 296/296 [00:00<00:00, 8.46kB/s]
tokenizer_config.json: 100% 55.3k/55.3k [00:00<00:00, 2.10MB/s]
tokenizer.json: 100% 9.09M/9.09M [00:00<00:00, 36.7MB/s]
qmodel.pt: 100% 6.03G/6.03G [08:44<00:00, 16.2MB/s]

ValueError Traceback (most recent call last)
in <cell line: 15>()
13 compute_dtype = torch.float16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17

5 frames
/usr/local/lib/python3.10/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
180
181 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 182 raise ValueError(
183 "rope_scaling must be a dictionary with two fields, type and factor, " f"got {self.rope_scaling}"
184 )

ValueError: rope_scaling must be a dictionary with two fields, type and factor, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}

Mobius Labs GmbH org

What GPU are you using? You need at least a 3090 with CUDA 12.1.
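If you are not sure what Colab assigned you, you can check directly from the notebook (a minimal sketch that only reports the device, it does not change anything in the code above):

#Check the assigned GPU; a 3090 corresponds to an Ampere-class card (compute capability 8.6)
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("Compute capability:", torch.cuda.get_device_capability(0))
print("CUDA build:", torch.version.cuda)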

Regarding the rope_scaling error, you need to use the latest transformers version. Also make sure you use the latest PyTorch version.
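In a Colab notebook that usually means upgrading in a cell and then restarting the runtime so the new versions are picked up (illustrative commands, not from the model card):

#Upgrade transformers and PyTorch, then restart the Colab runtime
!pip install -U transformers
!pip install -U torch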

import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

patch_linearlayers(model, patch_add_quant_config,
BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial")

# The warmup() method doesn't take a device argument.
# The model and inputs are expected to be on the correct device already.
gen.warmup() #Faster generation, but warm-up takes a while

# Ensure generate calls use the same device as the model
gen.generate("Write an essay about large language models", print_tokens=True, device=model.device)
gen.generate("Tell me a funny joke!", print_tokens=True, device=model.device)
gen.generate("How to make a yummy chocolate cake?", print_tokens=True, device=model.device)

Warning: failed to import the Marlin backend. Check if marlin is correctly installed if you want to use the Marlin backend (https://github.com/IST-DASLab/marlin).
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 9 files: 100% 9/9 [00:00<00:00, 621.43it/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:251: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 32/32 [00:00<00:00, 3491.53it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 32/32 [00:00<00:00, 747.69it/s]
/usr/local/lib/python3.11/dist-packages/hqq/core/peft.py:513: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
lora_data = torch.load(filename, map_location="cpu")
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 32/32 [00:00<00:00, 67.89it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 32/32 [00:00<00:00, 697.02it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 32/32 [00:00<00:00, 3127.38it/s]
2025-02-04 00:54:21 [BitBLAS:WARNING]: TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where is one of the available targets can be found in the output of tools/get_available_targets.py.
2025-02-04 00:56:01 [BitBLAS:WARNING]: TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where is one of the available targets can be found in the output of tools/get_available_targets.py.
2025-02-04 00:56:29 [BitBLAS:WARNING]: TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where is one of the available targets can be found in the output of tools/get_available_targets.py.
2025-02-04 00:57:10 [BitBLAS:WARNING]: TVM target not found. Please set the TVM target environment variable using export TVM_TARGET=<target>, where is one of the available targets can be found in the output of tools/get_available_targets.py.
The 'batch_size' argument of StaticCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of StaticCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.

RuntimeError Traceback (most recent call last)
in <cell line: 0>()
30 # The warmup() method doesn't take a device argument.
31 # The model and inputs are expected to be on the correct device already.
---> 32 gen.warmup() #Faster generation, but warm-up takes a while
33
34 # Ensure generate calls use the same device as the model

13 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py in forward(self, x, position_ids)
157 device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
158 with torch.autocast(device_type=device_type, enabled=False):
--> 159 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
160 emb = torch.cat((freqs, freqs), dim=-1)
161 cos = emb.cos()

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
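The matmul in the rotary embedding is mixing a CPU tensor with a cuda:0 tensor, so something ended up on the wrong device. A quick way to see where things actually live before calling warmup() (a minimal sketch reusing model and tokenizer from the snippet above, and assuming the quantized model still exposes some regular parameters such as the norm layers; it only prints devices):

#Print the device of the model parameters and of a test input
import torch
param_device = next(model.parameters()).device
print("model parameters on:", param_device)
test_inputs = tokenizer("hello", return_tensors="pt").to(param_device)
print("input_ids on:", test_inputs["input_ids"].device)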

#Load the model on CPU
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype)

#Quantize
from hqq.models.hf.base import AutoHQQHFModel
quant_config = BaseQuantizeConfig(nbits=4, group_size=64)
AutoHQQHFModel.quantize_model(model, quant_config=quant_config, compute_dtype=compute_dtype, device=device)

Why "#Load the model on CPU"?

And "What GPU are you using? You need at least a 3090 with CUDA 12.1"???
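For reference, the on-the-fly quantization excerpt above leaves model_id, compute_dtype and device undefined; a self-contained version might look like this (a sketch only, the base model id is illustrative, and the idea is to keep the full-precision weights on CPU and let quantize_model place the quantized layers on the GPU):

#Sketch: fill in the names the excerpt leaves undefined (values are illustrative)
import torch
from transformers import AutoModelForCausalLM
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import BaseQuantizeConfig

model_id      = 'meta-llama/Meta-Llama-3.1-8B-Instruct' #assumed full-precision base model
compute_dtype = torch.float16
device        = 'cuda'

#Load the full-precision weights on CPU, then quantize onto the GPU
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype)
quant_config = BaseQuantizeConfig(nbits=4, group_size=64)
AutoHQQHFModel.quantize_model(model, quant_config=quant_config, compute_dtype=compute_dtype, device=device)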

!pip install git+https://github.com/huggingface/transformers

/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 9 files: 100% 9/9 [00:00<00:00, 474.30it/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:237: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
0%| | 0/32 [00:00<?, ?it/s]

AttributeError Traceback (most recent call last)
in <cell line: 0>()
8 ###################################################
9 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
---> 10 model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
11 tokenizer = AutoTokenizer.from_pretrained(model_id)
12

4 frames
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in __getattr__(self, name)
1727 if name in modules:
1728 return modules[name]
---> 1729 raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
1730
1731 def __setattr__(self, name: str, value: Union[Tensor, 'Module']) -> None:

AttributeError: 'LlamaAttention' object has no attribute 'rotary_emb'

Mobius Labs GmbH org

Make sure you have an updated transformers version and a compatible GPU; I don't think this is going to run on the Colab T4.
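A quick way to confirm what is actually installed in the runtime before re-running (a sketch that only prints versions):

#Print installed package versions to confirm the runtime picked up the upgrades
from importlib.metadata import version, PackageNotFoundError
for pkg in ("transformers", "torch", "hqq"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed")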
