RuntimeError when the prompt length is larger than sliding_window = 4096
What is going wrong when I use model.generate(..., cache_implementation="sliding_window")?
Whenever the prompt is longer than sliding_window = 4096 tokens, the call fails with the following RuntimeError:
=========================================================================
Traceback (most recent call last):
  File "/workspace/mistral-inference/hf-mistral-v0.1.py", line 181, in <module>
    outputs = model.generate(**inputs, cache_implementation="sliding_window", max_new_tokens=1024)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/transformers/generation/utils.py", line 2215, in generate
    result = self._sample(
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/transformers/generation/utils.py", line 3206, in _sample
    outputs = self(**model_inputs, return_dict=True)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 1065, in forward
    outputs = self.model(
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 791, in forward
    layer_outputs = decoder_layer(
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 528, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/anaconda3/envs/mistral/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 249, in forward
    attn_weights = attn_weights + causal_mask
RuntimeError: The size of tensor a (4303) must match the size of tensor b (4096) at non-singleton dimension 3
================================================
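If I read the shapes in the traceback correctly, the eager attention path computes attn_weights over the full 4303-token prompt, while the causal_mask built for the sliding-window cache is capped at sliding_window = 4096 on the key/value dimension, so the addition attn_weights + causal_mask cannot broadcast. A minimal standalone illustration of that mismatch (shapes taken from the error message, a single head to keep memory small; this is my guess at what happens, not the actual modeling code):

import torch

attn_weights = torch.zeros(1, 1, 4303, 4303)  # (batch, heads, q_len, kv_len) for the full prompt
causal_mask = torch.zeros(1, 1, 4303, 4096)   # kv dimension capped at sliding_window = 4096
attn_weights + causal_mask                    # RuntimeError: The size of tensor a (4303) must match the size of tensor b (4096) at non-singleton dimension 3
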
# This is my code.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

torch.manual_seed(0)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model_path = "mistralai/Mistral-7B-Instruct-v0.1/"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, attn_implementation="eager", trust_remote_code=True).eval().to(device)
print(model.config)
pre_msgs = [
    """ very long text prompt""",  # the length of token_ids is more than sliding_window = 4096
]
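# Hedged aside, not part of my original script: if a self-contained input is easier
# to test with, I assume any filler text that tokenizes well past sliding_window = 4096
# would hit the same path (exact token count depends on the tokenizer), e.g.:
# pre_msgs = ["hello world, this is another long filler sentence. " * 500]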
history_msg = []
for i, msg in enumerate(pre_msgs):
    print("\n\n-------------------------------------------round:", i)
    print("Q:", msg)
    new_message = [
        {"role": "user", "content": msg},
    ]
    history_msg.append(new_message)
    history_msg_ = sum(history_msg, [])  # flatten the list of turns into one message list
    inputs = tokenizer.apply_chat_template(history_msg_, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(**inputs, cache_implementation="sliding_window", max_new_tokens=1024)  # <- raises the RuntimeError above
    generated_text = tokenizer.decode(outputs[0][inputs["input_ids"].size(1):], skip_special_tokens=True)
    out_msg = [{"role": "assistant", "content": generated_text}]
    history_msg.append(out_msg)
    print(f'input size:{inputs["input_ids"].size(1)} , gen size:{outputs.size(1) - inputs["input_ids"].size(1)}')
    print(generated_text)
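
Is there a supported way to prefill a prompt that is longer than the window when using cache_implementation="sliding_window", or does the input have to be truncated to the last sliding_window tokens first? For reference, this is the fallback I would try if truncation is expected on the caller's side (a guess on my part, not something I have verified to be the intended usage):

window = model.config.sliding_window  # 4096 for Mistral-7B-Instruct-v0.1
inputs = {k: v[:, -window:] for k, v in inputs.items()}  # keep only the last window tokens
outputs = model.generate(**inputs, cache_implementation="sliding_window", max_new_tokens=1024)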