First request OKAY, second out of memory

#4 opened by JLouisBiz

### My Python script

```
from flask import Flask, request, jsonify
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

app = Flask(__name__)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize processor and model
processor = AutoProcessor.from_pretrained("SmolVLM-500M-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "SmolVLM-500M-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="eager",  # Disable FlashAttention
).to(DEVICE)
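# Note: ~500M parameters in bfloat16 come to roughly 1 GB of GPU memory for the
# weights alone; activations and the generation cache are allocated on top of that.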



@app.route('/v1/chat/completions', methods=['POST'])
def describe_image():
    data = request.json
    image_url = data.get('image_url')
    prompt = data.get('prompt', "Can you describe this image?")

    # Load image from URL
    image = load_image(image_url)

    # Create input messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt}
            ]
        },
    ]

    # Prepare inputs
    chat_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # Generate outputs
    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )

    # Return the generated description
    return jsonify({
        "response": generated_texts[0]
    })

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=8080)
```

### My request

```
curl -X POST http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"image_url": "http://127.0.0.1:8000/image-21.webp", "prompt": "Describe this image, summarize, without much details."}'
```


The first time it works well, and I get the result.

On the second request, I get a memory error.

### Error

```
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 816.00 MiB. GPU 0 has a total capacity of 3.94 GiB of which 784.25 MiB is free. Including non-PyTorch memory, this process has 3.05 GiB memory in use. Of the allocated memory 2.46 GiB is allocated by PyTorch, and 540.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
127.0.0.1 - - [24/Jan/2025 12:48:43] "POST /v1/chat/completions HTTP/1.1" 500 -
[2025-01-24 12:51:04,699] ERROR in app: Exception on /v1/chat/completions [POST]
Traceback (most recent call last):
File "/home/data1/protected/TTS/lib/python3.11/site-packages/flask/app.py", line 1511, in wsgi_app
response = self.full_dispatch_request()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/flask/app.py", line 919, in full_dispatch_request
rv = self.handle_user_exception(e)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/flask/app.py", line 917, in full_dispatch_request
rv = self.dispatch_request()
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/flask/app.py", line 902, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args) # type: ignore[no-any-return]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/Programming/llamafile/SmolLM/SmolVLM-500M-Instruct-flask.py", line 45, in describe_image
generated_ids = model.generate(**inputs, max_new_tokens=500)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/transformers/generation/utils.py", line 2252, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/transformers/generation/utils.py", line 3231, in _sample
outputs = self(**model_inputs, return_dict=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 1184, in forward
outputs = self.model(
^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 999, in forward
image_hidden_states = self.vision_model(
^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 727, in forward
encoder_outputs = self.encoder(
^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 517, in forward
layer_outputs = encoder_layer(
^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 429, in forward
hidden_states, attn_weights = self.self_attn(
^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 250, in forward
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/data1/protected/TTS/lib/python3.11/site-packages/torch/nn/functional.py", line 2142, in softmax
ret = input.softmax(dim, dtype=dtype)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 816.00 MiB. GPU 0 has a total capacity of 3.94 GiB of which 784.25 MiB is free. Including non-PyTorch memory, this process has 3.05 GiB memory in use. Of the allocated memory 2.44 GiB is allocated by PyTorch, and 565.80 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
127.0.0.1 - - [24/Jan/2025 12:51:04] "POST /v1/chat/completions HTTP/1.1" 500 -
```
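
As the allocator message itself suggests, fragmentation can be mitigated with the `PYTORCH_CUDA_ALLOC_CONF` environment variable. A minimal sketch of setting it from Python (it has to take effect before the first CUDA allocation, so setting it before importing torch is the safe option; exporting it in the shell before launching the script works just as well):

```
import os

# Must be set before the CUDA caching allocator initializes,
# i.e. before torch touches the GPU
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
```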


### Problem

I was expecting that a 500M model would work on 4 GB of VRAM, and I do not know where the problem really is.

Is there any way to resolve this memory problem?

```
# Clear GPU memory
torch.cuda.empty_cache()
```

The above resolved it, and now it works well!
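
For anyone hitting the same issue: `torch.cuda.empty_cache()` returns cached, no-longer-referenced GPU blocks to the driver, which helps when fragmented cached segments keep a large new allocation (like the 816.00 MiB attention softmax above) from succeeding. A minimal sketch of one way to apply this cleanup to every request (the `free_cuda_cache_after` decorator is a hypothetical helper of mine, not part of the original script, and the `gc.collect()` call is an optional extra step):

```
import functools
import gc

import torch

def free_cuda_cache_after(view):
    """Run the Flask view, then release cached CUDA blocks."""
    @functools.wraps(view)
    def wrapper(*args, **kwargs):
        try:
            return view(*args, **kwargs)
        finally:
            gc.collect()  # optional: drop unreferenced tensors first
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    return wrapper

@app.route('/v1/chat/completions', methods=['POST'])
@free_cuda_cache_after
def describe_image():
    ...  # same handler body as above
```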

JLouisBiz changed discussion status to closed
