optimize-docker (#3)

Commits:
- Add gitignore (c469934d2ab5ae27a4caeac02c0beb10a30a33ef)
- remove unused imports (346623e65d56fc39f4a1ab6bd708112b6eb8d3a6)
- optimise docker build (6eb6110ef4a3aff3660e6c2b5708cf0bc264c967)

Files changed:

- .dockerignore +10 -0
- .gitignore +8 -0
- Dockerfile +72 -25
- README.md +19 -4
- app.py +244 -293
- models/GroundingDINO/ops/setup.py +1 -7
- models/GroundingDINO/ops/test.py +1 -2
- packages.txt +1 -0
- requirements.txt +8 -10

.dockerignore ADDED

```diff
@@ -0,0 +1,10 @@
+env
+**/__pycache__
+
+.git/
+.github/
+.gitignore
+
+*.zip
+*.whl
+!gradio_image_prompter-0.1.0-py3-none-any.whl
```

.gitignore ADDED

```diff
@@ -0,0 +1,8 @@
+# python
+env/
+__pycache__
+.python-version
+
+
+# vim
+*.sw[op]
```

Dockerfile CHANGED

The 45-line single-stage Dockerfile (a `FROM docker.io/nvidia/cuda:12.1.0-…` base, a `COPY . .` of the entire tree, and the GroundingDINO ops built inside the final image before `EXPOSE 7860` and the `CMD`) is replaced by a 92-line two-stage build: a pinned CUDA 12.1 devel stage compiles the ops into a virtualenv at /opt/venv, and a slim `ubuntu:22.04` runtime stage copies that virtualenv plus only the files the app needs, running as a non-root user. The new Dockerfile:

```dockerfile
## Build time
# Use the specified Python runtime as a parent image
FROM docker.io/nvidia/cuda:12.1.0-devel-ubuntu22.04@sha256:e3a8f7b933e77ecee74731198a2a5483e965b585cea2660675cf4bb152237e9b AS build

# Set the working directory in the container
WORKDIR /usr/src/app
COPY packages.txt .

ENV CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1


# Delete nvidia apt list and Install required packages
RUN DEBIAN_FRONTEND=noninteractive \
    && rm -rf /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list \
    && apt-key del 7fa2af80 \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub \
    && apt-get -yq update \
    && apt-get install --no-install-recommends -yq \
        apt-transport-https \
        ca-certificates \
        $(cat packages.txt) \
        python3 \
        python3-dev \
        python3-pip \
        python3-venv \
        ffmpeg \
        libsm6 \
        libxext6 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Setup virtual env
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install any needed packages specified in requirements.txt
COPY requirements.txt .
RUN --mount=type=cache,id=pip,target=/root/.cache \
    pip install -r requirements.txt

# Copy grounding dino ops
WORKDIR /usr/src/app/models/GroundingDINO/ops
COPY models/GroundingDINO/ops .

# Run the setup script and the test script
RUN CC=/usr/bin/gcc-11 python3 setup.py build && \
    pip install .

## Runtime
# Use the specified Python runtime as a parent image
FROM ubuntu:22.04

ENV CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PATH="/opt/venv/bin:/home/user/.local/bin:$PATH" \
    HOME=/home/user

RUN DEBIAN_FRONTEND=noninteractive apt-get -yq update && apt-get install --no-install-recommends -yq \
    python3 \
    python3-dev \
    python3-pip \
    ffmpeg \
    libsm6 \
    libxext6 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
RUN useradd -m -u 1000 user && chown -R user /app

COPY --from=build --chown=user /opt/venv /opt/venv

COPY --chown=user checkpoints checkpoints
COPY --chown=user checkpoint_best_regular.pth .
COPY --chown=user *.jpg *.JPG ./
COPY --chown=user datasets datasets
COPY --chown=user groundingdino groundingdino
COPY --chown=user models models
COPY --chown=user util util
COPY --chown=user app.py cfg_app.py ./

USER user
# Expose the port Gradio will run on
EXPOSE 7860
ENV GRADIO_SERVER_NAME="0.0.0.0"
# Default command to run the Gradio app
CMD ["/opt/venv/bin/python3", "app.py"]
```
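The `RUN --mount=type=cache` instruction in the build stage requires BuildKit. A minimal sketch of building the image locally (the `countgd` tag is an arbitrary choice for illustration):

```bash
# BuildKit is needed for the pip cache mount in the build stage;
# it is the default in recent Docker releases, but can be forced:
DOCKER_BUILDKIT=1 docker build -t countgd .

# On a rebuild, the cache mount at /root/.cache persists between builds,
# so pip reuses downloaded wheels instead of fetching them again.
```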

README.md CHANGED

Front matter (two removed lines whose content is not recoverable from the page are omitted):

```diff
@@ -1,7 +1,22 @@
 ---
 title: CountGD_Multi-Modal_Open-World_Counting
-sdk_version: 4.44.1
+sdk: docker
+app_port: 7860
 ---
```

New README body:

# CountGD: Multi Modal Open World Counting Model

To run locally, the best method is to use Docker.

Make sure you have installed Docker, the NVIDIA driver, and the NVIDIA Container Toolkit for your platform.

Then, you can run the app locally with the following command:

```bash
docker run -it \
    --name countgd \
    -p 7860:7860 \
    --platform=linux/amd64 \
    --gpus all \
    registry.hf.space/nikigoli-countgd:latest \
    python app.py
```
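Before starting the app, it can help to confirm that the container runtime actually sees the GPU. A hedged sanity check, assuming a CUDA base image tag that matches your installed driver:

```bash
# If the NVIDIA Container Toolkit is configured correctly, this prints
# the same GPU table that nvidia-smi shows on the host.
docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
```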

app.py CHANGED

```diff
@@ -1,16 +1,10 @@
 import spaces
 import gradio as gr
-import copy
 import random
 import torch
-from PIL import Image, ImageDraw, ImageFont
-import torchvision.transforms.functional as F
+from PIL import Image
 import numpy as np
 import argparse
-import json
-import plotly.express as px
-import pandas as pd
 from util.slconfig import SLConfig, DictAction
 from util.misc import nested_tensor_from_tensor_list
 import datasets.transforms as T
```

```diff
@@ -30,53 +24,9 @@ cwd = os.getcwd()
 import warnings
 warnings.filterwarnings("ignore")
 
-# Installing dependencies not in requirements.txt
-subprocess.run(
-    shlex.split(
-        "pip install gradio_image_prompter-0.1.0-py3-none-any.whl"
-    )
-)
 from gradio_image_prompter import ImagePrompter
-"""
-subprocess.run(
-    shlex.split(
-        "pip install MultiScaleDeformableAttention-1.0-cp310-cp310-linux_x86_64.whl"
-    )
-)
-"""
-"""
-subprocess.run(
-    shlex.split(
-        "python test.py"
-    )
-)
-"""
-#with open('./switch_cuda.sh', 'rb') as file:
-#    script = file.read()
-#call(script, shell=True)
-
-with open('./build_ops.sh', 'rb') as file:
-    script = file.read()
-call(script, shell=True)
-
-def find_cuda():
-    # Check if CUDA_HOME or CUDA_PATH environment variables are set
-    cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
-
-    if cuda_home and os.path.exists(cuda_home):
-        return cuda_home
-
-    # Search for the nvcc executable in the system's PATH
-    nvcc_path = shutil.which('nvcc')
-
-    if nvcc_path:
-        # Remove the 'bin/nvcc' part to get the CUDA installation path
-        cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
-        return cuda_path
 
-    return None
 
-cuda_path = find_cuda()
 
 class AppSteps(Enum):
     JUST_TEXT = 1
```

```diff
@@ -195,13 +145,6 @@ def build_model_and_transforms(args):
 
     return model, data_transform
 
-
-parser = argparse.ArgumentParser("Counting Application", parents=[get_args_parser()])
-args = parser.parse_args()
-device = get_device()
-model, transform = build_model_and_transforms(args)
-model = model.to(device)
-
 examples = [
     ["strawberry.jpg", "strawberry", {"image": "strawberry.jpg"}],
     ["strawberry.jpg", "blueberry", {"image": "strawberry.jpg"}],
```

@@ -254,242 +197,250 @@ — after `get_ind_to_filter` returns `inds_to_filter`, roughly 240 lines of module-level inference, plotting, and Gradio UI code that previously ran at import time are deleted (the recoverable removed fragments show the same logic that now lives inside `count`), and the block below is added under an `if __name__ == '__main__':` guard (new lines 200–446):

```python
if __name__ == '__main__':

    parser = argparse.ArgumentParser("Counting Application", parents=[get_args_parser()])
    args = parser.parse_args()
    device = get_device()
    model, transform = build_model_and_transforms(args)
    model = model.to(device)

    @spaces.GPU(duration=120)
    def count(image, text, prompts, state, device):

        keywords = ""  # do not handle this for now

        # Handle no prompt case.
        if prompts is None:
            prompts = {"image": image, "points": []}
        input_image, _ = transform(image, {"exemplars": torch.tensor([])})
        input_image = input_image.unsqueeze(0).to(device)
        exemplars = get_box_inputs(prompts["points"])

        input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
        input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
        exemplars = [exemplars["exemplars"].to(device)]

        with torch.no_grad():
            model_output = model(
                nested_tensor_from_tensor_list(input_image),
                nested_tensor_from_tensor_list(input_image_exemplars),
                exemplars,
                [torch.tensor([0]).to(device) for _ in range(len(input_image))],
                captions=[text + " ."] * len(input_image),
            )

        ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
        logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
        boxes = model_output["pred_boxes"][0]
        if len(keywords.strip()) > 0:
            box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
        else:
            box_mask = logits.max(dim=-1).values > CONF_THRESH
        logits = logits[box_mask, :].cpu().numpy()
        boxes = boxes[box_mask, :].cpu().numpy()

        # Plot results.
        (w, h) = image.size
        det_map = np.zeros((h, w))
        det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
        det_map = ndimage.gaussian_filter(
            det_map, sigma=(w // 200, w // 200), order=0
        )
        plt.imshow(image)
        plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
        plt.axis('off')
        img_buf = io.BytesIO()
        plt.savefig(img_buf, format='png', bbox_inches='tight')
        plt.close()

        output_img = Image.open(img_buf)

        if AppSteps.TEXT_AND_EXEMPLARS not in state:
            exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
            new_submit_btn = gr.Button("Count", variant="primary", interactive=False)
            state = [AppSteps.JUST_TEXT, AppSteps.TEXT_AND_EXEMPLARS]
            main_instructions_comp = gr.Markdown(visible=False)
            step_3 = gr.Tab(visible=False)
        elif AppSteps.FULL_APP not in state:
            exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
            new_submit_btn = submit_btn
            state = [AppSteps.JUST_TEXT, AppSteps.TEXT_AND_EXEMPLARS, AppSteps.FULL_APP]
            main_instructions_comp = gr.Markdown(visible=True)
            step_3 = gr.Tab(visible=True)
        else:
            exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
            new_submit_btn = submit_btn
            main_instructions_comp = gr.Markdown(visible=True)
            step_3 = gr.Tab(visible=True)

        out_label = "Detected instances predicted with"
        if len(text.strip()) > 0:
            out_label += " text"
            if exemplars[0].size()[0] == 1:
                out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
            elif exemplars[0].size()[0] > 1:
                out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
            else:
                out_label += "."
        elif exemplars[0].size()[0] > 0:
            if exemplars[0].size()[0] == 1:
                out_label += " " + str(exemplars[0].size()[0]) + " visual exemplar."
            else:
                out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
        else:
            out_label = "Nothing specified to detect."

        return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]), new_submit_btn, gr.Tab(visible=True), step_3, state)

    @spaces.GPU
    def count_main(image, text, prompts, device):
        keywords = ""  # do not handle this for now
        # Handle no prompt case.
        if prompts is None:
            prompts = {"image": image, "points": []}
        input_image, _ = transform(image, {"exemplars": torch.tensor([])})
        input_image = input_image.unsqueeze(0).to(device)
        exemplars = get_box_inputs(prompts["points"])

        input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
        input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
        exemplars = [exemplars["exemplars"].to(device)]

        with torch.no_grad():
            model_output = model(
                nested_tensor_from_tensor_list(input_image),
                nested_tensor_from_tensor_list(input_image_exemplars),
                exemplars,
                [torch.tensor([0]).to(device) for _ in range(len(input_image))],
                captions=[text + " ."] * len(input_image),
            )

        ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
        logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
        boxes = model_output["pred_boxes"][0]
        if len(keywords.strip()) > 0:
            box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
        else:
            box_mask = logits.max(dim=-1).values > CONF_THRESH
        logits = logits[box_mask, :].cpu().numpy()
        boxes = boxes[box_mask, :].cpu().numpy()

        # Plot results.
        (w, h) = image.size
        det_map = np.zeros((h, w))
        det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
        det_map = ndimage.gaussian_filter(
            det_map, sigma=(w // 200, w // 200), order=0
        )
        plt.imshow(image)
        plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
        plt.axis('off')
        img_buf = io.BytesIO()
        plt.savefig(img_buf, format='png', bbox_inches='tight')
        plt.close()

        output_img = Image.open(img_buf)

        out_label = "Detected instances predicted with"
        if len(text.strip()) > 0:
            out_label += " text"
            if exemplars[0].size()[0] == 1:
                out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
            elif exemplars[0].size()[0] > 1:
                out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
            else:
                out_label += "."
        elif exemplars[0].size()[0] > 0:
            if exemplars[0].size()[0] == 1:
                out_label += " " + str(exemplars[0].size()[0]) + " visual exemplar."
            else:
                out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
        else:
            out_label = "Nothing specified to detect."

        return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]))

    def remove_label(image):
        return gr.Image(show_label=False)

    def check_submit_btn(exemplar_image_prompts, state):
        if AppSteps.TEXT_AND_EXEMPLARS not in state or len(state) == 3:
            return gr.Button("Count", variant="primary", interactive=True)
        elif exemplar_image_prompts is None:
            return gr.Button("Count", variant="primary", interactive=False)
        elif len(get_box_inputs(exemplar_image_prompts["points"])) > 0:
            return gr.Button("Count", variant="primary", interactive=True)
        else:
            return gr.Button("Count", variant="primary", interactive=False)

    exemplar_img_drawing_instructions_part_1 = '<p><strong>Congrats, you have counted the strawberries!</strong> You can also draw a box around the object you want to count. <strong>Click and drag the mouse on the image below to draw a box around one of the strawberries.</strong> You can click the back button in the top right of the image to delete the box and try again.<img src="file/button-legend.jpg" width="750"></p>'
    exemplar_img_drawing_instructions_part_2 = '<p>The boxes you draw are called \"visual exemplars,\" image examples of what you want the model to count. You can add more boxes around more examples of strawberries in the image above to increase the accuracy of the predicted count. You can also use strawberries from a different image to specify the object to count by uploading or pasting a new image above and drawing boxes around strawberries in it.</p>'
    instructions_main = """
    # How to Use the App
    As shown earlier, there are 3 ways to specify the object to count: (1) with text only, (2) with text and any number of boxes (i.e., "visual exemplars") around example objects, and (3) with visual exemplars only. What is being used is indicated in the top left of the output image. How to try each case is detailed below.
    <ol>
    <li><strong>Text Only: </strong> Only provide text describing the object to count in the textbox titled "What would you like to count?" Delete all boxes drawn on the visual exemplar image.</li>
    <li><strong>Text + Visual Exemplars: </strong> Provide text describing the object to count in the textbox titled "What would you like to count?" and draw at least one box around an example object in the visual exemplar image.</li>
    <li><strong>Visual Exemplars Only: </strong> Remove all text in the textbox titled "What would you like to count?" and draw at least one box around an example object in the visual exemplar image.</li>
    </ol>
    ## Click on the "App" tab at the top of the screen to exit the tutorial and start using the main app!
    """

    with gr.Blocks(title="CountGD: Multi-Modal Open-World Counting", theme="soft", head="""<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1">""") as demo:
        state = gr.State(value=[AppSteps.JUST_TEXT])
        device = gr.State(device)
        with gr.Tab("Tutorial"):
            with gr.Row():
                with gr.Column():
                    with gr.Tab("Step 3", visible=False) as step_3:
                        main_instructions = gr.Markdown(instructions_main)
                    with gr.Tab("Step 2", visible=False) as step_2:
                        gr.Markdown(exemplar_img_drawing_instructions_part_1)
                        exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', show_label=True, value={"image": "strawberry.jpg", "points": []}, interactive=True)
                        with gr.Accordion("Open for Further Information", open=False):
                            gr.Markdown(exemplar_img_drawing_instructions_part_2)
                    with gr.Tab("Step 1", visible=True) as step_1:
                        input_image = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=False, width="30vw")
                        gr.Markdown('# Click "Count" to count the strawberries.')

                with gr.Column():
                    with gr.Tab("Output Image"):
                        detected_instances = gr.Image(label="Detected Instances", show_label='True', interactive=False, visible=True, width="40vw")

            with gr.Row():
                input_text = gr.Textbox(label="What would you like to count?", value="strawberry", interactive=True)
                pred_count = gr.Number(label="Predicted Count", visible=False)
                submit_btn = gr.Button("Count", variant="primary", interactive=True)

            submit_btn.click(fn=remove_label, inputs=[detected_instances], outputs=[detected_instances]).then(fn=count, inputs=[input_image, input_text, exemplar_image, state, device], outputs=[detected_instances, pred_count, submit_btn, step_2, step_3, state])
            exemplar_image.change(check_submit_btn, inputs=[exemplar_image, state], outputs=[submit_btn])
        with gr.Tab("App", visible=True) as main_app:

            gr.Markdown(
                """
                # <center>CountGD: Multi-Modal Open-World Counting
                <center><h3>Count objects with text, visual exemplars, or both together.</h3>
                <h3>Scroll down to try more examples</h3>
                <h3><a href='https://arxiv.org/abs/2407.04619' target='_blank' rel='noopener'>[paper]</a>
                <a href='https://github.com/niki-amini-naieni/CountGD/' target='_blank' rel='noopener'>[code]</a></h3>
                Limitation: this app does not support fine-grained counting based on attributes or visual grounding inputs yet. Note: if the exemplar and text conflict each other, both will be counted.</center>
                """
            )

            with gr.Row():
                with gr.Column():
                    input_image_main = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=True)
                    input_text_main = gr.Textbox(label="What would you like to count?", placeholder="", value="strawberry")
                    exemplar_image_main = ImagePrompter(type='pil', label='Visual Exemplar Image', show_label=True, value={"image": "strawberry.jpg", "points": []}, interactive=True)
                with gr.Column():
                    detected_instances_main = gr.Image(label="Detected Instances", show_label='True', interactive=False)
                    pred_count_main = gr.Number(label="Predicted Count")
                    submit_btn_main = gr.Button("Count", variant="primary")
                    clear_btn_main = gr.ClearButton(variant="secondary")
            gr.Examples(label="Examples: click on a row to load the example. Add visual exemplars by drawing boxes on the loaded \"Visual Exemplar Image.\"", examples=examples, inputs=[input_image_main, input_text_main, exemplar_image_main])
            submit_btn_main.click(fn=remove_label, inputs=[detected_instances_main], outputs=[detected_instances_main]).then(fn=count_main, inputs=[input_image_main, input_text_main, exemplar_image_main, device], outputs=[detected_instances_main, pred_count_main])
            clear_btn_main.add([input_image_main, input_text_main, exemplar_image_main, detected_instances_main, pred_count_main])

    demo.queue().launch(allowed_paths=['back-icon.jpg', 'paste-icon.jpg', 'upload-icon.jpg', 'button-legend.jpg'])
```
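Because argument parsing and model construction now sit under the `if __name__ == '__main__':` guard, importing `app` as a module (for tooling or tests) should no longer parse CLI flags or load weights. A quick hedged check, assuming the Python dependencies are installed:

```bash
# Module-level code in app.py is now just imports and definitions,
# so this should return without building the model or touching argparse.
python3 -c "import app; print('imported app without loading the model')"
```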
models/GroundingDINO/ops/setup.py CHANGED

```diff
@@ -11,8 +11,6 @@ import glob
 
 import torch
 
-import spaces
-
 from torch.utils.cpp_extension import CUDA_HOME
 from torch.utils.cpp_extension import CppExtension
 from torch.utils.cpp_extension import CUDAExtension
@@ -22,7 +20,6 @@ from setuptools import setup
 
 requirements = ["torch", "torchvision"]
 
-#@spaces.GPU
 def get_extensions():
     this_dir = os.path.dirname(os.path.abspath(__file__))
     extensions_dir = os.path.join(this_dir, "src")
@@ -36,12 +33,9 @@ def get_extensions():
     extra_compile_args = {"cxx": []}
     define_macros = []
 
-
-
     print("inside get_extensions")
-    print(torch.cuda.is_available())
     print(CUDA_HOME)
-    if torch.cuda.is_available():
+    if CUDA_HOME is not None and (torch.cuda.is_available() or ("TORCH_CUDA_ARCH_LIST" in os.environ) or torch.cuda.get_arch_list()):
         extension = CUDAExtension
         sources += source_cuda
         define_macros += [("WITH_CUDA", None)]
```
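The widened condition is what lets the CUDA extension compile inside the Dockerfile's `build` stage, where no GPU is attached: `torch.cuda.is_available()` is `False` there, but `CUDA_HOME` is set and `TORCH_CUDA_ARCH_LIST` names the target architectures. A sketch of driving the same build by hand, mirroring the Dockerfile's environment:

```bash
# No GPU needed: CUDA_HOME points at the toolkit and
# TORCH_CUDA_ARCH_LIST supplies the compute capabilities to target.
export CUDA_HOME=/usr/local/cuda
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX"
CC=/usr/bin/gcc-11 python3 setup.py build && pip install .
```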
models/GroundingDINO/ops/test.py CHANGED

```diff
@@ -10,9 +10,8 @@ from __future__ import absolute_import
 from __future__ import print_function
 from __future__ import division
 
-
+
 import torch
-import torch.nn as nn
 from torch.autograd import gradcheck
 
 from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
```
packages.txt CHANGED

```diff
@@ -1,2 +1,3 @@
 build-essential
 ninja-build
+gcc-11
```
requirements.txt CHANGED

```diff
@@ -1,20 +1,18 @@
-cython
-submitit
 scipy
 termcolor
 addict
 yapf==0.40.1
 timm
-torch
-torchvision
-transformers
 numpy
 opencv-python
-supervision==0.6.0
 pycocotools
-pyyaml>3.10
 colorlog
-plotly-express
 setuptools
-
-
+ushlex
+gradio>=4.0.0,<5
+gradio-image-prompter
+spaces
+--extra-index-url https://download.pytorch.org/whl/cu121
+torch
+torchvision
+transformers
```
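With the `--extra-index-url` line inside requirements.txt, a plain `pip install` resolves torch, torchvision, and transformers against the CUDA 12.1 wheel index; no separate flag is needed. For example:

```bash
# pip picks up --extra-index-url from the requirements file itself,
# pulling cu121 builds of torch and torchvision.
python3 -m pip install -r requirements.txt
```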