nikigoli committed
Commit 0ead84b · verified · 1 parent: 2bb3f92
.dockerignore ADDED
@@ -0,0 +1,10 @@
+ env
+ **/__pycache__
+
+ .git/
+ .github/
+ .gitignore
+
+ *.zip
+ *.whl
+ !gradio_image_prompter-0.1.0-py3-none-any.whl
.gitignore ADDED
@@ -0,0 +1,8 @@
+ # python
+ env/
+ __pycache__
+ .python-version
+
+
+ # vim
+ *.sw[op]
Dockerfile CHANGED
@@ -1,45 +1,92 @@
+ ## Build time
  # Use the specified Python runtime as a parent image
- FROM docker.io/nvidia/cuda:12.1.0-cudnn8-devel-ubi8@sha256:f045009cab64c9fda6113b4473ac1c57dfcca65e18ce981bce63f3cddf7b807a
+ FROM docker.io/nvidia/cuda:12.1.0-devel-ubuntu22.04@sha256:e3a8f7b933e77ecee74731198a2a5483e965b585cea2660675cf4bb152237e9b AS build

  # Set the working directory in the container
  WORKDIR /usr/src/app
+ COPY packages.txt .

- # Install required packages
- RUN apt-get update && apt-get install -y \
- gcc-11 \
- build-essential \
- ffmpeg \
- libsm6 \
- libxext6 \
- curl \
- git \
+ ENV CUDA_HOME=/usr/local/cuda \
+ TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
+ PYTHONDONTWRITEBYTECODE=1 \
+ PYTHONUNBUFFERED=1
+
+
+ # Delete nvidia apt list and Install required packages
+ RUN DEBIAN_FRONTEND=noninteractive \
+ && rm -rf /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list \
+ && apt-key del 7fa2af80 \
+ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub \
+ && apt-get -yq update \
+ && apt-get install --no-install-recommends -yq \
+ apt-transport-https \
+ ca-certificates \
+ $(cat packages.txt) \
+ python3 \
+ python3-dev \
+ python3-pip \
+ python3-venv \
+ ffmpeg \
+ libsm6 \
+ libxext6 \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*

- # Set environment variable to use gcc-11
- ENV CC=/usr/bin/gcc-11
-
- # Copy the current directory contents into the container
- COPY . .
+ # Setup virtual env
+ RUN python3 -m venv /opt/venv
+ ENV PATH="/opt/venv/bin:$PATH"

  # Install any needed packages specified in requirements.txt
- RUN pip install --no-cache-dir -r requirements.txt
+ COPY requirements.txt .
+ RUN --mount=type=cache,id=pip,target=/root/.cache \
+ pip install -r requirements.txt

- # Set the working directory for the GroundingDINO ops
+ # Copy grounding dino ops
  WORKDIR /usr/src/app/models/GroundingDINO/ops
+ COPY models/GroundingDINO/ops .

  # Run the setup script and the test script
- RUN python setup.py build install
- RUN python test.py # This should result in 6 lines of * True
+ RUN CC=/usr/bin/gcc-11 python3 setup.py build && \
+ pip install .
+
+ ## Runtime
+ # Use the specified Python runtime as a parent image
+ FROM ubuntu:22.04

- # Install Gradio
- RUN pip install gradio
+ ENV CUDA_HOME=/usr/local/cuda \
+ TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
+ PYTHONDONTWRITEBYTECODE=1 \
+ PYTHONUNBUFFERED=1 \
+ PATH="/opt/venv/bin:/home/user/.local/bin:$PATH" \
+ HOME=/home/user

- # Change back to the original working directory
- WORKDIR /usr/src/app
+ RUN DEBIAN_FRONTEND=noninteractive apt-get -yq update && apt-get install --no-install-recommends -yq \
+ python3 \
+ python3-dev \
+ python3-pip \
+ ffmpeg \
+ libsm6 \
+ libxext6 \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*

+ WORKDIR /app
+ RUN useradd -m -u 1000 user && chown -R user /app
+
+ COPY --from=build --chown=user /opt/venv /opt/venv
+
+ COPY --chown=user checkpoints checkpoints
+ COPY --chown=user checkpoint_best_regular.pth .
+ COPY --chown=user *.jpg *.JPG ./
+ COPY --chown=user datasets datasets
+ COPY --chown=user groundingdino groundingdino
+ COPY --chown=user models models
+ COPY --chown=user util util
+ COPY --chown=user app.py cfg_app.py ./
+
+ USER user
  # Expose the port Gradio will run on
  EXPOSE 7860
-
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
  # Default command to run the Gradio app
- CMD ["python", "app.py"]
+ CMD ["/opt/venv/bin/python3", "app.py"]
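The rewritten Dockerfile is a two-stage build: a CUDA `devel` stage compiles the GroundingDINO ops into a virtual environment, and a plain `ubuntu:22.04` stage copies that venv plus the app files. Because the build stage uses `RUN --mount=type=cache`, BuildKit must be enabled when building locally. A minimal sketch, assuming you build from the repository root and that `checkpoints/` and `checkpoint_best_regular.pth` are present in the build context (the `countgd:local` tag is just an example):

```bash
# BuildKit is required for the pip cache mount in the build stage.
DOCKER_BUILDKIT=1 docker build -t countgd:local .
```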
README.md CHANGED
@@ -1,7 +1,22 @@
  ---
  title: CountGD_Multi-Modal_Open-World_Counting
- app_file: app.py
- sdk: gradio
- sdk_version: 4.44.1
+ sdk: docker
+ app_port: 7860
  ---
- nohup python -u app.py &
+ # CountGD: Multi-Modal Open-World Counting Model
+
+ To run locally, the best method is to use Docker.
+
+ Make sure you have installed Docker, the NVIDIA driver, and the NVIDIA Container Toolkit for your platform.
+
+ Then you can run the app locally with the following command:
+
+ ```bash
+ docker run -it \
+ --name countgd \
+ -p 7860:7860 \
+ --platform=linux/amd64 \
+ --gpus all \
+ registry.hf.space/nikigoli-countgd:latest \
+ python app.py
+ ```
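The command above pulls the prebuilt Space image from `registry.hf.space`. If you have instead built the image yourself from the Dockerfile in this commit, a roughly equivalent invocation is sketched below; the `countgd:local` tag is an assumption, and no command override is needed because the image already defines `CMD` and `GRADIO_SERVER_NAME`:

```bash
docker run -it \
  --name countgd \
  -p 7860:7860 \
  --gpus all \
  countgd:local
```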
app.py CHANGED
@@ -1,16 +1,10 @@
  import spaces
  import gradio as gr
- import copy
  import random
  import torch
- import PIL
- from PIL import Image, ImageDraw, ImageFont
- import torchvision.transforms.functional as F
+ from PIL import Image
  import numpy as np
  import argparse
- import json
- import plotly.express as px
- import pandas as pd
  from util.slconfig import SLConfig, DictAction
  from util.misc import nested_tensor_from_tensor_list
  import datasets.transforms as T
@@ -30,53 +24,9 @@ cwd = os.getcwd()
  import warnings
  warnings.filterwarnings("ignore")

- # Installing dependencies not in requirements.txt
- subprocess.run(
- shlex.split(
- "pip install gradio_image_prompter-0.1.0-py3-none-any.whl"
- )
- )
  from gradio_image_prompter import ImagePrompter
- """
- subprocess.run(
- shlex.split(
- "pip install MultiScaleDeformableAttention-1.0-cp310-cp310-linux_x86_64.whl"
- )
- )
- """
- """
- subprocess.run(
- shlex.split(
- "python test.py"
- )
- )
- """
- #with open('./switch_cuda.sh', 'rb') as file:
- # script = file.read()
- #call(script, shell=True)
-
- with open('./build_ops.sh', 'rb') as file:
- script = file.read()
- call(script, shell=True)
-
- def find_cuda():
- # Check if CUDA_HOME or CUDA_PATH environment variables are set
- cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
-
- if cuda_home and os.path.exists(cuda_home):
- return cuda_home
-
- # Search for the nvcc executable in the system's PATH
- nvcc_path = shutil.which('nvcc')
-
- if nvcc_path:
- # Remove the 'bin/nvcc' part to get the CUDA installation path
- cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
- return cuda_path

- return None

- cuda_path = find_cuda()

  class AppSteps(Enum):
  JUST_TEXT = 1
@@ -195,13 +145,6 @@ def build_model_and_transforms(args):

  return model, data_transform

-
- parser = argparse.ArgumentParser("Counting Application", parents=[get_args_parser()])
- args = parser.parse_args()
- device = get_device()
- model, transform = build_model_and_transforms(args)
- model = model.to(device)
-
  examples = [
  ["strawberry.jpg", "strawberry", {"image": "strawberry.jpg"}],
  ["strawberry.jpg", "blueberry", {"image": "strawberry.jpg"}],
@@ -254,242 +197,250 @@ def get_ind_to_filter(text, word_ids, keywords):

  return inds_to_filter

- @spaces.GPU(duration=120)
- def count(image, text, prompts, state, device):
-
- keywords = "" # do not handle this for now
-
- # Handle no prompt case.
- if prompts is None:
- prompts = {"image": image, "points": []}
- input_image, _ = transform(image, {"exemplars": torch.tensor([])})
- input_image = input_image.unsqueeze(0).to(device)
- exemplars = get_box_inputs(prompts["points"])
-
- input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
- input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
- exemplars = [exemplars["exemplars"].to(device)]
-
- with torch.no_grad():
- model_output = model(
- nested_tensor_from_tensor_list(input_image),
- nested_tensor_from_tensor_list(input_image_exemplars),
- exemplars,
- [torch.tensor([0]).to(device) for _ in range(len(input_image))],
- captions=[text + " ."] * len(input_image),
- )
-
- ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
- logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
- boxes = model_output["pred_boxes"][0]
- if len(keywords.strip()) > 0:
- box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
- else:
- box_mask = logits.max(dim=-1).values > CONF_THRESH
- logits = logits[box_mask, :].cpu().numpy()
- boxes = boxes[box_mask, :].cpu().numpy()
-
- # Plot results.
- (w, h) = image.size
- det_map = np.zeros((h, w))
- det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
- det_map = ndimage.gaussian_filter(
- det_map, sigma=(w // 200, w // 200), order=0
- )
- plt.imshow(image)
- plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
- plt.axis('off')
- img_buf = io.BytesIO()
- plt.savefig(img_buf, format='png', bbox_inches='tight')
- plt.close()
-
- output_img = Image.open(img_buf)
-
- if AppSteps.TEXT_AND_EXEMPLARS not in state:
- exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
- new_submit_btn = gr.Button("Count", variant="primary", interactive=False)
- state = [AppSteps.JUST_TEXT, AppSteps.TEXT_AND_EXEMPLARS]
- main_instructions_comp = gr.Markdown(visible=False)
- step_3 = gr.Tab(visible=False)
- elif AppSteps.FULL_APP not in state:
- exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
- new_submit_btn = submit_btn
- state = [AppSteps.JUST_TEXT, AppSteps.TEXT_AND_EXEMPLARS, AppSteps.FULL_APP]
- main_instructions_comp = gr.Markdown(visible=True)
- step_3 = gr.Tab(visible=True)
- else:
- exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
- new_submit_btn = submit_btn
- main_instructions_comp = gr.Markdown(visible=True)
- step_3 = gr.Tab(visible=True)
-
- out_label = "Detected instances predicted with"
- if len(text.strip()) > 0:
- out_label += " text"
- if exemplars[0].size()[0] == 1:
- out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
- elif exemplars[0].size()[0] > 1:
- out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
+ if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser("Counting Application", parents=[get_args_parser()])
+ args = parser.parse_args()
+ device = get_device()
+ model, transform = build_model_and_transforms(args)
+ model = model.to(device)
+
+ @spaces.GPU(duration=120)
+ def count(image, text, prompts, state, device):
+
+ keywords = "" # do not handle this for now
+
+ # Handle no prompt case.
+ if prompts is None:
+ prompts = {"image": image, "points": []}
+ input_image, _ = transform(image, {"exemplars": torch.tensor([])})
+ input_image = input_image.unsqueeze(0).to(device)
+ exemplars = get_box_inputs(prompts["points"])
+
+ input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
+ input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
+ exemplars = [exemplars["exemplars"].to(device)]
+
+ with torch.no_grad():
+ model_output = model(
+ nested_tensor_from_tensor_list(input_image),
+ nested_tensor_from_tensor_list(input_image_exemplars),
+ exemplars,
+ [torch.tensor([0]).to(device) for _ in range(len(input_image))],
+ captions=[text + " ."] * len(input_image),
+ )
+
+ ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
+ logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
+ boxes = model_output["pred_boxes"][0]
+ if len(keywords.strip()) > 0:
+ box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
  else:
- out_label += "."
- elif exemplars[0].size()[0] > 0:
- if exemplars[0].size()[0] == 1:
- out_label += " " + str(exemplars[0].size()[0]) + " visual exemplar."
+ box_mask = logits.max(dim=-1).values > CONF_THRESH
+ logits = logits[box_mask, :].cpu().numpy()
+ boxes = boxes[box_mask, :].cpu().numpy()
+
+ # Plot results.
+ (w, h) = image.size
+ det_map = np.zeros((h, w))
+ det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
+ det_map = ndimage.gaussian_filter(
+ det_map, sigma=(w // 200, w // 200), order=0
+ )
+ plt.imshow(image)
+ plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
+ plt.axis('off')
+ img_buf = io.BytesIO()
+ plt.savefig(img_buf, format='png', bbox_inches='tight')
+ plt.close()
+
+ output_img = Image.open(img_buf)
+
+ if AppSteps.TEXT_AND_EXEMPLARS not in state:
+ exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
+ new_submit_btn = gr.Button("Count", variant="primary", interactive=False)
+ state = [AppSteps.JUST_TEXT, AppSteps.TEXT_AND_EXEMPLARS]
+ main_instructions_comp = gr.Markdown(visible=False)
+ step_3 = gr.Tab(visible=False)
+ elif AppSteps.FULL_APP not in state:
+ exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
+ new_submit_btn = submit_btn
+ state = [AppSteps.JUST_TEXT, AppSteps.TEXT_AND_EXEMPLARS, AppSteps.FULL_APP]
+ main_instructions_comp = gr.Markdown(visible=True)
+ step_3 = gr.Tab(visible=True)
  else:
- out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
- else:
- out_label = "Nothing specified to detect."
-
- return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]), new_submit_btn, gr.Tab(visible=True), step_3, state)
-
- @spaces.GPU
- def count_main(image, text, prompts, device):
- keywords = "" # do not handle this for now
- # Handle no prompt case.
- if prompts is None:
- prompts = {"image": image, "points": []}
- input_image, _ = transform(image, {"exemplars": torch.tensor([])})
- input_image = input_image.unsqueeze(0).to(device)
- exemplars = get_box_inputs(prompts["points"])
-
- input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
- input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
- exemplars = [exemplars["exemplars"].to(device)]
-
- with torch.no_grad():
- model_output = model(
- nested_tensor_from_tensor_list(input_image),
- nested_tensor_from_tensor_list(input_image_exemplars),
- exemplars,
- [torch.tensor([0]).to(device) for _ in range(len(input_image))],
- captions=[text + " ."] * len(input_image),
- )
-
- ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
- logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
- boxes = model_output["pred_boxes"][0]
- if len(keywords.strip()) > 0:
- box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
- else:
- box_mask = logits.max(dim=-1).values > CONF_THRESH
- logits = logits[box_mask, :].cpu().numpy()
- boxes = boxes[box_mask, :].cpu().numpy()
-
- # Plot results.
- (w, h) = image.size
- det_map = np.zeros((h, w))
- det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
- det_map = ndimage.gaussian_filter(
- det_map, sigma=(w // 200, w // 200), order=0
- )
- plt.imshow(image)
- plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
- plt.axis('off')
- img_buf = io.BytesIO()
- plt.savefig(img_buf, format='png', bbox_inches='tight')
- plt.close()
-
- output_img = Image.open(img_buf)
-
- out_label = "Detected instances predicted with"
- if len(text.strip()) > 0:
- out_label += " text"
- if exemplars[0].size()[0] == 1:
- out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
- elif exemplars[0].size()[0] > 1:
- out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
+ exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', value=prompts, interactive=True, visible=True)
+ new_submit_btn = submit_btn
+ main_instructions_comp = gr.Markdown(visible=True)
+ step_3 = gr.Tab(visible=True)
+
+ out_label = "Detected instances predicted with"
+ if len(text.strip()) > 0:
+ out_label += " text"
+ if exemplars[0].size()[0] == 1:
+ out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
+ elif exemplars[0].size()[0] > 1:
+ out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
+ else:
+ out_label += "."
+ elif exemplars[0].size()[0] > 0:
+ if exemplars[0].size()[0] == 1:
+ out_label += " " + str(exemplars[0].size()[0]) + " visual exemplar."
+ else:
+ out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
  else:
- out_label += "."
- elif exemplars[0].size()[0] > 0:
- if exemplars[0].size()[0] == 1:
- out_label += " " + str(exemplars[0].size()[0]) + " visual exemplar."
+ out_label = "Nothing specified to detect."
+
+ return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]), new_submit_btn, gr.Tab(visible=True), step_3, state)
+
+ @spaces.GPU
+ def count_main(image, text, prompts, device):
+ keywords = "" # do not handle this for now
+ # Handle no prompt case.
+ if prompts is None:
+ prompts = {"image": image, "points": []}
+ input_image, _ = transform(image, {"exemplars": torch.tensor([])})
+ input_image = input_image.unsqueeze(0).to(device)
+ exemplars = get_box_inputs(prompts["points"])
+
+ input_image_exemplars, exemplars = transform(prompts["image"], {"exemplars": torch.tensor(exemplars)})
+ input_image_exemplars = input_image_exemplars.unsqueeze(0).to(device)
+ exemplars = [exemplars["exemplars"].to(device)]
+
+ with torch.no_grad():
+ model_output = model(
+ nested_tensor_from_tensor_list(input_image),
+ nested_tensor_from_tensor_list(input_image_exemplars),
+ exemplars,
+ [torch.tensor([0]).to(device) for _ in range(len(input_image))],
+ captions=[text + " ."] * len(input_image),
+ )
+
+ ind_to_filter = get_ind_to_filter(text, model_output["token"][0].word_ids, keywords)
+ logits = model_output["pred_logits"].sigmoid()[0][:, ind_to_filter]
+ boxes = model_output["pred_boxes"][0]
+ if len(keywords.strip()) > 0:
+ box_mask = (logits > CONF_THRESH).sum(dim=-1) == len(ind_to_filter)
  else:
- out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
- else:
- out_label = "Nothing specified to detect."
-
- return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]))
-
- def remove_label(image):
- return gr.Image(show_label=False)
-
- def check_submit_btn(exemplar_image_prompts, state):
- if AppSteps.TEXT_AND_EXEMPLARS not in state or len(state) == 3:
- return gr.Button("Count", variant="primary", interactive=True)
- elif exemplar_image_prompts is None:
- return gr.Button("Count", variant="primary", interactive=False)
- elif len(get_box_inputs(exemplar_image_prompts["points"])) > 0:
- return gr.Button("Count", variant="primary", interactive=True)
- else:
- return gr.Button("Count", variant="primary", interactive=False)
-
- exemplar_img_drawing_instructions_part_1 = '<p><strong>Congrats, you have counted the strawberries!</strong> You can also draw a box around the object you want to count. <strong>Click and drag the mouse on the image below to draw a box around one of the strawberries.</strong> You can click the back button in the top right of the image to delete the box and try again.<img src="file/button-legend.jpg" width="750"></p>'
- exemplar_img_drawing_instructions_part_2 = '<p>The boxes you draw are called \"visual exemplars,\" image examples of what you want the model to count. You can add more boxes around more examples of strawberries in the image above to increase the accuracy of the predicted count. You can also use strawberries from a different image to specify the object to count by uploading or pasting a new image above and drawing boxes around strawberries in it.</p>'
- instructions_main = """
- # How to Use the App
- As shown earlier, there are 3 ways to specify the object to count: (1) with text only, (2) with text and any number of boxes (i.e., "visual exemplars") around example objects, and (3) with visual exemplars only. What is being used is indicated in the top left of the output image. How to try each case is detailed below.
- <ol>
- <li><strong>Text Only: </strong> Only provide text describing the object to count in the textbox titled "What would you like to count?" Delete all boxes drawn on the visual exemplar image.</li>
- <li><strong>Text + Visual Exemplars: </strong> Provide text describing the object to count in the textbox titled "What would you like to count?" and draw at least one box around an example object in the visual exemplar image.</li>
- <li><strong>Visual Exemplars Only: </strong> Remove all text in the textbox titled "What would you like to count?" and draw at least one box around an example object in the visual exemplar image.</li>
- </ol>
- ## Click on the "App" tab at the top of the screen to exit the tutorial and start using the main app!
- """
-
- with gr.Blocks(title="CountGD: Multi-Modal Open-World Counting", theme="soft", head="""<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1">""") as demo:
- state = gr.State(value=[AppSteps.JUST_TEXT])
- device = gr.State(device)
- with gr.Tab("Tutorial"):
- with gr.Row():
- with gr.Column():
- with gr.Tab("Step 3", visible=False) as step_3:
- main_instructions = gr.Markdown(instructions_main)
- with gr.Tab("Step 2", visible=False) as step_2:
- gr.Markdown(exemplar_img_drawing_instructions_part_1)
- exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', show_label=True, value={"image": "strawberry.jpg", "points": []}, interactive=True)
- with gr.Accordion("Open for Further Information", open=False):
- gr.Markdown(exemplar_img_drawing_instructions_part_2)
- with gr.Tab("Step 1", visible=True) as step_1:
- input_image = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=False, width="30vw")
- gr.Markdown('# Click "Count" to count the strawberries.')
-
- with gr.Column():
- with gr.Tab("Output Image"):
- detected_instances = gr.Image(label="Detected Instances", show_label='True', interactive=False, visible=True, width="40vw")
-
- with gr.Row():
- input_text = gr.Textbox(label="What would you like to count?", value="strawberry", interactive=True)
- pred_count = gr.Number(label="Predicted Count", visible=False)
- submit_btn = gr.Button("Count", variant="primary", interactive=True)
-
- submit_btn.click(fn=remove_label, inputs=[detected_instances], outputs=[detected_instances]).then(fn=count, inputs=[input_image, input_text, exemplar_image, state, device], outputs=[detected_instances, pred_count, submit_btn, step_2, step_3, state])
- exemplar_image.change(check_submit_btn, inputs=[exemplar_image, state], outputs=[submit_btn])
- with gr.Tab("App", visible=True) as main_app:
-
- gr.Markdown(
- """
- # <center>CountGD: Multi-Modal Open-World Counting
- <center><h3>Count objects with text, visual exemplars, or both together.</h3>
- <h3>Scroll down to try more examples</h3>
- <h3><a href='https://arxiv.org/abs/2407.04619' target='_blank' rel='noopener'>[paper]</a>
- <a href='https://github.com/niki-amini-naieni/CountGD/' target='_blank' rel='noopener'>[code]</a></h3>
- Limitation: this app does not support fine-grained counting based on attributes or visual grounding inputs yet. Note: if the exemplar and text conflict each other, both will be counted.</center>
- """
- )
-
- with gr.Row():
- with gr.Column():
- input_image_main = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=True)
- input_text_main = gr.Textbox(label="What would you like to count?", placeholder="", value="strawberry")
- exemplar_image_main = ImagePrompter(type='pil', label='Visual Exemplar Image', show_label=True, value={"image": "strawberry.jpg", "points": []}, interactive=True)
- with gr.Column():
- detected_instances_main = gr.Image(label="Detected Instances", show_label='True', interactive=False)
- pred_count_main = gr.Number(label="Predicted Count")
- submit_btn_main = gr.Button("Count", variant="primary")
- clear_btn_main = gr.ClearButton(variant="secondary")
- gr.Examples(label="Examples: click on a row to load the example. Add visual exemplars by drawing boxes on the loaded \"Visual Exemplar Image.\"", examples=examples, inputs=[input_image_main, input_text_main, exemplar_image_main])
- submit_btn_main.click(fn=remove_label, inputs=[detected_instances_main], outputs=[detected_instances_main]).then(fn=count_main, inputs=[input_image_main, input_text_main, exemplar_image_main, device], outputs=[detected_instances_main, pred_count_main])
- clear_btn_main.add([input_image_main, input_text_main, exemplar_image_main, detected_instances_main, pred_count_main])
-
-
- demo.queue().launch(allowed_paths=['back-icon.jpg', 'paste-icon.jpg', 'upload-icon.jpg', 'button-legend.jpg'])
+ box_mask = logits.max(dim=-1).values > CONF_THRESH
+ logits = logits[box_mask, :].cpu().numpy()
+ boxes = boxes[box_mask, :].cpu().numpy()
+
+ # Plot results.
+ (w, h) = image.size
+ det_map = np.zeros((h, w))
+ det_map[(h * boxes[:, 1]).astype(int), (w * boxes[:, 0]).astype(int)] = 1
+ det_map = ndimage.gaussian_filter(
+ det_map, sigma=(w // 200, w // 200), order=0
+ )
+ plt.imshow(image)
+ plt.imshow(det_map[None, :].transpose(1, 2, 0), 'jet', interpolation='none', alpha=0.7)
+ plt.axis('off')
+ img_buf = io.BytesIO()
+ plt.savefig(img_buf, format='png', bbox_inches='tight')
+ plt.close()
+
+ output_img = Image.open(img_buf)
+
+ out_label = "Detected instances predicted with"
+ if len(text.strip()) > 0:
+ out_label += " text"
+ if exemplars[0].size()[0] == 1:
+ out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplar."
+ elif exemplars[0].size()[0] > 1:
+ out_label += " and " + str(exemplars[0].size()[0]) + " visual exemplars."
+ else:
+ out_label += "."
+ elif exemplars[0].size()[0] > 0:
+ if exemplars[0].size()[0] == 1:
+ out_label += " " + str(exemplars[0].size()[0]) + " visual exemplar."
+ else:
+ out_label += " " + str(exemplars[0].size()[0]) + " visual exemplars."
+ else:
+ out_label = "Nothing specified to detect."
+
+ return (gr.Image(output_img, visible=True, label=out_label, show_label=True), gr.Number(label="Predicted Count", visible=True, value=boxes.shape[0]))
+
+ def remove_label(image):
+ return gr.Image(show_label=False)
+
+ def check_submit_btn(exemplar_image_prompts, state):
+ if AppSteps.TEXT_AND_EXEMPLARS not in state or len(state) == 3:
+ return gr.Button("Count", variant="primary", interactive=True)
+ elif exemplar_image_prompts is None:
+ return gr.Button("Count", variant="primary", interactive=False)
+ elif len(get_box_inputs(exemplar_image_prompts["points"])) > 0:
+ return gr.Button("Count", variant="primary", interactive=True)
+ else:
+ return gr.Button("Count", variant="primary", interactive=False)
+
+ exemplar_img_drawing_instructions_part_1 = '<p><strong>Congrats, you have counted the strawberries!</strong> You can also draw a box around the object you want to count. <strong>Click and drag the mouse on the image below to draw a box around one of the strawberries.</strong> You can click the back button in the top right of the image to delete the box and try again.<img src="file/button-legend.jpg" width="750"></p>'
+ exemplar_img_drawing_instructions_part_2 = '<p>The boxes you draw are called \"visual exemplars,\" image examples of what you want the model to count. You can add more boxes around more examples of strawberries in the image above to increase the accuracy of the predicted count. You can also use strawberries from a different image to specify the object to count by uploading or pasting a new image above and drawing boxes around strawberries in it.</p>'
+ instructions_main = """
+ # How to Use the App
+ As shown earlier, there are 3 ways to specify the object to count: (1) with text only, (2) with text and any number of boxes (i.e., "visual exemplars") around example objects, and (3) with visual exemplars only. What is being used is indicated in the top left of the output image. How to try each case is detailed below.
+ <ol>
+ <li><strong>Text Only: </strong> Only provide text describing the object to count in the textbox titled "What would you like to count?" Delete all boxes drawn on the visual exemplar image.</li>
+ <li><strong>Text + Visual Exemplars: </strong> Provide text describing the object to count in the textbox titled "What would you like to count?" and draw at least one box around an example object in the visual exemplar image.</li>
+ <li><strong>Visual Exemplars Only: </strong> Remove all text in the textbox titled "What would you like to count?" and draw at least one box around an example object in the visual exemplar image.</li>
+ </ol>
+ ## Click on the "App" tab at the top of the screen to exit the tutorial and start using the main app!
+ """
+
+ with gr.Blocks(title="CountGD: Multi-Modal Open-World Counting", theme="soft", head="""<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=1">""") as demo:
+ state = gr.State(value=[AppSteps.JUST_TEXT])
+ device = gr.State(device)
+ with gr.Tab("Tutorial"):
+ with gr.Row():
+ with gr.Column():
+ with gr.Tab("Step 3", visible=False) as step_3:
+ main_instructions = gr.Markdown(instructions_main)
+ with gr.Tab("Step 2", visible=False) as step_2:
+ gr.Markdown(exemplar_img_drawing_instructions_part_1)
+ exemplar_image = ImagePrompter(type='pil', label='Visual Exemplar Image', show_label=True, value={"image": "strawberry.jpg", "points": []}, interactive=True)
+ with gr.Accordion("Open for Further Information", open=False):
+ gr.Markdown(exemplar_img_drawing_instructions_part_2)
+ with gr.Tab("Step 1", visible=True) as step_1:
+ input_image = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=False, width="30vw")
+ gr.Markdown('# Click "Count" to count the strawberries.')
+
+ with gr.Column():
+ with gr.Tab("Output Image"):
+ detected_instances = gr.Image(label="Detected Instances", show_label='True', interactive=False, visible=True, width="40vw")
+
+ with gr.Row():
+ input_text = gr.Textbox(label="What would you like to count?", value="strawberry", interactive=True)
+ pred_count = gr.Number(label="Predicted Count", visible=False)
+ submit_btn = gr.Button("Count", variant="primary", interactive=True)
+
+ submit_btn.click(fn=remove_label, inputs=[detected_instances], outputs=[detected_instances]).then(fn=count, inputs=[input_image, input_text, exemplar_image, state, device], outputs=[detected_instances, pred_count, submit_btn, step_2, step_3, state])
+ exemplar_image.change(check_submit_btn, inputs=[exemplar_image, state], outputs=[submit_btn])
+ with gr.Tab("App", visible=True) as main_app:
+
+ gr.Markdown(
+ """
+ # <center>CountGD: Multi-Modal Open-World Counting
+ <center><h3>Count objects with text, visual exemplars, or both together.</h3>
+ <h3>Scroll down to try more examples</h3>
+ <h3><a href='https://arxiv.org/abs/2407.04619' target='_blank' rel='noopener'>[paper]</a>
+ <a href='https://github.com/niki-amini-naieni/CountGD/' target='_blank' rel='noopener'>[code]</a></h3>
+ Limitation: this app does not support fine-grained counting based on attributes or visual grounding inputs yet. Note: if the exemplar and text conflict each other, both will be counted.</center>
+ """
+ )
+
+ with gr.Row():
+ with gr.Column():
+ input_image_main = gr.Image(type='pil', label='Input Image', show_label='True', value="strawberry.jpg", interactive=True)
+ input_text_main = gr.Textbox(label="What would you like to count?", placeholder="", value="strawberry")
+ exemplar_image_main = ImagePrompter(type='pil', label='Visual Exemplar Image', show_label=True, value={"image": "strawberry.jpg", "points": []}, interactive=True)
+ with gr.Column():
+ detected_instances_main = gr.Image(label="Detected Instances", show_label='True', interactive=False)
+ pred_count_main = gr.Number(label="Predicted Count")
+ submit_btn_main = gr.Button("Count", variant="primary")
+ clear_btn_main = gr.ClearButton(variant="secondary")
+ gr.Examples(label="Examples: click on a row to load the example. Add visual exemplars by drawing boxes on the loaded \"Visual Exemplar Image.\"", examples=examples, inputs=[input_image_main, input_text_main, exemplar_image_main])
+ submit_btn_main.click(fn=remove_label, inputs=[detected_instances_main], outputs=[detected_instances_main]).then(fn=count_main, inputs=[input_image_main, input_text_main, exemplar_image_main, device], outputs=[detected_instances_main, pred_count_main])
+ clear_btn_main.add([input_image_main, input_text_main, exemplar_image_main, detected_instances_main, pred_count_main])
+
+
+ demo.queue().launch(allowed_paths=['back-icon.jpg', 'paste-icon.jpg', 'upload-icon.jpg', 'button-legend.jpg'])
models/GroundingDINO/ops/setup.py CHANGED
@@ -11,8 +11,6 @@ import glob

  import torch

- import spaces
-
  from torch.utils.cpp_extension import CUDA_HOME
  from torch.utils.cpp_extension import CppExtension
  from torch.utils.cpp_extension import CUDAExtension
@@ -22,7 +20,6 @@ from setuptools import setup

  requirements = ["torch", "torchvision"]

- #@spaces.GPU
  def get_extensions():
  this_dir = os.path.dirname(os.path.abspath(__file__))
  extensions_dir = os.path.join(this_dir, "src")
@@ -36,12 +33,9 @@ def get_extensions():
  extra_compile_args = {"cxx": []}
  define_macros = []

-
-
  print("inside get_extensions")
- print(torch.cuda.is_available())
  print(CUDA_HOME)
- if torch.cuda.is_available():
+ if CUDA_HOME is not None and (torch.cuda.is_available() or ("TORCH_CUDA_ARCH_LIST" in os.environ) or torch.cuda.get_arch_list()):
  extension = CUDAExtension
  sources += source_cuda
  define_macros += [("WITH_CUDA", None)]
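With this condition, the CUDA extension can be compiled on a machine (or in a Docker build stage) where no GPU is visible, as long as `CUDA_HOME` resolves and `TORCH_CUDA_ARCH_LIST` is exported — which is how the new Dockerfile drives it. A sketch of the equivalent manual build, mirroring the Dockerfile's `RUN` line (paths assume the CUDA 12.1 toolkit and gcc-11 are installed):

```bash
# Build and install the MultiScaleDeformableAttention op without a visible GPU.
cd models/GroundingDINO/ops
CC=/usr/bin/gcc-11 \
CUDA_HOME=/usr/local/cuda \
TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
python3 setup.py build && pip install .
```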
models/GroundingDINO/ops/test.py CHANGED
@@ -10,9 +10,8 @@ from __future__ import absolute_import
  from __future__ import print_function
  from __future__ import division

- import time
+
  import torch
- import torch.nn as nn
  from torch.autograd import gradcheck

  from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
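The old Dockerfile's comment notes that `test.py` should print six lines ending in `* True` (a `gradcheck` of the deformable-attention op). A hedged way to re-run that check inside the new runtime image — the `countgd:local` tag is assumed, and a GPU plus the NVIDIA Container Toolkit are required since the op only builds with CUDA:

```bash
docker run --rm --gpus all \
  -w /app/models/GroundingDINO/ops \
  countgd:local \
  python3 test.py   # expect six lines ending in "* True"
```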
packages.txt CHANGED
@@ -1,2 +1,3 @@
  build-essential
  ninja-build
+ gcc-11
requirements.txt CHANGED
@@ -1,20 +1,18 @@
- cython
- submitit
  scipy
  termcolor
  addict
  yapf==0.40.1
  timm
- torch
- torchvision
- transformers
  numpy
  opencv-python
- supervision==0.6.0
  pycocotools
- pyyaml>3.10
  colorlog
- plotly-express
  setuptools
- wheel
- ushlex
+ ushlex
+ gradio>=4.0.0,<5
+ gradio-image-prompter
+ spaces
+ --extra-index-url https://download.pytorch.org/whl/cu121
+ torch
+ torchvision
+ transformers
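torch and torchvision now come from the cu121 wheel index declared inside requirements.txt itself, so a plain `pip install -r requirements.txt` picks them up. Outside Docker, the environment can be reproduced roughly the way the build stage does it — a sketch, assuming Python 3.10+ and a CUDA 12.1-capable driver (the `env` directory name matches the new .gitignore/.dockerignore entries):

```bash
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt   # the --extra-index-url for cu121 wheels is read from the file
```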