Hi there

I tried to execute the script with little modifications locally. But always only one cat is detected. Depending on the threshold setting it is the one in front or the one in the back, but never both. Do you have any advice?

Here is the modified code duplicate:

from transformers import pipeline, SamModel, SamProcessor
import torch
import numpy as np
from PIL import Image, ImageDraw
import os

checkpoint = "google/owlv2-base-patch16-ensemble"
detector = pipeline(model=checkpoint, task="zero-shot-object-detection", device="cpu")
sam_model = SamModel.from_pretrained("facebook/sam-vit-base").to("cpu")
sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

def query(image_path, texts, threshold, sam_threshold):
"""
This function analyzes an image for objects specified in the text input.

Args:
image_path: Path to the image file.
texts: Comma-separated list of object labels to search for in the image.
threshold: Confidence threshold for the object detection pipeline.
sam_threshold: Threshold for the second-stage mask-based verification.

Returns:
Path to the annotated image file.
"""

print("Pfad")
print(image_path)
texts = texts.split(",")
image = Image.open(image_path).convert("RGB")
#image_np = np.array(image)
#print(image_np)

Pass the image directly as an argument to the detector

predictions = detector(
image, # Provide the image here
candidate_labels=texts,
threshold=threshold
)

result_labels = []
for pred in predictions:
box = pred["box"]
score = pred["score"]
label = pred["label"]
box = [round(box["xmin"], 2), round(box["ymin"], 2),
round(box["xmax"], 2), round(box["ymax"], 2)]
inputs = sam_processor(
images=image, # Ensure the correct argument is used
input_boxes=[[box]],
return_tensors="pt"
).to("cpu")
with torch.no_grad():
outputs = sam_model(**inputs)

  mask = sam_processor.image_processor.post_process_masks(
      outputs.pred_masks.cpu(),
      inputs["original_sizes"].cpu(),
      inputs["reshaped_input_sizes"].cpu()
  )
  iou_scores = outputs["iou_scores"]

  masks, testscores, boxes = sam_processor.image_processor.filter_masks(
      mask[0],
      iou_scores[0].cpu(),
      inputs["original_sizes"][0].cpu(),
      box,
      pred_iou_thresh=sam_threshold,
  )
  #print(mask[0][0][0])

  result_labels.append((mask[0][0][0].numpy(), label))

Create and save the annotated image

draw = ImageDraw.Draw(image)
print(result_labels)
for mask, label in result_labels:
mask = Image.fromarray((mask * 255).astype(np.uint8), mode='L').resize(image.size)
mask = mask.convert("RGBA")
image.paste(mask, (0, 0), mask)

base_name, ext = os.path.splitext(image_path)
output_path = f"{base_name}_annotated{ext}"
image.save(output_path)

return output_path

Example usage

output_path = query("cats.png", "cat", 0.6, 0.88)
print(f"Annotated image saved as {output_path}")

Spaces:

merve
/

OWLSAM

Running on Zero

Detector only finds one cat.

Pass the image directly as an argument to the detector

Create and save the annotated image

Example usage