qubvel-hf (HF staff) committed
Commit fadd9bd · verified · Parent: b31249e

Update app.py

Files changed (1):
  1. app.py +9 -66
app.py CHANGED
@@ -19,92 +19,35 @@ import torch
  import tqdm
- from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation
+ from transformers import AutoModelForObjectDetection, AutoProcessor

- DESCRIPTION = "# ViTPose"
+ DESCRIPTION = "# DAB-DETR"

  MAX_NUM_FRAMES = 300

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- person_detector_name = "PekingU/rtdetr_r50vd_coco_o365"
- person_image_processor = AutoProcessor.from_pretrained(person_detector_name)
- person_model = RTDetrForObjectDetection.from_pretrained(person_detector_name, device_map=device)
-
- pose_model_name = "usyd-community/vitpose-base-simple"
- pose_image_processor = AutoProcessor.from_pretrained(pose_model_name)
- pose_model = VitPoseForPoseEstimation.from_pretrained(pose_model_name, device_map=device)
+ checkpoint = "IDEA-Research/dab-detr-resnet-50-dc5-pat3"
+ image_processor = AutoProcessor.from_pretrained(checkpoint)
+ model = AutoModelForObjectDetection.from_pretrained(checkpoint, device_map=device)


  @spaces.GPU(duration=5)
  @torch.inference_mode()
- def process_image(image: PIL.Image.Image) -> tuple[PIL.Image.Image, list[dict]]:
-     inputs = person_image_processor(images=image, return_tensors="pt").to(device)
-     outputs = person_model(**inputs)
-     results = person_image_processor.post_process_object_detection(
+ def process_image(image: PIL.Image.Image) -> PIL.Image.Image:
+     inputs = image_processor(images=image, return_tensors="pt").to(device)
+     outputs = model(**inputs)
+     results = image_processor.post_process_object_detection(
          outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
      )
      result = results[0] # take first image results
+     boxes_xyxy = result["boxes"].cpu().numpy()

-     # Human label refers 0 index in COCO dataset
-     person_boxes_xyxy = result["boxes"][result["labels"] == 0]
-     person_boxes_xyxy = person_boxes_xyxy.cpu().numpy()
-
-     # Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format
-     person_boxes = person_boxes_xyxy.copy()
-     person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
-     person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
-
-     inputs = pose_image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
-
-     # for vitpose-plus-base checkpoint we should additionally provide dataset_index
-     # to specify which MOE experts to use for inference
-     if pose_model.config.backbone_config.num_experts > 1:
-         dataset_index = torch.tensor([0] * len(inputs["pixel_values"]))
-         dataset_index = dataset_index.to(inputs["pixel_values"].device)
-         inputs["dataset_index"] = dataset_index
-
-     outputs = pose_model(**inputs)
-
-     pose_results = pose_image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
-     image_pose_result = pose_results[0] # results for first image
-
-     # make results more human-readable
-     human_readable_results = []
-     for i, person_pose in enumerate(image_pose_result):
-         data = {
-             "person_id": i,
-             "bbox": person_pose["bbox"].numpy().tolist(),
-             "keypoints": [],
-         }
-         for keypoint, label, score in zip(
-             person_pose["keypoints"], person_pose["labels"], person_pose["scores"], strict=True
-         ):
-             keypoint_name = pose_model.config.id2label[label.item()]
-             x, y = keypoint
-             data["keypoints"].append({"name": keypoint_name, "x": x.item(), "y": y.item(), "score": score.item()})
-         human_readable_results.append(data)
-
-     # preprocess to torch tensor of shape (n_objects, n_keypoints, 2)
-     xy = [pose_result["keypoints"] for pose_result in image_pose_result]
-     xy = torch.stack(xy).cpu().numpy()
-
-     scores = [pose_result["scores"] for pose_result in image_pose_result]
-     scores = torch.stack(scores).cpu().numpy()
-
-     keypoints = sv.KeyPoints(xy=xy, confidence=scores)
-     detections = sv.Detections(xyxy=person_boxes_xyxy)
-
-     edge_annotator = sv.EdgeAnnotator(color=sv.Color.GREEN, thickness=1)
-     vertex_annotator = sv.VertexAnnotator(color=sv.Color.RED, radius=2)
+     detections = sv.Detections(xyxy=boxes_xyxy)
      bounding_box_annotator = sv.BoxAnnotator(color=sv.Color.WHITE, color_lookup=sv.ColorLookup.INDEX, thickness=1)

-     annotated_frame = image.copy()
-
      # annotate bounding boxes
      annotated_frame = bounding_box_annotator.annotate(scene=image.copy(), detections=detections)

-     # annotate edges and vertices
-     annotated_frame = edge_annotator.annotate(scene=annotated_frame, key_points=keypoints)
-     return vertex_annotator.annotate(scene=annotated_frame, key_points=keypoints), human_readable_results
+     return annotated_frame


  @spaces.GPU(duration=90)
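Note on the removed ViTPose path: the pose processor takes boxes in COCO (x, y, w, h) format while the detector emits Pascal VOC (x1, y1, x2, y2), which is what the deleted width/height arithmetic handled. A minimal standalone sketch of that conversion, with an illustrative helper name not taken from the app:

import numpy as np

def xyxy_to_xywh(boxes_xyxy: np.ndarray) -> np.ndarray:
    # Pascal VOC (x1, y1, x2, y2) -> COCO (x, y, w, h), mirroring the deleted lines
    boxes = boxes_xyxy.copy()
    boxes[:, 2] -= boxes[:, 0]  # width = x2 - x1
    boxes[:, 3] -= boxes[:, 1]  # height = y2 - y1
    return boxes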
 
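For reference, the new detection-only path can be exercised outside the Space roughly as follows. This is a minimal sketch under stated assumptions: loading the DAB-DETR checkpoint through the transformers Auto classes and the example image path are assumptions, while the checkpoint name and the 0.3 score threshold come from the diff.

import PIL.Image
import torch
from transformers import AutoModelForObjectDetection, AutoProcessor

checkpoint = "IDEA-Research/dab-detr-resnet-50-dc5-pat3"  # checkpoint from the diff
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForObjectDetection.from_pretrained(checkpoint)

image = PIL.Image.open("example.jpg")  # illustrative input, not from the app
inputs = processor(images=image, return_tensors="pt")
with torch.inference_mode():
    outputs = model(**inputs)

# same post-processing call and score threshold as the app
results = processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
)
for score, label, box in zip(results[0]["scores"], results[0]["labels"], results[0]["boxes"]):
    print(model.config.id2label[label.item()], f"{score.item():.2f}", [round(v) for v in box.tolist()])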