## Extend to video

The Space works on single images; the script below extends it to video by detecting faces with RetinaFace in every frame, running Gaze-LLE on the detected heads, and writing the annotated frames back out with OpenCV.
First, clone the Space:

```bash
git clone https://huggingface.co/spaces/fffiloni/Gaze-LLE
```

Then install the dependencies:

```bash
cd Gaze-LLE
pip install -r requirements.txt
```
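Gaze-LLE will fall back to CPU but is far slower there, so it helps to confirm PyTorch can see a GPU before launching. A minimal check (my own snippet, not part of the Space):

```python
import torch

# Quick environment check: the app picks "cuda" when available, else "cpu".
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
```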
Now create the following `app.py`:
```python
import gradio as gr
import torch
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont
import cv2
import numpy as np
from retinaface import RetinaFace
import os

# Load the Gaze-LLE model (DINOv2 ViT-L backbone with the in/out-of-frame head)
device = "cuda" if torch.cuda.is_available() else "cpu"
model, transform = torch.hub.load("fkryan/gazelle", "gazelle_dinov2_vitl14_inout")
model.eval()
model.to(device)

def visualize_heatmap(pil_image, heatmap, bbox=None, inout_score=None):
    """Overlay a single gaze heatmap (and optionally one head bbox) on an image."""
    if isinstance(heatmap, torch.Tensor):
        heatmap = heatmap.detach().cpu().numpy()
    heatmap = Image.fromarray((heatmap * 255).astype(np.uint8)).resize(pil_image.size, Image.Resampling.BILINEAR)
    heatmap = plt.cm.jet(np.array(heatmap) / 255.)
    heatmap = (heatmap[:, :, :3] * 255).astype(np.uint8)
    heatmap = Image.fromarray(heatmap).convert("RGBA")
    heatmap.putalpha(90)
    overlay_image = Image.alpha_composite(pil_image.convert("RGBA"), heatmap)

    if bbox is not None:
        # bbox coordinates are normalized to [0, 1]; scale back to pixels
        width, height = pil_image.size
        xmin, ymin, xmax, ymax = bbox
        draw = ImageDraw.Draw(overlay_image)
        draw.rectangle([xmin * width, ymin * height, xmax * width, ymax * height], outline="lime", width=int(min(width, height) * 0.01))

        if inout_score is not None:
            text = f"in-frame: {inout_score:.2f}"
            text_x = xmin * width
            text_y = ymax * height + int(height * 0.01)
            draw.text((text_x, text_y), text, fill="lime", font=ImageFont.load_default())
    return overlay_image

def visualize_all(pil_image, heatmaps, bboxes, inout_scores, inout_thresh=0.5):
    """Draw every head bbox, its in-frame score, and a line to its predicted gaze target."""
    colors = ['lime', 'tomato', 'cyan', 'fuchsia', 'yellow']
    overlay_image = pil_image.convert("RGBA")
    draw = ImageDraw.Draw(overlay_image)
    width, height = pil_image.size

    for i in range(len(bboxes)):
        xmin, ymin, xmax, ymax = bboxes[i]
        color = colors[i % len(colors)]
        draw.rectangle([xmin * width, ymin * height, xmax * width, ymax * height], outline=color, width=int(min(width, height) * 0.01))

        if inout_scores is not None:
            inout_score = inout_scores[i]
            text = f"in-frame: {inout_score:.2f}"
            text_x = xmin * width
            text_y = ymax * height + int(height * 0.01)
            draw.text((text_x, text_y), text, fill=color, font=ImageFont.load_default())

        # Only draw a gaze target when the model thinks it lies inside the frame
        if inout_scores is not None and inout_score > inout_thresh:
            heatmap = heatmaps[i]
            heatmap_np = heatmap.detach().cpu().numpy()
            # The heatmap argmax is the predicted gaze target
            max_index = np.unravel_index(np.argmax(heatmap_np), heatmap_np.shape)
            gaze_target_x = max_index[1] / heatmap_np.shape[1] * width
            gaze_target_y = max_index[0] / heatmap_np.shape[0] * height
            bbox_center_x = ((xmin + xmax) / 2) * width
            bbox_center_y = ((ymin + ymax) / 2) * height
            draw.ellipse([(gaze_target_x - 5, gaze_target_y - 5), (gaze_target_x + 5, gaze_target_y + 5)], fill=color)
            draw.line([(bbox_center_x, bbox_center_y), (gaze_target_x, gaze_target_y)], fill=color, width=int(0.005 * min(width, height)))
    return overlay_image

def process_frame(frame, model, transform, width, height):
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # RetinaFace returns a dict of detections when faces are found
    resp = RetinaFace.detect_faces(np.array(image))
    bboxes = [resp[key]["facial_area"] for key in resp] if isinstance(resp, dict) else []
    if not bboxes:
        # No faces: return the frame unchanged, as RGBA to match the annotated path
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA)

    img_tensor = transform(image).unsqueeze(0).to(device)
    # Gaze-LLE expects head bboxes normalized to [0, 1]
    norm_bboxes = [[np.array(bbox) / np.array([width, height, width, height]) for bbox in bboxes]]
    with torch.no_grad():
        output = model({"images": img_tensor, "bboxes": norm_bboxes})

    heatmaps = output["heatmap"][0] if "heatmap" in output else []
    inout_scores = output["inout"][0] if "inout" in output else None
    overlay_image = visualize_all(image, heatmaps, norm_bboxes[0], inout_scores, inout_thresh=0.5)
    return np.array(overlay_image)

def process_video(video_path, model, transform, output_video_path="output.mp4"):
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    current_frame = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        current_frame += 1
        print(f"Processing frame {current_frame}/{total_frames}")
        processed_frame = process_frame(frame, model, transform, width, height)
        out.write(cv2.cvtColor(processed_frame, cv2.COLOR_RGBA2BGR))

    cap.release()
    out.release()
    print("Video processing complete.")
    return output_video_path

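# Note: OpenCV's "mp4v" fourcc writes MPEG-4 Part 2 video, which some browsers
# (and therefore the Gradio video player) may refuse to play. If the output
# appears blank in the UI, re-encoding to H.264 afterwards (e.g. with ffmpeg)
# is a common workaround.
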
with gr.Blocks() as demo:
    gr.Markdown("# Gaze-LLE for Video: Gaze Target Estimation in Motion")
    with gr.Row():
        video_input = gr.Video(label="Input Video")
        video_output = gr.Video(label="Output Video")
    submit_button = gr.Button("Process Video")

    def main(video_path):
        output_video_path = "gaze_2_processed_video.mp4"
        return process_video(video_path, model, transform, output_video_path=output_video_path)

    submit_button.click(main, inputs=[video_input], outputs=[video_output])

demo.launch()
```
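Run the app with `python app.py` and open the local URL Gradio prints (http://127.0.0.1:7860 by default). The UI is optional: `process_video` can also be called directly from a script. A minimal sketch, assuming the model and functions above are in scope and `input.mp4` is a placeholder for your own clip:

```python
# Hypothetical standalone driver: reuses the model/transform loaded at the top
# of app.py; "input.mp4" is a placeholder for a local clip.
result_path = process_video("input.mp4", model, transform,
                            output_video_path="gaze_output.mp4")
print(f"Annotated video written to {result_path}")
```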