Extend to video

#1
by Jaykumaran17 - opened

First, clone the Space:

```bash
!git clone https://huggingface.co/spaces/fffiloni/Gaze-LLE
```

Then install its dependencies:

```bash
cd Gaze-LLE
pip install -r requirements.txt
```
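
The video pipeline below also relies on OpenCV and the retina-face package for per-frame face detection. If those are not already pulled in by the Space's `requirements.txt`, they may need to be installed separately (package names assumed, not verified against the Space):

```bash
pip install retina-face opencv-python
```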

Then replace the contents of `app.py` with the script below. It loads Gaze-LLE from torch.hub, runs RetinaFace face detection on every frame, draws the predicted gaze target and in-frame score for each detected face, and wraps the whole pipeline in a small Gradio UI:


```python
import gradio as gr
import torch
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont
import cv2
import numpy as np
from retinaface import RetinaFace
import os

# Load the Gaze-LLE model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, transform = torch.hub.load("fkryan/gazelle", "gazelle_dinov2_vitl14_inout")
model.eval()
model.to(device)

def visualize_heatmap(pil_image, heatmap, bbox=None, inout_score=None):
    if isinstance(heatmap, torch.Tensor):
        heatmap = heatmap.detach().cpu().numpy()
    heatmap = Image.fromarray((heatmap * 255).astype(np.uint8)).resize(pil_image.size, Image.Resampling.BILINEAR)
    heatmap = plt.cm.jet(np.array(heatmap) / 255.)
    heatmap = (heatmap[:, :, :3] * 255).astype(np.uint8)
    heatmap = Image.fromarray(heatmap).convert("RGBA")
    heatmap.putalpha(90)
    overlay_image = Image.alpha_composite(pil_image.convert("RGBA"), heatmap)

    if bbox is not None:
        width, height = pil_image.size
        xmin, ymin, xmax, ymax = bbox
        draw = ImageDraw.Draw(overlay_image)
        draw.rectangle([xmin * width, ymin * height, xmax * width, ymax * height], outline="lime", width=int(min(width, height) * 0.01))

        if inout_score is not None:
            text = f"in-frame: {inout_score:.2f}"
            text_x = xmin * width
            text_y = ymax * height + int(height * 0.01)
            draw.text((text_x, text_y), text, fill="lime", font=ImageFont.load_default())
    return overlay_image

def visualize_all(pil_image, heatmaps, bboxes, inout_scores, inout_thresh=0.5):
    colors = ['lime', 'tomato', 'cyan', 'fuchsia', 'yellow']
    overlay_image = pil_image.convert("RGBA")
    draw = ImageDraw.Draw(overlay_image)
    width, height = pil_image.size

    for i in range(len(bboxes)):
        bbox = bboxes[i]
        xmin, ymin, xmax, ymax = bbox
        color = colors[i % len(colors)]
        draw.rectangle([xmin * width, ymin * height, xmax * width, ymax * height], outline=color, width=int(min(width, height) * 0.01))

        if inout_scores is not None:
            inout_score = inout_scores[i]
            text = f"in-frame: {inout_score:.2f}"
            text_x = xmin * width
            text_y = ymax * height + int(height * 0.01)
            draw.text((text_x, text_y), text, fill=color, font=ImageFont.load_default())

        if inout_scores is not None and inout_score > inout_thresh:
            # Gaze target = location of the heatmap maximum, rescaled to pixel coordinates
            heatmap = heatmaps[i]
            heatmap_np = heatmap.detach().cpu().numpy()
            max_index = np.unravel_index(np.argmax(heatmap_np), heatmap_np.shape)
            gaze_target_x = max_index[1] / heatmap_np.shape[1] * width
            gaze_target_y = max_index[0] / heatmap_np.shape[0] * height
            bbox_center_x = ((xmin + xmax) / 2) * width
            bbox_center_y = ((ymin + ymax) / 2) * height

            draw.ellipse([(gaze_target_x - 5, gaze_target_y - 5), (gaze_target_x + 5, gaze_target_y + 5)], fill=color)
            draw.line([(bbox_center_x, bbox_center_y), (gaze_target_x, gaze_target_y)], fill=color, width=int(0.005 * min(width, height)))

    return overlay_image

def process_frame(frame, model, transform, width, height):
    # Detect faces in the frame with RetinaFace
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    resp = RetinaFace.detect_faces(np.array(image))
    bboxes = [resp[key]["facial_area"] for key in resp.keys()] if isinstance(resp, dict) else []

    if not bboxes:
        # No faces detected: return the frame unchanged, converted to RGBA to match the annotated path
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA)

    img_tensor = transform(image).unsqueeze(0).to(device)
    norm_bboxes = [[np.array(bbox) / np.array([width, height, width, height]) for bbox in bboxes]]

    with torch.no_grad():
        output = model({"images": img_tensor, "bboxes": norm_bboxes})

    heatmaps = output["heatmap"][0] if "heatmap" in output else []
    inout_scores = output["inout"][0] if "inout" in output else None

    overlay_image = visualize_all(image, heatmaps, norm_bboxes[0], inout_scores, inout_thresh=0.5)
    return np.array(overlay_image)

def process_video(video_path, model, transform, output_video_path="output.mp4"):
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")

    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    current_frame = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        current_frame += 1
        print(f"Processing frame {current_frame}/{total_frames}")

        processed_frame = process_frame(frame, model, transform, width, height)
        out.write(cv2.cvtColor(processed_frame, cv2.COLOR_RGBA2BGR))

    cap.release()
    out.release()
    print("Video processing complete.")
    return output_video_path

with gr.Blocks() as demo:
    gr.Markdown("# Gaze-LLE for Video: Gaze Target Estimation in Motion")

    with gr.Row():
        video_input = gr.Video(label="Input Video")
        video_output = gr.Video(label="Output Video")

    submit_button = gr.Button("Process Video")

    def main(video_path):
        output_video_path = "gaze_2_processed_video.mp4"
        return process_video(video_path, model, transform, output_video_path=output_video_path)

    submit_button.click(main, inputs=[video_input], outputs=[video_output])

demo.launch()
```
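
To try it, run `python app.py` and open the local Gradio URL; the annotated clip is written to `gaze_2_processed_video.mp4`. The Gradio UI can also be skipped entirely: comment out `demo.launch()` and call `process_video` directly. A minimal sketch, where `input.mp4` is a placeholder path:

```python
# Run the gaze pipeline on a local file without launching the Gradio UI.
# "input.mp4" is a placeholder; point it at any video on disk.
result_path = process_video("input.mp4", model, transform, output_video_path="input_gaze.mp4")
print(f"Annotated video written to {result_path}")
```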
