Spaces:

ThunderVVV
/

HaWoR

Running

App Files Files Community

HaWoR / app.py

ThunderVVV

update

20771ca about 1 month ago

raw

history blame contribute delete

11.4 kB

	import gradio as gr
	import spaces
	import os
	import uuid
	import subprocess
	import torch

	print("check torch and cuda version, they must be 2.4.0 + 12.1:")
	print(torch.__version__)
	print(torch.version.cuda)

	# download model
	print("Downloading model weights")
	os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/external/metric_depth_vit_large_800k.pth -P ./thirdparty/Metric3D/weights/')
	os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/external/droid.pth -P ./weights/external/')
	os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/external/detector.pt -P ./weights/external/')
	os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/hawor/checkpoints/hawor.ckpt -P ./weights/hawor/checkpoints/')
	os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/hawor/checkpoints/infiller.pt -P ./weights/hawor/checkpoints/')
	os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/hawor/model_config.yaml -P ./weights/hawor/')


	def install_cuda_toolkit():
	CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run"
	CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)
	subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
	subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
	subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])

	os.environ["CUDA_HOME"] = "/usr/local/cuda"
	os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"])
	os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (
	os.environ["CUDA_HOME"],
	"" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"],
	)
	os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"


	print("Compling other packages")
	install_cuda_toolkit()
	os.system('pip install ./thirdparty/DROID-SLAM')
	os.system('pip install ./thirdparty/DROID-SLAM/thirdparty/lietorch')
	os.environ["FORCE_CUDA"] = "1"
	os.system('pip install git+https://github.com/facebookresearch/pytorch3d.git@stable')

	import numpy as np
	import joblib
	import cv2
	import imageio
	from easydict import EasyDict
	from scripts.scripts_test_video.detect_track_video import detect_track_video
	from scripts.scripts_test_video.hawor_video import hawor_motion_estimation, hawor_infiller
	from scripts.scripts_test_video.hawor_slam import hawor_slam
	from hawor.utils.process import get_mano_faces, run_mano, run_mano_left
	from lib.eval_utils.custom_utils import load_slam_cam
	from lib.vis.run_vis2 import lookat_matrix, run_vis2_on_video, run_vis2_on_video_cam
	from lib.vis.renderer_world import Renderer

	# @spaces.GPU(duration=200)
	def render_reconstruction(input_video, img_focal):
	args = EasyDict()
	args.video_path = input_video
	args.input_type = 'file'
	args.checkpoint = './weights/hawor/checkpoints/hawor.ckpt'
	args.infiller_weight = './weights/hawor/checkpoints/infiller.pt'
	args.vis_mode = 'world'
	args.img_focal = img_focal

	start_idx, end_idx, seq_folder, imgfiles = detect_track_video(args)

	if os.path.exists(f'{seq_folder}/tracks_{start_idx}_{end_idx}/frame_chunks_all.npy'):
	print("skip hawor motion estimation")
	frame_chunks_all = joblib.load(f'{seq_folder}/tracks_{start_idx}_{end_idx}/frame_chunks_all.npy')
	img_focal = args.img_focal
	else:
	frame_chunks_all, img_focal = hawor_motion_estimation(args, start_idx, end_idx, seq_folder)

	slam_path = os.path.join(seq_folder, f"SLAM/hawor_slam_w_scale_{start_idx}_{end_idx}.npz")
	if not os.path.exists(slam_path):
	hawor_slam(args, start_idx, end_idx)
	R_w2c_sla_all, t_w2c_sla_all, R_c2w_sla_all, t_c2w_sla_all = load_slam_cam(slam_path)

	out_path = infiller_and_vis(args, start_idx, end_idx, frame_chunks_all, R_w2c_sla_all, t_w2c_sla_all, R_c2w_sla_all, t_c2w_sla_all, seq_folder, imgfiles)
	return out_path

	@spaces.GPU(duration=80)
	def infiller_and_vis(args, start_idx, end_idx, frame_chunks_all, R_w2c_sla_all, t_w2c_sla_all, R_c2w_sla_all, t_c2w_sla_all, seq_folder, imgfiles):
	pred_trans, pred_rot, pred_hand_pose, pred_betas, pred_valid = hawor_infiller(args, start_idx, end_idx, frame_chunks_all)

	# vis sequence for this video
	hand2idx = {
	"right": 1,
	"left": 0
	}
	vis_start = 0
	vis_end = pred_trans.shape[1] - 1

	# get faces
	faces = get_mano_faces()
	faces_new = np.array([[92, 38, 234],
	[234, 38, 239],
	[38, 122, 239],
	[239, 122, 279],
	[122, 118, 279],
	[279, 118, 215],
	[118, 117, 215],
	[215, 117, 214],
	[117, 119, 214],
	[214, 119, 121],
	[119, 120, 121],
	[121, 120, 78],
	[120, 108, 78],
	[78, 108, 79]])
	faces_right = np.concatenate([faces, faces_new], axis=0)

	# get right hand vertices
	hand = 'right'
	hand_idx = hand2idx[hand]
	pred_glob_r = run_mano(pred_trans[hand_idx:hand_idx+1, vis_start:vis_end], pred_rot[hand_idx:hand_idx+1, vis_start:vis_end], pred_hand_pose[hand_idx:hand_idx+1, vis_start:vis_end], betas=pred_betas[hand_idx:hand_idx+1, vis_start:vis_end])
	right_verts = pred_glob_r['vertices'][0]
	right_dict = {
	'vertices': right_verts.unsqueeze(0),
	'faces': faces_right,
	}

	# get left hand vertices
	faces_left = faces_right[:,[0,2,1]]
	hand = 'left'
	hand_idx = hand2idx[hand]
	pred_glob_l = run_mano_left(pred_trans[hand_idx:hand_idx+1, vis_start:vis_end], pred_rot[hand_idx:hand_idx+1, vis_start:vis_end], pred_hand_pose[hand_idx:hand_idx+1, vis_start:vis_end], betas=pred_betas[hand_idx:hand_idx+1, vis_start:vis_end])
	left_verts = pred_glob_l['vertices'][0]
	left_dict = {
	'vertices': left_verts.unsqueeze(0),
	'faces': faces_left,
	}

	R_x = torch.tensor([[1, 0, 0],
	[0, -1, 0],
	[0, 0, -1]]).float()
	R_c2w_sla_all = torch.einsum('ij,njk->nik', R_x, R_c2w_sla_all)
	t_c2w_sla_all = torch.einsum('ij,nj->ni', R_x, t_c2w_sla_all)
	R_w2c_sla_all = R_c2w_sla_all.transpose(-1, -2)
	t_w2c_sla_all = -torch.einsum("bij,bj->bi", R_w2c_sla_all, t_c2w_sla_all)
	left_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, left_dict['vertices'].cpu())
	right_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, right_dict['vertices'].cpu())

	# simple visualization
	bin_size = 128
	max_faces_per_bin = 20000
	img = cv2.imread(imgfiles[0])
	renderer = Renderer(img.shape[1], img.shape[0], 1800, 'cuda',
	bin_size=bin_size, max_faces_per_bin=max_faces_per_bin)

	output_pth = os.path.join(seq_folder, f"vis_{vis_start}_{vis_end}")
	if not os.path.exists(output_pth):
	os.makedirs(output_pth)
	image_names = imgfiles[vis_start:vis_end]
	print(f"vis {vis_start} to {vis_end}")
	# vis_video_path = run_vis2_on_video(left_dict, right_dict, output_pth, img_focal, image_names, R_c2w=R_c2w_sla_all[vis_start:vis_end], t_c2w=t_c2w_sla_all[vis_start:vis_end], interactive=False)
	faces_left = torch.from_numpy(faces_left).cuda()
	faces_right = torch.from_numpy(faces_right).cuda()
	faces_all = torch.stack((faces_left, faces_right))

	side_source = torch.tensor([0.463, -0.478, 2.456])
	side_target = torch.tensor([0.026, -0.481, -3.184])
	up = torch.tensor([1.0, 0.0, 0.0])
	view_camera = lookat_matrix(side_source, side_target, up)
	cam_R = view_camera[:3, :3].unsqueeze(0).cuda()
	cam_T = view_camera[:3, 3].unsqueeze(0).cuda()
	vis_video_imgs = []
	out_path = f'{seq_folder}/vis_output_{str(uuid.uuid4())}.mp4'
	writer = imageio.get_writer(out_path, fps=30, mode='I',
	format='FFMPEG', macro_block_size=1)
	renderer.set_ground(100, 0, 0)
	for img_i, _ in enumerate(image_names):

	vertices_left = left_dict['vertices'][:, img_i]
	vertices_right = right_dict['vertices'][:, img_i]

	cameras, lights = renderer.create_camera_from_cv(cam_R, cam_T)
	verts_color = torch.tensor([0.207, 0.596, 0.792, 1.0]).unsqueeze(0).repeat(2, 1)
	vertices_i = torch.stack((vertices_left, vertices_right))
	rend, _ = renderer.render_multiple(vertices_i.cuda(), faces_all.cuda(), verts_color.cuda(), cameras, lights)

	writer.append_data(rend)

	writer.close()
	print("finish")

	return out_path



	header = ('''
	<div class="embed_hidden" style="text-align: center;">
	<h1> <b>HaWoR</b>: World-Space Hand Motion Reconstruction from Egocentric Videos</h1>
	<h3>
	<a href="" target="_blank" rel="noopener noreferrer">Jinglei Zhang</a><sup>1</sup>,
	<a href="https://jiankangdeng.github.io/" target="_blank" rel="noopener noreferrer">Jiankang Deng</a><sup>2</sup>,
	<br>
	<a href="https://scholar.google.com/citations?user=syoPhv8AAAAJ&hl=en" target="_blank" rel="noopener noreferrer">Chao Ma</a><sup>1</sup>,
	<a href="https://rolpotamias.github.io" target="_blank" rel="noopener noreferrer">Rolandos Alexandros Potamias</a><sup>2</sup>
	</h3>
	<h3>
	<sup>1</sup>Shanghai Jiao Tong University;
	<sup>2</sup>Imperial College London
	</h3>
	</div>
	<div style="display:flex; gap: 0.3rem; justify-content: center; align-items: center;" align="center">
	<a href='https://arxiv.org/abs/2501.02973'><img src='https://img.shields.io/badge/Arxiv-2501.02973-A42C25?style=flat&logo=arXiv&logoColor=A42C25'></a>
	<a href='https://arxiv.org/pdf/2501.02973'><img src='https://img.shields.io/badge/Paper-PDF-yellow?style=flat&logo=arXiv&logoColor=yellow'></a>
	<a href='https://hawor-project.github.io/'><img src='https://img.shields.io/badge/Project-Page-%23df5b46?style=flat&logo=Google%20chrome&logoColor=%23df5b46'></a>
	<a href='https://github.com/ThunderVVV/HaWoR'><img src='https://img.shields.io/badge/GitHub-Code-black?style=flat&logo=github&logoColor=white'></a>
	<a href='https://huggingface.co/spaces/ThunderVVV/HaWoR'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-green'></a>
	''')


	with gr.Blocks(title="HaWoR: World-Space Hand Motion Reconstruction from Egocentric Videos", css=".gradio-container") as demo:

	gr.Markdown(header)

	with gr.Row():
	with gr.Column():
	input_video = gr.Video(label="Input video", sources=["upload"])
	img_focal = gr.Number(label="Focal Length", value=600)
	# threshold = gr.Slider(value=0.3, minimum=0.05, maximum=0.95, step=0.05, label='Detection Confidence Threshold')
	#nms = gr.Slider(value=0.5, minimum=0.05, maximum=0.95, step=0.05, label='IoU NMS Threshold')
	submit = gr.Button("Submit", variant="primary")


	with gr.Column():
	reconstruction = gr.Video(label="Reconstruction",show_download_button=True)
	# hands_detected = gr.Textbox(label="Hands Detected")

	submit.click(fn=render_reconstruction, inputs=[input_video, img_focal], outputs=[reconstruction])

	with gr.Row():

	example_images = gr.Examples([
	['./example/video_0.mp4'],
	['./example/segment_037.mp4'],
	['./example/segment_018.mp4']
	],
	inputs=input_video)

	demo.launch(debug=True)