import gradio as gr
import spaces
import os
import uuid
import subprocess
import torch
print("check torch and cuda version, they must be 2.4.0 + 12.1:")
print(torch.__version__)
print(torch.version.cuda)
# download model
print("Downloading model weights")
os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/external/metric_depth_vit_large_800k.pth -P ./thirdparty/Metric3D/weights/')
os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/external/droid.pth -P ./weights/external/')
os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/external/detector.pt -P ./weights/external/')
os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/hawor/checkpoints/hawor.ckpt -P ./weights/hawor/checkpoints/')
os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/hawor/checkpoints/infiller.pt -P ./weights/hawor/checkpoints/')
os.system('wget -q https://huggingface.co/ThunderVVV/HaWoR/resolve/main/hawor/model_config.yaml -P ./weights/hawor/')
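# Install the full CUDA 12.1 toolkit (matching the torch/CUDA versions checked above)
# so that nvcc is available for compiling the CUDA extensions built below
# (DROID-SLAM, lietorch, pytorch3d).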
def install_cuda_toolkit():
    CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run"
    CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)
    subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
    subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
    subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])

    os.environ["CUDA_HOME"] = "/usr/local/cuda"
    os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"])
    os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (
        os.environ["CUDA_HOME"],
        "" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"],
    )
    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
print("Compling other packages")
install_cuda_toolkit()
os.system('pip install ./thirdparty/DROID-SLAM')
os.system('pip install ./thirdparty/DROID-SLAM/thirdparty/lietorch')
os.environ["FORCE_CUDA"] = "1"
os.system('pip install git+https://github.com/facebookresearch/pytorch3d.git@stable')
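# The imports below are deferred until after the build step because they pull in
# the packages compiled above (e.g. DROID-SLAM for the SLAM stage and pytorch3d
# for rendering).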
import numpy as np
import joblib
import cv2
import imageio
from easydict import EasyDict
from scripts.scripts_test_video.detect_track_video import detect_track_video
from scripts.scripts_test_video.hawor_video import hawor_motion_estimation, hawor_infiller
from scripts.scripts_test_video.hawor_slam import hawor_slam
from hawor.utils.process import get_mano_faces, run_mano, run_mano_left
from lib.eval_utils.custom_utils import load_slam_cam
from lib.vis.run_vis2 import lookat_matrix, run_vis2_on_video, run_vis2_on_video_cam
from lib.vis.renderer_world import Renderer
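# Full pipeline: detect and track hands in the video, run HaWoR motion estimation
# per frame chunk, recover a scaled camera trajectory with SLAM, infill missing
# hand motion, then render both hands in world space.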
# @spaces.GPU(duration=200)
def render_reconstruction(input_video, img_focal):
    args = EasyDict()
    args.video_path = input_video
    args.input_type = 'file'
    args.checkpoint = './weights/hawor/checkpoints/hawor.ckpt'
    args.infiller_weight = './weights/hawor/checkpoints/infiller.pt'
    args.vis_mode = 'world'
    args.img_focal = img_focal

    start_idx, end_idx, seq_folder, imgfiles = detect_track_video(args)

    if os.path.exists(f'{seq_folder}/tracks_{start_idx}_{end_idx}/frame_chunks_all.npy'):
        print("skip hawor motion estimation")
        frame_chunks_all = joblib.load(f'{seq_folder}/tracks_{start_idx}_{end_idx}/frame_chunks_all.npy')
        img_focal = args.img_focal
    else:
        frame_chunks_all, img_focal = hawor_motion_estimation(args, start_idx, end_idx, seq_folder)

    slam_path = os.path.join(seq_folder, f"SLAM/hawor_slam_w_scale_{start_idx}_{end_idx}.npz")
    if not os.path.exists(slam_path):
        hawor_slam(args, start_idx, end_idx)
    R_w2c_sla_all, t_w2c_sla_all, R_c2w_sla_all, t_c2w_sla_all = load_slam_cam(slam_path)

    out_path = infiller_and_vis(args, start_idx, end_idx, frame_chunks_all, R_w2c_sla_all, t_w2c_sla_all, R_c2w_sla_all, t_c2w_sla_all, seq_folder, imgfiles)
    return out_path
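# GPU stage: on ZeroGPU Spaces, @spaces.GPU requests a GPU for the decorated call
# (here up to 80 seconds), covering motion infilling and rendering.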
@spaces.GPU(duration=80)
def infiller_and_vis(args, start_idx, end_idx, frame_chunks_all, R_w2c_sla_all, t_w2c_sla_all, R_c2w_sla_all, t_c2w_sla_all, seq_folder, imgfiles):
    pred_trans, pred_rot, pred_hand_pose, pred_betas, pred_valid = hawor_infiller(args, start_idx, end_idx, frame_chunks_all)

    # vis sequence for this video
    hand2idx = {
        "right": 1,
        "left": 0
    }
    vis_start = 0
    vis_end = pred_trans.shape[1] - 1

    # get faces
    faces = get_mano_faces()
    faces_new = np.array([[92, 38, 234],
                          [234, 38, 239],
                          [38, 122, 239],
                          [239, 122, 279],
                          [122, 118, 279],
                          [279, 118, 215],
                          [118, 117, 215],
                          [215, 117, 214],
                          [117, 119, 214],
                          [214, 119, 121],
                          [119, 120, 121],
                          [121, 120, 78],
                          [120, 108, 78],
                          [78, 108, 79]])
    faces_right = np.concatenate([faces, faces_new], axis=0)

    # get right hand vertices
    hand = 'right'
    hand_idx = hand2idx[hand]
    pred_glob_r = run_mano(pred_trans[hand_idx:hand_idx+1, vis_start:vis_end], pred_rot[hand_idx:hand_idx+1, vis_start:vis_end], pred_hand_pose[hand_idx:hand_idx+1, vis_start:vis_end], betas=pred_betas[hand_idx:hand_idx+1, vis_start:vis_end])
    right_verts = pred_glob_r['vertices'][0]
    right_dict = {
        'vertices': right_verts.unsqueeze(0),
        'faces': faces_right,
    }

    # get left hand vertices
    faces_left = faces_right[:, [0, 2, 1]]
    hand = 'left'
    hand_idx = hand2idx[hand]
    pred_glob_l = run_mano_left(pred_trans[hand_idx:hand_idx+1, vis_start:vis_end], pred_rot[hand_idx:hand_idx+1, vis_start:vis_end], pred_hand_pose[hand_idx:hand_idx+1, vis_start:vis_end], betas=pred_betas[hand_idx:hand_idx+1, vis_start:vis_end])
    left_verts = pred_glob_l['vertices'][0]
    left_dict = {
        'vertices': left_verts.unsqueeze(0),
        'faces': faces_left,
    }
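    # R_x is a 180-degree rotation about the x-axis (it negates y and z); applying it
    # to the SLAM camera poses and hand vertices flips the world frame, presumably to
    # match the up-axis convention of the world-space renderer used below.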
    R_x = torch.tensor([[1, 0, 0],
                        [0, -1, 0],
                        [0, 0, -1]]).float()
    R_c2w_sla_all = torch.einsum('ij,njk->nik', R_x, R_c2w_sla_all)
    t_c2w_sla_all = torch.einsum('ij,nj->ni', R_x, t_c2w_sla_all)
    R_w2c_sla_all = R_c2w_sla_all.transpose(-1, -2)
    t_w2c_sla_all = -torch.einsum("bij,bj->bi", R_w2c_sla_all, t_c2w_sla_all)
    left_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, left_dict['vertices'].cpu())
    right_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, right_dict['vertices'].cpu())

    # simple visualization
    bin_size = 128
    max_faces_per_bin = 20000
    img = cv2.imread(imgfiles[0])
    renderer = Renderer(img.shape[1], img.shape[0], 1800, 'cuda',
                        bin_size=bin_size, max_faces_per_bin=max_faces_per_bin)

    output_pth = os.path.join(seq_folder, f"vis_{vis_start}_{vis_end}")
    if not os.path.exists(output_pth):
        os.makedirs(output_pth)
    image_names = imgfiles[vis_start:vis_end]
    print(f"vis {vis_start} to {vis_end}")
    # vis_video_path = run_vis2_on_video(left_dict, right_dict, output_pth, img_focal, image_names, R_c2w=R_c2w_sla_all[vis_start:vis_end], t_c2w=t_c2w_sla_all[vis_start:vis_end], interactive=False)
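    # Instead of rendering from the per-frame camera (the run_vis2_on_video call
    # commented out above), all frames are rendered from a single fixed side-view
    # camera constructed with lookat_matrix.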
    faces_left = torch.from_numpy(faces_left).cuda()
    faces_right = torch.from_numpy(faces_right).cuda()
    faces_all = torch.stack((faces_left, faces_right))
    side_source = torch.tensor([0.463, -0.478, 2.456])
    side_target = torch.tensor([0.026, -0.481, -3.184])
    up = torch.tensor([1.0, 0.0, 0.0])
    view_camera = lookat_matrix(side_source, side_target, up)
    cam_R = view_camera[:3, :3].unsqueeze(0).cuda()
    cam_T = view_camera[:3, 3].unsqueeze(0).cuda()

    vis_video_imgs = []
    out_path = f'{seq_folder}/vis_output_{str(uuid.uuid4())}.mp4'
    writer = imageio.get_writer(out_path, fps=30, mode='I',
                                format='FFMPEG', macro_block_size=1)
    renderer.set_ground(100, 0, 0)
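    # Render the two hand meshes frame by frame from the fixed camera and append
    # each rendered frame to the output MP4.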
    for img_i, _ in enumerate(image_names):
        vertices_left = left_dict['vertices'][:, img_i]
        vertices_right = right_dict['vertices'][:, img_i]
        cameras, lights = renderer.create_camera_from_cv(cam_R, cam_T)
        verts_color = torch.tensor([0.207, 0.596, 0.792, 1.0]).unsqueeze(0).repeat(2, 1)
        vertices_i = torch.stack((vertices_left, vertices_right))
        rend, _ = renderer.render_multiple(vertices_i.cuda(), faces_all.cuda(), verts_color.cuda(), cameras, lights)
        writer.append_data(rend)
    writer.close()

    print("finish")
    return out_path
header = ('''
<div class="embed_hidden" style="text-align: center;">
<h1> <b>HaWoR</b>: World-Space Hand Motion Reconstruction from Egocentric Videos</h1>
<h3>
<a href="" target="_blank" rel="noopener noreferrer">Jinglei Zhang</a><sup>1</sup>,
<a href="https://jiankangdeng.github.io/" target="_blank" rel="noopener noreferrer">Jiankang Deng</a><sup>2</sup>,
<br>
<a href="https://scholar.google.com/citations?user=syoPhv8AAAAJ&hl=en" target="_blank" rel="noopener noreferrer">Chao Ma</a><sup>1</sup>,
<a href="https://rolpotamias.github.io" target="_blank" rel="noopener noreferrer">Rolandos Alexandros Potamias</a><sup>2</sup>
</h3>
<h3>
<sup>1</sup>Shanghai Jiao Tong University;
<sup>2</sup>Imperial College London
</h3>
</div>
<div style="display:flex; gap: 0.3rem; justify-content: center; align-items: center;" align="center">
<a href='https://arxiv.org/abs/2501.02973'><img src='https://img.shields.io/badge/Arxiv-2501.02973-A42C25?style=flat&logo=arXiv&logoColor=A42C25'></a>
<a href='https://arxiv.org/pdf/2501.02973'><img src='https://img.shields.io/badge/Paper-PDF-yellow?style=flat&logo=arXiv&logoColor=yellow'></a>
<a href='https://hawor-project.github.io/'><img src='https://img.shields.io/badge/Project-Page-%23df5b46?style=flat&logo=Google%20chrome&logoColor=%23df5b46'></a>
<a href='https://github.com/ThunderVVV/HaWoR'><img src='https://img.shields.io/badge/GitHub-Code-black?style=flat&logo=github&logoColor=white'></a>
<a href='https://huggingface.co/spaces/ThunderVVV/HaWoR'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-green'></a>
</div>
''')
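# Gradio UI: a video upload and focal-length input on the left, the rendered
# world-space reconstruction on the right, plus a few example clips.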
with gr.Blocks(title="HaWoR: World-Space Hand Motion Reconstruction from Egocentric Videos", css=".gradio-container") as demo:

    gr.Markdown(header)

    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input video", sources=["upload"])
            img_focal = gr.Number(label="Focal Length", value=600)
            # threshold = gr.Slider(value=0.3, minimum=0.05, maximum=0.95, step=0.05, label='Detection Confidence Threshold')
            # nms = gr.Slider(value=0.5, minimum=0.05, maximum=0.95, step=0.05, label='IoU NMS Threshold')
            submit = gr.Button("Submit", variant="primary")
        with gr.Column():
            reconstruction = gr.Video(label="Reconstruction", show_download_button=True)
            # hands_detected = gr.Textbox(label="Hands Detected")

    submit.click(fn=render_reconstruction, inputs=[input_video, img_focal], outputs=[reconstruction])

    with gr.Row():
        example_images = gr.Examples([
            ['./example/video_0.mp4'],
            ['./example/segment_037.mp4'],
            ['./example/segment_018.mp4']
        ],
        inputs=input_video)

demo.launch(debug=True)