|
import torch |
|
import torchvision.transforms as T |
|
from PIL import Image |
|
from pathlib import Path |
|
from timm.models.vision_transformer import VisionTransformer |
|
from functools import partial |
|
from torch import nn |
|
|
|
|
|
def process_single_image(
    image_path,
    input_size: int = 224,
    dataset_mean=(0.3464, 0.2280, 0.2228),
    dataset_std=(0.2520, 0.2128, 0.2093),
) -> torch.Tensor:
    """Load one image from disk and turn it into a normalized tensor.

    The image is opened, converted to RGB, resized to exactly
    ``input_size`` x ``input_size`` (the aspect ratio is not preserved),
    converted with ``ToTensor`` and normalized per channel.

    Args:
        image_path: Path of the image file; passed to ``PIL.Image.open``.
        input_size: Side length in pixels of the square resized image.
        dataset_mean: Per-channel means handed to ``Normalize``.
        dataset_std: Per-channel standard deviations handed to ``Normalize``.

    Returns:
        The transformed image tensor (three channels,
        ``input_size`` x ``input_size``).
    """
    # The defaults are tuples rather than lists so that the shared default
    # objects cannot be mutated between calls.
    transform = T.Compose([
        T.Resize((input_size, input_size)),
        T.ToTensor(),
        T.Normalize(mean=dataset_mean, std=dataset_std),
    ])

    # The context manager releases the underlying file handle promptly;
    # convert() returns an independent in-memory copy, so it remains valid
    # after the file is closed.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    return transform(image)
|
|
|
|
|
# Collect the demo images in a deterministic (sorted) order and batch them.
image_paths = sorted(Path('demo_images').glob('*.png'))
if not image_paths:
    # torch.stack on an empty list fails with an opaque error; report the
    # actual problem instead (same exception type as before).
    raise RuntimeError("No '*.png' images found in 'demo_images'")
images = torch.stack([process_single_image(image_path) for image_path in image_paths])

# Run in half precision on the GPU when one is available; otherwise fall
# back to float32 on the CPU instead of crashing on a missing CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints from a trusted source.
# map_location='cpu' makes loading independent of the device the checkpoint
# was saved from; load_state_dict copies the values onto the model's device.
model_weights = torch.load('endovit_seg.pth', map_location='cpu')['model']

model = VisionTransformer(
    patch_size=16,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    qkv_bias=True,
    norm_layer=partial(nn.LayerNorm, eps=1e-6),
).to(device, dtype).eval()

# strict=False tolerates key mismatches between checkpoint and model; the
# printed result lists the missing and unexpected keys so they can be checked.
loading = model.load_state_dict(model_weights, strict=False)
print(loading)

# Inference only: disable autograd so no graph is recorded for the forward pass.
with torch.no_grad():
    output = model.forward_features(images.to(device, dtype))
print(output.shape)
|
|