Spaces:

cedssama
/

I3D_Sign_Language_Classification

Running

I3D_Sign_Language_Classification / app.py

shin-mashita

bugfix

17d99b0 over 2 years ago

3.32 kB

	import torch
	import cv2
	import videotransforms
	import numpy as np
	import gradio as gr
	from einops import rearrange
	from torchvision import transforms
	from pytorch_i3d import InceptionI3d


	def preprocess(vidpath):
	cap = cv2.VideoCapture(vidpath)

	frames = []
	cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
	num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

	for _ in range(num):
	_, img = cap.read()

	if img is None:
	continue

	w, h, c = img.shape
	if w < 226 or h < 226:
	d = 226. - min(w, h)
	sc = 1 + d / min(w, h)
	img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc)
	img = (img / 255.) * 2 - 1

	frames.append(img)

	# frames = torch.cuda.FloatTensor(np.asarray(frames, dtype=np.float32)) if torch.cuda.is_available() else torch.Tensor(np.asarray(frames, dtype=np.float32))
	frames = torch.Tensor(np.asarray(frames, dtype=np.float32))

	transform = transforms.Compose([videotransforms.CenterCrop(224)])
	frames = transform(frames)
	frames = rearrange(frames, 't h w c-> 1 c t h w')

	return frames

	def classify(video,dataset='WLASL100'):
	to_load = {
	'WLASL100':{'logits':100,'path':'weights/asl100/FINAL_nslt_100_iters=896_top1=65.89_top5=84.11_top10=89.92.pt'},
	'WLASL2000':{'logits':2000,'path':'weights/asl2000/FINAL_nslt_2000_iters=5104_top1=32.48_top5=57.31_top10=66.31.pt'}
	}

	input = preprocess(video)

	model = InceptionI3d()
	model.load_state_dict(torch.load('weights/rgb_imagenet.pt',map_location=torch.device('cpu')))
	model.replace_logits(to_load[dataset]['logits'])
	model.load_state_dict(torch.load(to_load[dataset]['path'],map_location=torch.device('cpu')))

	# device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
	# model.to(device)
	model.cpu()
	model.eval()

	with torch.no_grad():
	per_frame_logits = model(input)

	per_frame_logits.cpu()
	model.cpu()

	predictions = rearrange(per_frame_logits,'1 j k -> j k')
	predictions = torch.mean(predictions, dim = 1)

	top = torch.argmax(predictions).item()
	_, index = torch.topk(predictions,10)
	index = index.cpu().numpy()

	with open('wlasl_class_list.txt') as f:
	idx2label = dict()
	for line in f:
	idx2label[int(line.split()[0])]=line.split()[1]

	predictions = torch.nn.functional.softmax(predictions, dim=0).cpu().numpy()

	return {idx2label[i]:float(predictions[i]) for i in index}

	title = "I3D Sign Language Recognition"
	description = "Description here"
	examples = [
	['videos/no.mp4','WLASL100'],
	['videos/all.mp4','WLASL100'],
	['videos/before.mp4','WLASL100'],
	['videos/blue.mp4','WLASL2000'],
	['videos/white.mp4','WLASL2000'],
	['videos/accident2.mp4','WLASL2000']
	]


	gr.Interface( fn=classify,
	inputs=[gr.inputs.Video(label="VIDEO"),gr.inputs.Dropdown(choices=['WLASL100','WLASL2000'], default='WLASL100', label='DATASET USED')],
	outputs=[gr.outputs.Label(num_top_classes=5, label='Top 5 Predictions')],
	allow_flagging="never",
	title=title,
	description=description,
	examples=examples).launch()