mshukor committed
Commit ce7469b
1 Parent(s): 902be23
Files changed (1)
  1. app.py +95 -26
app.py CHANGED
@@ -37,6 +37,7 @@ from ruamel.yaml import YAML
 import torch
 import gradio as gr
 
+import torchaudio
 
 yaml=YAML(typ='safe')
 
@@ -82,33 +83,33 @@ msg = model_caption.load_state_dict(state_dict,strict=False)
 
 model_caption.bfloat16()
 
-###### VQA
-config = 'configs/image/ePALM_vqa.yaml'
-config = yaml.load(open(config, 'r'))
-
-start_layer_idx = 19
-end_layer_idx = 31
-low_cpu = True
-model_vqa = ePALM(opt_model_name=text_model,
-                  vision_model_name=vision_model_name,
-                  use_vis_prefix=True,
-                  start_layer_idx=start_layer_idx,
-                  end_layer_idx=end_layer_idx,
-                  return_hidden_state_vision=True,
-                  config=config,
-                  low_cpu=low_cpu
-                  )
-print("Model Built")
-model_vqa.to(device)
+# ###### VQA
+# config = 'configs/image/ePALM_vqa.yaml'
+# config = yaml.load(open(config, 'r'))
+
+# start_layer_idx = 19
+# end_layer_idx = 31
+# low_cpu = True
+# model_vqa = ePALM(opt_model_name=text_model,
+#                   vision_model_name=vision_model_name,
+#                   use_vis_prefix=True,
+#                   start_layer_idx=start_layer_idx,
+#                   end_layer_idx=end_layer_idx,
+#                   return_hidden_state_vision=True,
+#                   config=config,
+#                   low_cpu=low_cpu
+# )
+# print("Model Built")
+# model_vqa.to(device)
 
 
 checkpoint_path = 'checkpoints/float32/ePALM_vqa/checkpoint_best.pth'
 checkpoint = torch.load(checkpoint_path, map_location='cpu')
-state_dict = checkpoint['model']
-msg = model_vqa.load_state_dict(state_dict,strict=False)
+state_dict_vqa = checkpoint['model']
+# msg = model_vqa.load_state_dict(state_dict,strict=False)
 
 
-model_vqa.bfloat16()
+# model_vqa.bfloat16()
 
 
 
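This hunk drops the second ePALM instance and keeps only the raw VQA weights around as state_dict_vqa, to be swapped into the shared captioning model at inference time. A minimal sketch of that one-model / many-state_dicts pattern, assuming one checkpoint path per task (only the VQA path appears in the diff; the dict layout and any extra task names are illustrative, not part of the commit):

# Sketch only -- not part of the commit. Keep per-task weights as plain
# state_dicts so a single shared model can be re-loaded on demand.
import torch

task_checkpoints = {
    # Path taken from the diff; other tasks would be registered the same way.
    'Visual Question Answering': 'checkpoints/float32/ePALM_vqa/checkpoint_best.pth',
}

task_state_dicts = {
    task: torch.load(path, map_location='cpu')['model']
    for task, path in task_checkpoints.items()
}

Loading with map_location='cpu' keeps the spare weights in host memory until they are actually swapped into the model.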
 
@@ -154,13 +155,80 @@ transform = transforms.Compose([
     normalize,
     ])
 
+type_transform = transforms.Lambda(lambda x: x.float().div(255.0))
+test_transform = transforms.Compose([
+    transforms.Resize((image_size,image_size),interpolation=Image.BICUBIC),
+    type_transform,
+    normalize,
+    ])
+from dataset.video_utils import VIDEO_READER_FUNCS
+video_reader = VIDEO_READER_FUNCS['decord']
+
+def read_video(path, num_frames=16):
+
+
+    frames, frame_indices, video_duration = video_reader(
+        path, num_frames, 'rand', max_num_frames=-1
+    )
+    video = test_transform(frames)
+
+    return video
 
+def read_audio(path):
+
+    melbins = 128
+    target_length = 1024
+    skip_norm = False
+    norm_mean = -4.2677393
+    norm_std = 4.5689974
 
+    waveform, sr = torchaudio.load(path)
+    waveform = waveform - waveform.mean()
 
+    # audio
+    fbank = torchaudio.compliance.kaldi.fbank(waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
+                                              window_type='hanning', num_mel_bins=melbins, dither=0.0,
+                                              frame_shift=10)
+
+    n_frames = fbank.shape[0]
 
+    p = target_length - n_frames
 
+    # cut and pad
+    if p > 0:
+        m = torch.nn.ZeroPad2d((0, 0, 0, p))
+        fbank = m(fbank)
+    elif p < 0:
+        fbank = fbank[0:target_length, :]
 
 
+
+
+    # SpecAug, not do for eval set
+
+    fbank = torch.transpose(fbank, 0, 1)
+    # this is just to satisfy new torchaudio version, which only accept [1, freq, time]
+    fbank = fbank.unsqueeze(0)
+
+
+
+    # squeeze it back, it is just a trick to satisfy new torchaudio version
+    fbank = fbank.squeeze(0)
+    fbank = torch.transpose(fbank, 0, 1)
+
+
+    # normalize the input for both training and test
+    if not skip_norm:
+        fbank = (fbank - norm_mean) / (norm_std * 2)
+    # skip normalization the input if you are trying to get the normalization stats.
+    else:
+        pass
+
+
+    audio = fbank
+
+    return audio
+
 do_sample=False
 num_beams=3
 max_length=30
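read_audio follows the familiar Kaldi-style recipe: compute a [n_frames, num_mel_bins] log-mel filterbank with torchaudio.compliance.kaldi.fbank, force it to exactly target_length frames, then normalize with fixed dataset statistics. A self-contained sketch of just the pad-or-crop step (target_length and the 128 mel bins follow the diff; the example input length is made up):

# Sketch only -- mirrors the "cut and pad" block in read_audio above.
import torch

def pad_or_crop(fbank: torch.Tensor, target_length: int = 1024) -> torch.Tensor:
    # fbank: [n_frames, num_mel_bins]
    p = target_length - fbank.shape[0]
    if p > 0:
        # ZeroPad2d padding is (left, right, top, bottom): append p rows of zeros.
        fbank = torch.nn.ZeroPad2d((0, 0, 0, p))(fbank)
    elif p < 0:
        fbank = fbank[:target_length, :]
    return fbank

x = torch.randn(700, 128)        # e.g. ~7 s of 10 ms frames, 128 mel bins
print(pad_or_crop(x).shape)      # torch.Size([1024, 128])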
@@ -188,19 +256,20 @@ def inference(image, audio, video, task_type, instruction):
     elif task_type == 'Visual Question Answering':
         question = instruction+'?'+special_answer_token
         text_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)
-        model = model_vqa.clone()
+        model_caption = model_caption.load_state_dict(state_dict_vqa,strict=False)
+        model = model_caption.clone()
     elif task_type == 'Visual Question Answering':
         question = instruction+'?'+special_answer_token
         text_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)
-        model_vqa = model_vqa.load_state_dict(state_dict_video_qa,strict=False)
-        model = model_vqa.clone()
+        model_caption = model_caption.load_state_dict(state_dict_video_qa,strict=False)
+        model = model_caption.clone()
     else:
         raise NotImplemented
 
     if "Video" in task_type:
-        pass
+        image = read_video(image)
     elif "Audio" in task_type:
-        pass
+        image = read_audio(image)
     else:
         image = transform(image)
         image = image.to(device,non_blocking=True).unsqueeze(0)
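One thing to keep in mind with the new task dispatch: in stock PyTorch, nn.Module.load_state_dict modifies the module in place and returns a (missing_keys, unexpected_keys) named tuple rather than the module, and nn.Module has no clone() method (copy.deepcopy is the usual way to duplicate one). A minimal, self-contained sketch of the in-place checkpoint swap this hunk appears to be aiming for; TinyModel is a hypothetical stand-in for ePALM and is not part of the repository:

# Sketch only: swap task-specific weights into one shared module in place.
import copy
import torch
import torch.nn as nn

class TinyModel(nn.Module):          # hypothetical stand-in for ePALM
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(4, 4)

model = TinyModel()

# Pretend these came from torch.load(checkpoint_path, map_location='cpu')['model'].
state_dict_caption = copy.deepcopy(model.state_dict())
state_dict_vqa = {k: v + 1.0 for k, v in model.state_dict().items()}

# load_state_dict mutates the module in place and returns a report of mismatched
# keys; the return value is only useful for sanity-checking the load.
msg = model.load_state_dict(state_dict_vqa, strict=False)
print(msg.missing_keys, msg.unexpected_keys)

# An independent copy, if one is really needed, comes from deepcopy.
model_for_this_request = copy.deepcopy(model)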