Hritik committed
Commit 0ba1d16
1 Parent(s): cfe5653

add app and nle code

Files changed (2):
  1. app.py +27 -20
  2. nle_inference.py +9 -83
app.py CHANGED
@@ -17,11 +17,7 @@ from utils import batchify
  
  import gradio as gr
  from entailment_inference import get_scores
-
- print(f"Is CUDA available: {torch.cuda.is_available()}")
- # True
- print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
- # Tesla T4
+ from nle_inference import VideoCaptionDataset, get_nle
  
  pretrained_ckpt = "mplugowl7bvideo/"
  trained_ckpt = "owl-con/checkpoint-5178/pytorch_model.bin"
@@ -30,19 +26,13 @@ tokenizer = LlamaTokenizer.from_pretrained(pretrained_ckpt)
  image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)
  processor = MplugOwlProcessor(image_processor, tokenizer)
  
-
  # Instantiate model
  model = MplugOwlForConditionalGeneration.from_pretrained(
      pretrained_ckpt,
      torch_dtype=torch.bfloat16,
      device_map={'': 'cpu'}
-     # device_map={'':0}
  )
  
- # for name, param in model.named_parameters():
- #     print(param.device)
- #     break
-
  peft_config = LoraConfig(
      target_modules=r'.*language_model.*\.(q_proj|v_proj|k_proj|o_proj|gate_proj|down_proj|up_proj)',
      inference_mode=True,
@@ -56,14 +46,31 @@ with open(trained_ckpt, 'rb') as f:
      ckpt = torch.load(f, map_location = torch.device("cpu"))
  model.load_state_dict(ckpt)
  model = model.to("cuda:0").to(torch.bfloat16)
- print('Model Loaded')
  
- PROMPT = """The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
- Human: <|video|>
- Human: Does this video entail the description: ""A soccer team walking off the field while the audience claps.""?
- AI: """
+ def inference(videopath, text):
+
+     PROMPT = """The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+ Human: <|video|>
+ Human: Does this video entail the description: "{caption}"?
+ AI: """
+
+     valid_data = MultiModalDataset(videopath, PROMPT.format(caption = text), tokenizer, processor, max_length = 256, loss_objective = 'sequential')
+     dataloader = DataLoader(valid_data, pin_memory=True, collate_fn=batchify)
+     score = get_scores(model, tokenizer, dataloader)
+
+     if score < 0.5:
+         dataset = VideoCaptionDataset(videopath, text)
+         dataloader = DataLoader(dataset)
+         nle = get_nle(model, processor, tokenizer, dataloader)
+     else:
+         nle = "None (NLE is only triggered when entailment score < 0.5)"
+
+     return score, nle
+
+ demo = gr.Interface(inference,
+                     title="Owl-Con Demo (Code: https://github.com/Hritikbansal/videocon | Paper: https://arxiv.org/abs/2311.10111)",
+                     inputs=[gr.Video(label='input_video'), gr.Textbox(label='input_caption')],
+                     outputs=[gr.Number(label='Entailment Score'), gr.Textbox(label='Natural Language Explanation')])
  
- valid_data = MultiModalDataset("examples/y5xuvHpDPZQ_000005_000015.mp4", PROMPT, tokenizer, processor, max_length = 256, loss_objective = 'sequential')
- dataloader = DataLoader(valid_data, pin_memory=True, collate_fn=batchify)
- score = get_scores(model, tokenizer, dataloader)
- print(score)
+ if __name__ == "__main__":
+     demo.launch()
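
Note: the new inference(videopath, text) helper above replaces the hard-coded single-example scoring that this commit removes from app.py. A minimal sketch of calling it directly, outside the Gradio UI, reusing the example video and caption from the removed debug code (and assuming app.py has already loaded the model, tokenizer, and processor on cuda:0):

# Hypothetical direct call to the inference() helper defined in app.py above;
# the video path and caption come from the removed hard-coded example.
score, nle = inference(
    "examples/y5xuvHpDPZQ_000005_000015.mp4",
    "A soccer team walking off the field while the audience claps.",
)
print(score)  # entailment score returned by get_scores
print(nle)    # natural language explanation, or the placeholder string when score >= 0.5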
 
 
nle_inference.py CHANGED
@@ -11,19 +11,6 @@ from transformers.models.llama.tokenization_llama import LlamaTokenizer
  from mplug_owl_video.modeling_mplug_owl import MplugOwlForConditionalGeneration
  from mplug_owl_video.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor
  
- parser = argparse.ArgumentParser()
-
- parser.add_argument('--input_file', type = str, required = True, help = 'input csv file')
- parser.add_argument('--output_file', type = str, help = 'output csv file')
- parser.add_argument('--pretrained_ckpt', type = str, required = True, help = 'pretrained ckpt')
- parser.add_argument('--trained_ckpt', type = str, help = 'trained ckpt')
- parser.add_argument('--lora_r', type = int, default = 32)
- parser.add_argument('--use_lora', action = 'store_true', help = 'lora model')
- parser.add_argument('--all_params', action = 'store_true', help = 'all params')
- parser.add_argument('--batch_size', type = int, default = 1)
- parser.add_argument('--num_frames', type = int, default = 32)
-
- args = parser.parse_args()
  
  PROMPT_FEEDBACK = '''The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
  Human: <|video|>
@@ -38,89 +25,28 @@ generate_kwargs = {
  
  class VideoCaptionDataset(Dataset):
  
-     def __init__(self, input_file):
-         self.data = pd.read_csv(input_file)
+     def __init__(self, videopath, text):
+         self.videopath = videopath
+         self.text = text
  
      def __len__(self):
-         return len(self.data)
+         return 1
  
      def __getitem__(self, index):
          item = {}
-         item['videopath'] = self.data.iloc[index]['videopath']
-         item['neg_caption'] = self.data.iloc[index]['neg_caption']
+         item['videopath'] = self.videopath
+         item['neg_caption'] = self.text
          return item
  
- def get_nle(args, model, processor, tokenizer, dataloader):
-
+ def get_nle(model, processor, tokenizer, dataloader):
      with torch.no_grad():
          for _, batch in tqdm(enumerate(dataloader)):
              videopaths = batch['videopath']
              neg_caption = batch['neg_caption'][0]
              prompts = [PROMPT_FEEDBACK.format(caption = neg_caption)]
-             inputs = processor(text=prompts, videos=videopaths, num_frames=args.num_frames, return_tensors='pt')
+             inputs = processor(text=prompts, videos=videopaths, num_frames=32, return_tensors='pt')
              inputs = {k: v.bfloat16() if v.dtype == torch.float else v for k, v in inputs.items()}
              inputs = {k: v.to(model.device) for k, v in inputs.items()}
              res = model.generate(**inputs, **generate_kwargs)
              generated_nle = tokenizer.decode(res.tolist()[0], skip_special_tokens=True)
-
-             with open(args.output_file, 'a') as f:
-                 writer = csv.writer(f)
-                 writer.writerow([videopaths[0], neg_caption, generated_nle])
-
- def main():
-
-     # Create dataloader
-     dataset = VideoCaptionDataset(args.input_file)
-     dataloader = DataLoader(dataset, batch_size = args.batch_size)
-
-     pretrained_ckpt = args.pretrained_ckpt
-
-     # Processors
-     tokenizer = LlamaTokenizer.from_pretrained(pretrained_ckpt)
-     image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)
-     processor = MplugOwlProcessor(image_processor, tokenizer)
-
-     # Instantiate model
-     model = MplugOwlForConditionalGeneration.from_pretrained(
-         pretrained_ckpt,
-         torch_dtype=torch.bfloat16,
-         device_map={'':0}
-     )
-
-     if args.use_lora:
-         for name, param in model.named_parameters():
-             param.requires_grad = False
-         if args.all_params:
-             peft_config = LoraConfig(
-                 target_modules=r'.*language_model.*\.(q_proj|v_proj|k_proj|o_proj|gate_proj|down_proj|up_proj)',
-                 inference_mode=True,
-                 r=args.lora_r,
-                 lora_alpha=16,
-                 lora_dropout=0.05
-             )
-         else:
-             peft_config = LoraConfig(
-                 target_modules=r'.*language_model.*\.(q_proj|v_proj|k_proj|o_proj)',
-                 inference_mode=True,
-                 r=args.lora_r,
-                 lora_alpha=16,
-                 lora_dropout=0.05
-             )
-
-         model = get_peft_model(model, peft_config)
-         model.print_trainable_parameters()
-         with open(args.trained_ckpt, 'rb') as f:
-             ckpt = torch.load(f, map_location = torch.device(f"cuda:0"))
-         model.load_state_dict(ckpt)
-         model = model.to(torch.bfloat16)
-         print('Model Loaded')
-
-     model.eval()
-
-     # get nle
-     get_nle(args, model, processor, tokenizer, dataloader)
-
-
-
- if __name__ == "__main__":
-     main()
+             return generated_nle
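
Note: this change turns nle_inference.py from a standalone CLI script into a small library: VideoCaptionDataset now wraps a single (videopath, caption) pair instead of reading a CSV, and get_nle returns the generated explanation instead of appending rows to an output file. A minimal sketch of the new call pattern, mirroring the call site in app.py (assumes model, processor, and tokenizer are the already-loaded Owl-Con components):

# Sketch of the refactored single-example API; DataLoader's default
# batch_size of 1 matches the one-item dataset.
from torch.utils.data import DataLoader
from nle_inference import VideoCaptionDataset, get_nle

dataset = VideoCaptionDataset("examples/y5xuvHpDPZQ_000005_000015.mp4",
                              "A soccer team walking off the field while the audience claps.")
dataloader = DataLoader(dataset)
explanation = get_nle(model, processor, tokenizer, dataloader)
print(explanation)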