KingNish committed on
Commit
f46faa5
1 Parent(s): 0f6e2c0

Update app.py

Files changed (1):
  app.py +40 -15
app.py CHANGED
@@ -17,9 +17,11 @@ model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
 
 processor = LlavaProcessor.from_pretrained(model_id)
 
-model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True)
+model = LlavaForConditionalGeneration.from_pretrained(model_id)
 model.to("cpu")
 
+def replace_video_with_images(text, frames):
+    return text.replace("<video>", "<image>" * frames)
 
 def sample_frames(video_file):
     try:
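For reference, sample_frames (context in the hunk above; its body is unchanged by this commit and not shown) has to turn a video path into a list of PIL frames. A minimal sketch of such a sampler built on OpenCV; the body and the default frame count here are assumptions, not code from this diff:

import cv2
from PIL import Image

def sample_frames_sketch(video_file, num_frames=12):  # name and default are hypothetical
    video = cv2.VideoCapture(video_file)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)
    frames = []
    for i in range(total_frames):
        ret, frame = video.read()
        if not ret:
            continue
        if i % interval == 0:
            # OpenCV decodes to BGR; PIL and the Llava processor expect RGB
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames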
@@ -88,26 +90,51 @@ def respond(message, history):
     vqa = ""
 
     user_prompt = message
+    message_text = message["text"]
+
     # Handle image processing
     if message["files"]:
-        image = user_prompt["files"][-1]
         txt = user_prompt["text"]
         img = user_prompt["files"]
+
+        if len(message["files"]) == 1:
+            image = [message["files"][0]]
+        elif len(message["files"]) > 1:
+            image = [msg for msg in message["files"]]
 
         video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
         image_extensions = Image.registered_extensions()
         image_extensions = tuple([ex for ex, f in image_extensions.items()])
-
-        if image.endswith(video_extensions):
-            gr.Info(f"Analyzing {video_extensions} file")
-            image = sample_frames(image)
-            image_tokens = "<image>" * int(len(image))
-            prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
-
-        elif image.endswith(image_extensions):
-            gr.Info("Analyzing image")
-            image = Image.open(image).convert("RGB")
-            prompt = f"<|im_start|>user <image>\n{user_prompt}<|im_end|><|im_start|>assistant"
+
+        if len(image) == 1:
+            if image[0].endswith(video_extensions):
+                gr.Info("Analyzing video")
+                image = sample_frames(image[0])
+                image_tokens = "<image>" * int(len(image))
+                prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
+            elif image[0].endswith(image_extensions):
+                gr.Info("Analyzing image")
+                image = Image.open(image[0]).convert("RGB")
+                prompt = f"<|im_start|>user <image>\n{user_prompt}<|im_end|><|im_start|>assistant"
+
+        elif len(image) > 1:
+            image_list = []
+
+            for img in image:
+                if img.endswith(image_extensions):
+                    gr.Info("Analyzing image")
+                    img = Image.open(img).convert("RGB")
+                    image_list.append(img)
+                elif img.endswith(video_extensions):
+                    gr.Info("Analyzing video")
+                    frames = sample_frames(img)
+                    for frame in frames:
+                        image_list.append(frame)
+
+            image_tokens = "<image>" * len(image_list)
+            prompt = f"<|im_start|>user {image_tokens}\n{user_prompt}<|im_end|><|im_start|>assistant"
+            image = image_list
 
     inputs = processor(prompt, image, return_tensors="pt")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, **{"skip_special_tokens": True})
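The branching added above preserves one invariant: the prompt must contain exactly one <image> token per image (or sampled frame) handed to the processor. A hypothetical walkthrough with stand-in values:

from PIL import Image

# Stand-ins for two uploaded photos plus three frames sampled from a video.
image_list = [Image.new("RGB", (336, 336), "gray") for _ in range(5)]

image_tokens = "<image>" * len(image_list)  # five <image> tokens for five images
user_text = "What do these have in common?"  # hypothetical user message
prompt = f"<|im_start|>user {image_tokens}\n{user_text}<|im_end|><|im_start|>assistant"

# processor is the LlavaProcessor created at the top of app.py
inputs = processor(prompt, image_list, return_tensors="pt")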
@@ -116,7 +143,6 @@ def respond(message, history):
 
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-    gr.Info("Generating output")
 
     buffer = ""
     for new_text in streamer:
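For context, the unchanged lines around this hunk use the standard transformers streaming pattern: model.generate() runs in a background thread while TextIteratorStreamer yields decoded text incrementally. A generic sketch, with max_new_tokens as an assumed value:

from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=512)  # 512 is an assumption

thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:  # blocks until the generate() thread produces text
    buffer += new_text
    # app.py yields each partial buffer back to the Gradio chat UI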
@@ -132,7 +158,6 @@ def respond(message, history):
         {"type": "function", "function": {"name": "image_qna", "description": "Answer question asked by user related to image", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "Question by user"}}, "required": ["query"]}}},
     ]
 
-    message_text = message["text"]
     func_caller.append({"role": "user", "content": f'[SYSTEM]You are a helpful assistant. You have access to the following functions: \n {str(functions_metadata)}\n\nTo use these functions respond with:\n<functioncall> {{ "name": "function_name", "arguments": {{ "arg_1": "value_1", "arg_1": "value_1", ... }} }} </functioncall> [USER] {message} {vqa}'})
 
     response = client_gemma.chat_completion(func_caller, max_tokens=150)
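The [SYSTEM] prompt above instructs the model to answer with a <functioncall> ... </functioncall> wrapper, so downstream code has to recover the JSON payload. A hedged sketch of such a parser; the helper name and regex are illustrative, not code from this commit:

import json
import re

def parse_function_call(response_text):  # hypothetical helper, not in app.py
    match = re.search(r"<functioncall>\s*(\{.*\})\s*</functioncall>", response_text, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None

reply = '<functioncall> {"name": "image_qna", "arguments": {"query": "What is shown?"}} </functioncall>'
parse_function_call(reply)
# -> {"name": "image_qna", "arguments": {"query": "What is shown?"}}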
 