Multimodal-OCR

Paused

App Files Files Community

prithivMLmods committed on May 5

Commit

c307af6

verified ·

1 Parent(s): ce03905

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -102

app.py CHANGED Viewed

@@ -14,9 +14,7 @@ from transformers import (
 )
 from transformers import Qwen2_5_VLForConditionalGeneration
-# ---------------------------
 # Helper Functions
-# ---------------------------
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
@@ -49,7 +47,6 @@ def downsample_video(video_path):
     if total_frames <= 0 or fps <= 0:
         vidcap.release()
         return frames
-    # Determine 10 evenly spaced frame indices.
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -63,8 +60,7 @@ def downsample_video(video_path):
     return frames
 # Model and Processor Setup
-# Qwen2VL OCR (default branch)
-QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"  # [or] prithivMLmods/Qwen2-VL-OCR2-2B-Instruct
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
@@ -72,7 +68,6 @@ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# RolmOCR branch (@RolmOCR)
 ROLMOCR_MODEL_ID = "reducto/RolmOCR"
 rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
 rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -83,111 +78,62 @@ rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 # Main Inference Function
 @spaces.GPU
-def model_inference(input_dict, history):
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
-    # RolmOCR Inference (@RolmOCR)
-    if text.lower().startswith("@rolmocr"):
-        # Remove the tag from the query.
-        text_prompt = text[len("@rolmocr"):].strip()
-        # Check if a video is provided for inference.
-        if files and isinstance(files[0], str) and files[0].lower().endswith((".mp4", ".avi", ".mov")):
-            video_path = files[0]
-            frames = downsample_video(video_path)
             if not frames:
                 yield "Error: Could not extract frames from the video."
                 return
-            # Build the message: prompt followed by each frame with its timestamp.
-            content_list = [{"type": "text", "text": text_prompt}]
-            for image, timestamp in frames:
-                content_list.append({"type": "text", "text": f"Frame {timestamp}:"})
-                content_list.append({"type": "image", "image": image})
-            messages = [{"role": "user", "content": content_list}]
-            # For video, extract images only.
-            video_images = [image for image, _ in frames]
-            prompt_full = rolmocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            inputs = rolmocr_processor(
-                text=[prompt_full],
-                images=video_images,
-                return_tensors="pt",
-                padding=True,
-            ).to("cuda")
         else:
-            # Assume image(s) or text query.
-            if len(files) > 1:
-                images = [load_image(image) for image in files]
-            elif len(files) == 1:
-                images = [load_image(files[0])]
-            else:
-                images = []
-            if text_prompt == "" and not images:
-                yield "Error: Please input a text query and/or provide an image for the @RolmOCR feature."
                 return
-            messages = [{
-                "role": "user",
-                "content": [
-                    *[{"type": "image", "image": image} for image in images],
-                    {"type": "text", "text": text_prompt},
-                ],
-            }]
-            prompt_full = rolmocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            inputs = rolmocr_processor(
-                text=[prompt_full],
-                images=images if images else None,
-                return_tensors="pt",
-                padding=True,
-            ).to("cuda")
-        streamer = TextIteratorStreamer(rolmocr_processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-        thread = Thread(target=rolmocr_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        # Use a different color scheme for RolmOCR (purple-themed).
-        yield progress_bar_html("Processing with Qwen2.5VL (RolmOCR)")
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        return
-    # Default Inference: Qwen2VL OCR
-    # Process files: support multiple images.
-    if len(files) > 1:
-        images = [load_image(image) for image in files]
-    elif len(files) == 1:
-        images = [load_image(files[0])]
-    else:
-        images = []
-    if text == "" and not images:
-        yield "Error: Please input a text query and optionally image(s)."
-        return
-    if text == "" and images:
-        yield "Error: Please input a text query along with the image(s)."
-        return
-    messages = [{
-        "role": "user",
-        "content": [
-            *[{"type": "image", "image": image} for image in images],
-            {"type": "text", "text": text},
-        ],
-    }]
-    prompt_full = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = qwen_processor(
         text=[prompt_full],
-        images=images if images else None,
         return_tensors="pt",
         padding=True,
     ).to("cuda")
-    streamer = TextIteratorStreamer(qwen_processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-    thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
-    yield progress_bar_html("Processing with Qwen2VL OCR")
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
@@ -196,25 +142,26 @@ def model_inference(input_dict, history):
 # Gradio Interface
 examples = [
-    [{"text": "@RolmOCR OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
-    [{"text": "@RolmOCR Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
-    [{"text": "@RolmOCR OCR the Image", "files": ["rolm/3.jpeg"]}],
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR `@RolmOCR and Default Qwen2VL OCR`**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
-        label="Query Input",
-        file_types=["image", "video"],
-        file_count="multiple",
-        placeholder="Use tag @RolmOCR for RolmOCR, or leave blank for default Qwen2VL OCR"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,
 )
 demo.launch(debug=True)

 )
 from transformers import Qwen2_5_VLForConditionalGeneration
 # Helper Functions
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
     if total_frames <= 0 or fps <= 0:
         vidcap.release()
         return frames
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
     return frames
 # Model and Processor Setup
+QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     QV_MODEL_ID,
     torch_dtype=torch.float16
 ).to("cuda").eval()
 ROLMOCR_MODEL_ID = "reducto/RolmOCR"
 rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
 rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 # Main Inference Function
 @spaces.GPU
+def model_inference(input_dict, history, use_rolmocr=False):
     text = input_dict["text"].strip()
     files = input_dict.get("files", [])
+    if not text and not files:
+        yield "Error: Please input a text query or provide files (images or videos)."
+        return
+    # Process files: images and videos
+    image_list = []
+    for idx, file in enumerate(files):
+        if file.lower().endswith((".mp4", ".avi", ".mov")):
+            frames = downsample_video(file)
             if not frames:
                 yield "Error: Could not extract frames from the video."
                 return
+            for frame, timestamp in frames:
+                label = f"Video {idx+1} Frame {timestamp}:"
+                image_list.append((label, frame))
         else:
+            try:
+                img = load_image(file)
+                label = f"Image {idx+1}:"
+                image_list.append((label, img))
+            except Exception as e:
+                yield f"Error loading image: {str(e)}"
                 return
+    # Build content list
+    content = [{"type": "text", "text": text}]
+    for label, img in image_list:
+        content.append({"type": "text", "text": label})
+        content.append({"type": "image", "image": img})
+    messages = [{"role": "user", "content": content}]
+    # Select processor and model
+    processor = rolmocr_processor if use_rolmocr else qwen_processor
+    model = rolmocr_model if use_rolmocr else qwen_model
+    model_name = "RolmOCR" if use_rolmocr else "Qwen2VL OCR"
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    all_images = [item["image"] for item in content if item["type"] == "image"]
+    inputs = processor(
         text=[prompt_full],
+        images=all_images if all_images else None,
         return_tensors="pt",
         padding=True,
     ).to("cuda")
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
+    yield progress_bar_html(f"Processing with {model_name}")
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
 # Gradio Interface
 examples = [
+    [{"text": "OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
+    [{"text": "Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
+    [{"text": "OCR the Image", "files": ["rolm/3.jpeg"]}],
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
+    description="# **Multimodal OCR with Model Selection**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
+        label="Query Input",
+        file_types=["image", "video"],
+        file_count="multiple",
+        placeholder="Input your query and optionally upload image(s) or video(s). Select the model using the checkbox."
     ),
     stop_btn="Stop Generation",
     multimodal=True,
     cache_examples=False,
+    additional_inputs=[gr.Checkbox(label="Use RolmOCR", value=False, info="Check to use RolmOCR, uncheck to use Qwen2VL OCR")],
 )
 demo.launch(debug=True)