Spaces:

pranshh
/

ocr-assignment

Running

App Files Files Community

pranshh committed on Sep 30, 2024

Commit

4af6e9e

verified ·

1 Parent(s): 8222a16

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -33

app.py CHANGED Viewed

@@ -7,64 +7,49 @@ Original file is located at
     https://colab.research.google.com/drive/1vzsQ17-W1Vy6yJ60XUwFy0QRkOR_SIg7
 """
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
 import torch
 import gradio as gr
 from PIL import Image
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-# Initialize the model with float16 precision and handle fallback to CPU
-# Simplified model loading function for CPU
 def load_model():
-    return Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
-        torch_dtype=torch.float32,  # Use float32 for CPU
-        low_cpu_mem_usage=True
     )
-# Load the model
 vlm = load_model()
-# OCR function to extract text from an image
 def ocr_image(image, query="Extract text from the image", keyword=""):
     messages = [
         {
             "role": "user",
             "content": [
                 {
                     "type": "image",
-                    "image": image,
                 },
                 {"type": "text", "text": query},
             ],
         }
     ]
-    # Prepare inputs for the model
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
     inputs = inputs.to("cpu")
-    # Generate the output text using the model
-    generated_ids = vlm.generate(**inputs, max_new_tokens=512)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )[0]
     if keyword:
         keyword_lower = keyword.lower()
         if keyword_lower in output_text.lower():
@@ -75,14 +60,14 @@ def ocr_image(image, query="Extract text from the image", keyword=""):
     else:
         return output_text
-# Gradio interface
 def process_image(image, keyword=""):
     max_size = 1024
     if max(image.size) > max_size:
         image.thumbnail((max_size, max_size))
     return ocr_image(image, keyword=keyword)
-# Update the Gradio interface:
 interface = gr.Interface(
     fn=process_image,
     inputs=[

     https://colab.research.google.com/drive/1vzsQ17-W1Vy6yJ60XUwFy0QRkOR_SIg7
 """
+from transformers import AutoProcessor
 import torch
 import gradio as gr
 from PIL import Image
+# Hypothetical imports
+from byaldi import ByaldiProcessor
+from colpali import ColPaliQwen2VLModel
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+byaldi_processor = ByaldiProcessor()
 def load_model():
+    return ColPaliQwen2VLModel.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct",
+        torch_dtype=torch.float32,
+        low_cpu_mem_usage=True,
+        device_map="auto"
     )
 vlm = load_model()
 def ocr_image(image, query="Extract text from the image", keyword=""):
+    processed_image = byaldi_processor.process_image(image)
     messages = [
         {
             "role": "user",
             "content": [
                 {
                     "type": "image",
+                    "image": processed_image,
                 },
                 {"type": "text", "text": query},
             ],
         }
     ]
+    inputs = processor(messages, return_tensors="pt")
     inputs = inputs.to("cpu")
+    output = vlm.generate(**inputs, max_new_tokens=512)
+    output_text = processor.decode(output[0], skip_special_tokens=True)
     if keyword:
         keyword_lower = keyword.lower()
         if keyword_lower in output_text.lower():
     else:
         return output_text
 def process_image(image, keyword=""):
+    # Resize image if it's too large
     max_size = 1024
     if max(image.size) > max_size:
         image.thumbnail((max_size, max_size))
     return ocr_image(image, keyword=keyword)
+# Gradio interface:
 interface = gr.Interface(
     fn=process_image,
     inputs=[