Update app.py
app.py (CHANGED)
@@ -10,16 +10,15 @@ import torch
 def load_models():
     RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
     model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
-                                                            trust_remote_code=True, torch_dtype=torch.float32) #
+                                                            trust_remote_code=True, torch_dtype=torch.float32) # float32 for CPU
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
     return RAG, model, processor
 
 RAG, model, processor = load_models()
 
 # Function for OCR and search
-# Skip RAG search and use Qwen2VL for direct OCR
 def ocr_and_search(image, keyword):
-
+
     text_query = "Extract all the text in Sanskrit and English from the image."
 
     # Prepare message for Qwen model
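Note for context: the body of ocr_and_search past the "Prepare message for Qwen model" comment is not shown in this hunk. The sketch below is a minimal reconstruction of that step following the Qwen2-VL-2B-Instruct model card, not this Space's actual code; the run_qwen_ocr name, the max_new_tokens budget, and the process_vision_info helper from the qwen-vl-utils package are all assumptions.

# Hedged sketch: typical Qwen2-VL inference, runs on CPU with the float32 weights
# loaded above. `image` is a PIL.Image, `text_query` is the prompt from the diff.
from qwen_vl_utils import process_vision_info  # assumption: helper from qwen-vl-utils

def run_qwen_ocr(image, text_query, model, processor):
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text_query},
        ],
    }]
    # Render the chat template and gather the vision inputs
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                       padding=True, return_tensors="pt")
    # max_new_tokens is an assumed budget for a page of extracted text
    generated_ids = model.generate(**inputs, max_new_tokens=512)
    # Strip the prompt tokens so only the generated continuation is decoded
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(trimmed, skip_special_tokens=True,
                                  clean_up_tokenization_spaces=False)[0]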
@@ -64,9 +63,9 @@ def ocr_and_search(image, keyword):
     return extracted_text, matched_sentences, json_output
 
 
-# Gradio App
+# Gradio App
 def app(image, keyword):
-
+
     extracted_text, search_results, json_output = ocr_and_search(image, keyword)
 
     search_results_str = "\n".join(search_results) if search_results else "No matches found."
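The hunk above shows ocr_and_search returning extracted_text, matched_sentences, and json_output, but the keyword-matching step itself is outside the diff context. A plausible sketch of that step is below; the sentence-splitting rule and the JSON schema are both assumed for illustration, not taken from the Space.

import json
import re

def search_extracted_text(extracted_text, keyword):
    # Naive sentence split on terminal punctuation; the Space's actual
    # delimiter logic is unknown
    sentences = re.split(r"(?<=[.!?])\s+", extracted_text)
    matched_sentences = [s for s in sentences if keyword.lower() in s.lower()]
    # Assumed schema for the JSON payload returned as the third value
    json_output = json.dumps(
        {"extracted_text": extracted_text, "keyword": keyword, "matches": matched_sentences},
        ensure_ascii=False, indent=2)
    return matched_sentences, json_output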
@@ -77,7 +76,7 @@ def app(image, keyword):
 iface = gr.Interface(
     fn=app,
     inputs=[
-        gr.Image(type="pil", label="Upload an Image"),
+        gr.Image(type="pil", label="Upload an Image"),
     gr.Textbox(label="Enter keyword to search in extracted text", placeholder="Keyword")
     ],
     outputs=[
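The last hunk cuts off at outputs=[. For a runnable picture of the interface, here is a minimal self-contained sketch with the same two inputs; the stub app function, the three output components, and the launch() call are assumptions, chosen only to match app's three return values.

import gradio as gr

# Hypothetical stand-in for the Space's app() so this sketch runs on its own
def app(image, keyword):
    return "extracted text...", "matched sentences...", "{}"

iface = gr.Interface(
    fn=app,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(label="Enter keyword to search in extracted text", placeholder="Keyword"),
    ],
    # Assumed outputs: one component per value returned by app()
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Search Results"),
        gr.JSON(label="JSON Output"),
    ],
)

if __name__ == "__main__":
    iface.launch()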
|