Spaces:

pranshh
/

ocr-assignment

Running

App Files Files Community

pranshh committed on Sep 30, 2024

Commit

d2212a0

verified ·

1 Parent(s): 45f2b08

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -142

app.py CHANGED Viewed

@@ -7,151 +7,76 @@ Original file is located at
     https://colab.research.google.com/drive/1vzsQ17-W1Vy6yJ60XUwFy0QRkOR_SIg7
 """
 import gradio as gr
-from transformers import AutoModel, AutoTokenizer
 from PIL import Image
-import os
-revision = "5364fe1ab774ef13c2c79023dc91d8c1e7cfdce4"
-# Load tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
-model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, low_cpu_mem_usage=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
-model = model.eval()
-# Function to perform OCR and optional keyword search
-def process_image_with_search(image, keyword):
-    try:
-        # Save the PIL image to a temporary file
-        temp_img_path = "temp_image.png"
-        image.save(temp_img_path)
-        # Perform OCR with the model using the file path
-        extracted_text = model.chat(tokenizer, temp_img_path, ocr_type='format')
-        # Delete the temporary file
-        if os.path.exists(temp_img_path):
-            os.remove(temp_img_path)
-        # Convert extracted text to string if it's not already
-        extracted_text = extracted_text if isinstance(extracted_text, str) else str(extracted_text)
-        # If a keyword is provided, search for it
-        if keyword:
-            # Perform keyword search (case-insensitive)
-            if keyword.lower() in extracted_text.lower():
-                # Highlight the keyword in the extracted text
-                highlighted_text = extracted_text.replace(keyword, f"**{keyword}**", 1)  # Highlight first occurrence
-                result = f"Keyword '{keyword}' found:\n\n{highlighted_text}"
-            else:
-                result = f"Keyword '{keyword}' not found in the extracted text.\n\nExtracted Text:\n{extracted_text}"
-        else:
-            # If no keyword is provided, return the extracted text without searching
-            result = f"Extracted Text:\n\n{extracted_text}"
-        return result
-    except Exception as e:
-        return str(e)  # Return error message in case of failure
-# Define Gradio interface
-iface = gr.Interface(
-    fn=process_image_with_search,  # The function to process the image and search keyword
-    inputs=[gr.Image(type='pil'), gr.Textbox(label="Enter keyword to search (optional)")],  # Image input + Keyword input
-    outputs='text',  # Output will be plain text with the search result
-    title="OCR with GOT and Keyword Search",
-    description="Upload an image to get OCR results. You can also search for a keyword in the extracted text."
 )
-# Launch the interface
-iface.launch(debug=True)
-# !pip install --upgrade git+https://github.com/huggingface/transformers.git byaldi accelerate flash-attn qwen_vl_utils pdf2image gradio
-# !sudo apt-get install -y poppler-utils
-# from byaldi import RAGMultiModalModel
-# from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
-# from qwen_vl_utils import process_vision_info
-# import torch
-# import gradio as gr
-# from PIL import Image
-# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-# # Initialize the model with float16 precision and handle fallback to CPU
-# def load_model():
-#     try:
-#         vlm = Qwen2VLForConditionalGeneration.from_pretrained(
-#             "Qwen/Qwen2-VL-2B-Instruct",
-#             torch_dtype=torch.float16,
-#             attn_implementation="flash_attention_2",  # FlashAttention enabled
-#             device_map="cuda"
-#         )
-#         print("Model loaded with FlashAttention on GPU")
-#     except RuntimeError as e:
-#         if "FlashAttention only supports Ampere GPUs" in str(e):
-#             print("FlashAttention not supported. Falling back to standard attention.")
-#             vlm = Qwen2VLForConditionalGeneration.from_pretrained(
-#                 "Qwen/Qwen2-VL-2B-Instruct",
-#                 torch_dtype=torch.float16,  # Still use float16 to save memory
-#                 attn_implementation="default",  # Use standard attention mechanism
-#                 device_map="cuda" if torch.cuda.is_available() else "cpu"
-#             )
-#         else:
-#             raise e  # Raise other runtime errors if not related to FlashAttention
-#     return vlm
-# # Load the model
-# vlm = load_model()
-# # OCR function to extract text from an image
-# def ocr_image(image, query="Extract text from the image"):
-#     messages = [
-#         {
-#             "role": "user",
-#             "content": [
-#                 {
-#                     "type": "image",
-#                     "image": image,
-#                 },
-#                 {"type": "text", "text": query},
-#             ],
-#         }
-#     ]
-#     # Prepare inputs for the model
-#     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-#     image_inputs, video_inputs = process_vision_info(messages)
-#     inputs = processor(
-#         text=[text],
-#         images=image_inputs,
-#         videos=video_inputs,
-#         padding=True,
-#         return_tensors="pt",
-#     )
-#     inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
-#     # Generate the output text using the model
-#     generated_ids = vlm.generate(**inputs, max_new_tokens=512)
-#     generated_ids_trimmed = [
-#         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-#     ]
-#     output_text = processor.batch_decode(
-#         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-#     )
-#     return output_text[0]
-# # Gradio interface
-# def process_image(image):
-#     return ocr_image(image)
-# # Create Gradio interface for uploading an image
-# interface = gr.Interface(
-#     fn=process_image,
-#     inputs=gr.Image(type="pil"),
-#     outputs="text",
-#     title="Hindi & English OCR",
-#     description="Upload an image containing text in Hindi or English to extract the text using OCR."
-# )
-# # Launch Gradio interface in Colab
-# interface.launch(share=True, debug=True)

     https://colab.research.google.com/drive/1vzsQ17-W1Vy6yJ60XUwFy0QRkOR_SIg7
 """
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import torch
 import gradio as gr
 from PIL import Image
+# Processor for the same checkpoint; used below to build the chat prompt,
+# prepare model inputs, and decode generated token ids.
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+# Load Qwen2-VL-2B-Instruct on CPU in float32; this version has no GPU or
+# float16 path.
+def load_model():
+    """Return the Qwen2-VL-2B-Instruct model loaded in float32 on the CPU."""
+    return Qwen2VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2-VL-2B-Instruct",
+        torch_dtype=torch.float32,  # Use float32 for CPU
+        device_map="cpu"
+    )
+# Load the model once at import time; ocr_image reuses this instance.
+vlm = load_model()
+# OCR function to extract text from an image
+def ocr_image(image, query="Extract text from the image"):
+    """Run the vision-language model on one image and return its text reply.
+
+    Args:
+        image: Image placed in the user message's "image" entry (the Gradio
+            input in this file is configured with type="pil").
+        query: Text instruction sent alongside the image.
+
+    Returns:
+        The first decoded string from the generated batch.
+    """
+    # Single-turn chat: one user message holding the image and the instruction
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image,
+                },
+                {"type": "text", "text": query},
+            ],
+        }
+    ]
+    # Prepare inputs for the model
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # NOTE(review): presumably pulls the image/video payloads out of `messages`
+    # in the form the processor expects - confirm against qwen_vl_utils.
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    # Keep the inputs on the CPU, matching the device the model is loaded on
+    inputs = inputs.to("cpu")
+    # Generate the output text using the model (at most 512 new tokens)
+    generated_ids = vlm.generate(**inputs, max_new_tokens=512)
+    # Drop the leading prompt-length slice of each output sequence so only the
+    # tokens after the input are decoded
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    # Only one prompt was submitted, so return the first decoded string
+    return output_text[0]
+# Gradio interface
+def process_image(image):
+    """Gradio callback: run ocr_image on the uploaded image with its default query."""
+    return ocr_image(image)
+# Create Gradio interface for uploading an image
+interface = gr.Interface(
+    fn=process_image,
+    inputs=gr.Image(type="pil"),
+    outputs="text",
+    title="Hindi & English OCR",
+    description="Upload an image containing text in Hindi or English to extract the text using OCR."
 )
+# Start the Gradio app with default launch settings
+interface.launch()