# Chatiolmocr / app.py
# Tharjama's picture
# Update app.py
# 33b67be verified
import gradio as gr
import fitz # PyMuPDF for handling PDFs
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch
import torchvision
from PIL import Image
import io
# Initialize the OCR model and processor from Hugging Face.
# These statements run at import time, so the checkpoint is loaded once and
# the resulting `processor` / `model` globals are reused by every ocr_pdf() call.
model_name = "allenai/olmOCR-2-7B-1025-FP8"
# Processor used by ocr_pdf() both to prepare page images and to decode output ids.
processor = AutoProcessor.from_pretrained(model_name)
# NOTE(review): no device or dtype arguments are passed, so placement and
# precision are left to the library defaults — confirm that is intended for
# an FP8 checkpoint.
model = AutoModelForVision2Seq.from_pretrained(model_name)
# Function to perform OCR on a PDF, page by page using olmocr
def ocr_pdf(pdf_file):
    """Run the OCR model over every page of a PDF and return the combined text.

    Args:
        pdf_file: The uploaded file as delivered by the Gradio ``File``
            component: either a path string or an object whose ``name``
            attribute holds the path. ``None`` (sent when the upload is
            cleared) is accepted.

    Returns:
        A single string with one ``"Page N:"`` section per page, sections
        separated by a blank line. An empty string when no file is supplied
        or the document has no pages.
    """
    # Gradio fires the `change` event with None when the upload is cleared;
    # there is nothing to process in that case.
    if pdf_file is None:
        return ""

    # Accept both a plain path string and a file-like wrapper exposing `.name`.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    ocr_results = []  # One formatted entry per page, in page order.

    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # Rasterize the page and hand it to PIL via an in-memory PNG.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes("png")))

            # NOTE(review): the processor is called with an image only and no
            # text prompt — confirm this call form is supported for this
            # model, or build the inputs from the model's chat template.
            inputs = processor(images=img, return_tensors="pt")
            # Keep the input tensors on the same device as the model weights.
            inputs = inputs.to(model.device)

            # Inference only: skip gradient tracking.
            with torch.no_grad():
                outputs = model.generate(**inputs)

            # Decode the first (only) generated sequence into text.
            ocr_text = processor.decode(outputs[0], skip_special_tokens=True)

            # Prefix each page's text with its 1-based page number.
            ocr_results.append(f"Page {page_num}:\n{ocr_text}")

    # Join all page results into one string for display.
    return "\n\n".join(ocr_results)
# Gradio interface
def create_gradio_interface():
    """Build the Gradio Blocks UI that runs OCR on an uploaded PDF.

    The layout is a heading, a file-upload widget, and a text box. Whenever
    the uploaded file changes, ``ocr_pdf`` is called with the file and its
    return value is shown in the text box.

    Returns:
        The configured ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks() as demo:
        gr.Markdown("### OCR of PDF Pages using olmocr Model")
        # `type` is deliberately left at the component default: the value
        # "file" is rejected by Gradio 4+ (only "filepath"/"binary" are
        # accepted), while the default of each major version yields a value
        # whose `.name` is the uploaded file's path.
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        output_text = gr.Textbox(label="OCR Results", lines=15)
        # Re-run OCR every time the upload changes.
        file_input.change(ocr_pdf, inputs=file_input, outputs=output_text)
    return demo
# Script entry point: build the Gradio UI and start serving it.
# Guarded so that importing this module does not launch the server.
if __name__ == "__main__":
    create_gradio_interface().launch()