# olmocr-demo / app.py
import os

# Set a writable cache directory for Hugging Face *before* importing
# transformers, so the hub resolves its cache paths under /tmp.
os.environ["HF_HOME"] = "/tmp/.cache/huggingface"

import base64
from io import BytesIO

import torch
import gradio as gr
from PIL import Image
from ebooklib import epub
from PyPDF2 import PdfReader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
# Load processor and model
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
).eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
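
# Optional check (a sketch, not required by the app): warn when no GPU is
# available, since bfloat16 generation with a 7B model on CPU is extremely slow.
if device.type == "cpu":
    print("Warning: no GPU detected; olmOCR-7B inference on CPU will be very slow.")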
def extract_text_from_page(pdf_path, page_num):
    # Render the page to a base64-encoded PNG
    image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
    image = Image.open(BytesIO(base64.b64decode(image_base64)))

    # Build the olmOCR prompt from the page's anchor text
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=256,
            num_return_sequences=1,
            do_sample=True,
        )

    # Decode only the newly generated tokens (strip the prompt)
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

    # Return the page text, plus the rendered image for page 1 (used as the cover)
    return decoded, (image_base64 if page_num == 1 else None)
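
# Optional wrapper (a sketch, not wired into process_pdf below): fall back to an
# empty result if a single page fails to render or OCR, so one bad page does not
# abort the whole conversion. Assumes the extract_text_from_page defined above.
def extract_text_safe(pdf_path, page_num):
    try:
        return extract_text_from_page(pdf_path, page_num)
    except Exception as exc:  # broad on purpose: rendering and generation can both fail
        print(f"Page {page_num} failed: {exc}")
        return "", None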
def process_pdf(file, title="Extracted PDF", author="olmOCR", language="en"):
    file_path = file.name
    reader = PdfReader(file_path)
    num_pages = len(reader.pages)

    all_text = []
    cover_image_data = None

    for page in range(1, num_pages + 1):
        text, cover_image = extract_text_from_page(file_path, page)
        all_text.append(f"<h2>Page {page}</h2><p>{text}</p>")
        if cover_image and not cover_image_data:
            cover_image_data = cover_image  # base64-encoded PNG of page 1

    # Build EPUB
    book = epub.EpubBook()
    book.set_identifier("id123456")
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    # Use the rendered first page as the cover image
    if cover_image_data:
        cover_bytes = base64.b64decode(cover_image_data)
        book.set_cover("cover.png", cover_bytes)

    # Create a single chapter containing the text of every page
    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
    chapter.content = f"<h1>{title}</h1>{''.join(all_text)}"
    book.add_item(chapter)

    book.toc = (epub.Link("chap1.xhtml", title, "chap1"),)
    book.add_item(epub.EpubNav())
    book.add_item(epub.EpubNcx())
    book.spine = ["nav", chapter]

    epub_path = f"/tmp/{title.replace(' ', '_')}.epub"
    epub.write_epub(epub_path, book)
    return epub_path
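
# Optional helper (a sketch, not used by process_pdf above): escape model output
# before embedding it in XHTML, since raw "<" or "&" characters in the OCR text
# would otherwise produce an invalid EPUB. Uses only the standard library.
import html

def page_to_xhtml(page_num: int, text: str) -> str:
    # Escape HTML-special characters so the chapter stays well-formed XHTML.
    return f"<h2>Page {page_num}</h2><p>{html.escape(text)}</p>"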
# Gradio Interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(value="Extracted PDF", label="EPUB Title"),
        gr.Textbox(value="olmOCR", label="Author"),
        gr.Textbox(value="en", label="Language"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="olmOCR PDF to EPUB (Full PDF + Cover Image)",
    description="Extract text from ALL pages of a PDF and generate an EPUB with the first page as cover.",
    allow_flagging="never",
)
if __name__ == "__main__":
    iface.launch()