pdf-to-markdown / app.py
Liam Dyer
rewrite with pypdf and ocrmypdf
312add7 unverified
raw
history blame
912 Bytes
from pathlib import Path

import spaces
import gradio as gr
from pypdf import PdfReader
import ocrmypdf
@spaces.GPU
def convert(pdf_file):
    """Extract the text of a PDF, running OCR first when it contains images.

    Args:
        pdf_file: Filesystem path of the uploaded PDF (the Gradio input is
            declared with ``type="filepath"``), or ``None`` when nothing
            was uploaded.

    Returns:
        A ``(text, metadata)`` tuple. ``text`` is every page's extracted
        text, each preceded by a ``---- Page N ----`` separator (``N`` is
        the 0-based page index). ``metadata`` is the document information
        of the uploaded file as reported by pypdf.

    Raises:
        gr.Error: If no file was provided.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    reader = PdfReader(pdf_file)
    # Capture metadata from the uploaded document before the reader is
    # possibly replaced by one over the OCR output.
    metadata = reader.metadata

    # If any page embeds an image, perform OCR on the whole document.
    if any(len(page.images) > 0 for page in reader.pages):
        source = Path(pdf_file)
        # Build the sibling output path from the stem so that an upper-case
        # extension or a ".pdf" elsewhere in the path cannot produce a wrong
        # (or identical) output location.
        ocr_path = source.with_name(f"{source.stem}_ocr.pdf")
        ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
        # Re-open the OCR result; otherwise the text below would still be
        # read from the original, un-OCR'd document.
        reader = PdfReader(ocr_path)

    # Extract text page by page.
    full_text = "".join(
        f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
        for idx, page in enumerate(reader.pages)
    )
    return full_text, metadata
# Build the web UI: one PDF upload (handed to `convert` as a file path) and
# two outputs matching the (text, metadata) tuple that `convert` returns.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[
        gr.Text(label="Markdown"),
        gr.JSON(label="Metadata"),
    ],
)

# Start serving the interface.
demo.launch()