import gradio as gr import spaces import subprocess import os import shutil import string import random from pypdf import PdfReader import ocrmypdf def random_word(length): letters = string.ascii_lowercase return "".join(random.choice(letters) for _ in range(length)) def convert_pdf(input_file): reader = PdfReader(input_file) metadata = extract_metadata_from_pdf(reader) text = extract_text_from_pdf(reader) # Check if there are any images image_count = 0 for page in reader.pages: image_count += len(page.images) # If there are images and not much content, perform OCR on the document if image_count > 0 and len(text) < 1000: out_pdf_file = input_file.replace(".pdf", "_ocr.pdf") ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True) # Re-extract text text = extract_text_from_pdf(PdfReader(input_file)) # Delete the OCR file os.remove(out_pdf_file) return text, metadata def extract_text_from_pdf(reader): full_text = "" for idx, page in enumerate(reader.pages): text = page.extract_text() if len(text) > 0: full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n" return full_text.strip() def extract_metadata_from_pdf(reader): return { "author": reader.metadata.author, "creator": reader.metadata.creator, "producer": reader.metadata.producer, "subject": reader.metadata.subject, "title": reader.metadata.title, } def convert_pandoc(input_file, filename): # Temporarily copy the file shutil.copyfile(input_file, filename) # Convert the file to markdown with pandoc output_file = f"{random_word(16)}.md" result = subprocess.call(["pandoc", filename, "-t", "markdown", "-o", output_file]) if result != 0: raise ValueError("Error converting file to markdown with pandoc") # Read the file and delete temporary files with open(output_file, "r") as f: markdown = f.read() os.remove(output_file) os.remove(filename) return markdown @spaces.GPU def convert(input_file, filename): plain_text_filetypes = [ ".txt", ".csv", ".tsv", ".md", ".yaml", ".toml", ".json", ".json5", ".jsonc", ] # Already a plain text file that wouldn't benefit from pandoc so return the content if any(filename.endswith(ft) for ft in plain_text_filetypes): with open(input_file, "r") as f: return f.read(), {} if filename.endswith(".pdf"): return convert_pdf(input_file) return convert_pandoc(input_file, filename), {} # We accept a filename because the gradio JS interface removes this information # and it's critical for choosing the correct processing pipeline gr.Interface( convert, inputs=[gr.File(label="Upload File", type="filepath"), gr.Text(label="Filename")], outputs=[ gr.Text(label="Markdown"), gr.JSON(label="Metadata"), ], ).launch()