File size: 1,478 Bytes
145d936
 
312add7
 
34c42f9
4d1d4d1
b697ac0
 
 
 
 
 
 
 
 
 
145d936
312add7
 
c1d7645
3bf066d
 
 
 
 
 
 
 
 
b697ac0
 
 
312add7
 
 
 
c1d7645
b697ac0
 
312add7
 
c1d7645
b697ac0
 
 
145d936
b697ac0
145d936
 
 
 
c1d7645
 
 
 
 
 
 
145d936
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from pathlib import Path

import spaces
import gradio as gr
from pypdf import PdfReader
import ocrmypdf


def extract_text_from_pdf(reader):
    """Return the text of every non-empty page, each under a page header.

    Args:
        reader: An object exposing ``pages``, an iterable of page objects
            that each provide an ``extract_text()`` method.

    Returns:
        The concatenated page texts with surrounding whitespace stripped.
        Each page is introduced by a ``---- Page N ----`` line, where ``N``
        is the 0-based position of the page in ``reader.pages``. Pages that
        yield no text are skipped. An empty string is returned when no page
        has text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page: this is the expensive step, and the result
        # is needed both for the emptiness check and for the output.
        text = page.extract_text()
        if text:  # skips both "" and a missing (None) extraction result
            sections.append(f"---- Page {idx} ----\n{text}\n\n")

    return "".join(sections).strip()


@spaces.GPU
def convert(pdf_file):
    """Extract the text and document metadata of a PDF, running OCR if needed.

    The text layer is extracted first. When that yields fewer than 1000
    characters and at least one page contains an image, the document is
    passed through ``ocrmypdf`` and the text is re-extracted from the OCR
    output.

    Args:
        pdf_file: Filesystem path of the PDF to process, as a string.

    Returns:
        A ``(full_text, metadata)`` tuple. ``metadata`` is a dict with the
        keys ``author``, ``creator``, ``producer``, ``subject`` and
        ``title``; every value is ``None`` when the PDF has no metadata.
    """
    reader = PdfReader(pdf_file)

    # ``reader.metadata`` is None for PDFs without a document information
    # dictionary; getattr's default keeps the result shape stable in that case.
    info = reader.metadata
    metadata = {
        field: getattr(info, field, None)
        for field in ("author", "creator", "producer", "subject", "title")
    }

    full_text = extract_text_from_pdf(reader)

    # Only scan for images when the text layer is sparse; any() stops at the
    # first page that contains one.
    needs_ocr = len(full_text) < 1000 and any(
        len(page.images) > 0 for page in reader.pages
    )
    if needs_ocr:
        # Build the sibling "<stem>_ocr.pdf" path from the file name only, so
        # ".pdf" elsewhere in the path is left alone and the output can never
        # be the same file as the input.
        source = Path(pdf_file)
        out_pdf_file = str(source.with_name(f"{source.stem}_ocr.pdf"))
        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)

        # Re-extract from the OCR output, not from the original upload.
        reader = PdfReader(out_pdf_file)
        full_text = extract_text_from_pdf(reader)

    return full_text, metadata


# Wire `convert` into a one-input, two-output Gradio UI and start serving it.
# The upload is handed to `convert` as a file path; the extracted text and the
# metadata dict are shown in a text box and a JSON viewer respectively.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
)
demo.launch()