File size: 2,448 Bytes
dd1cb9c
1f0ed21
6c400a9
 
efce880
 
6c400a9
 
efce880
 
 
 
 
dd1cb9c
 
6c400a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efce880
 
6c400a9
efce880
 
 
 
 
 
 
dd1cb9c
 
6c400a9
 
 
 
 
 
 
 
 
 
 
 
 
 
dd1cb9c
1cf5a2d
 
 
dd1cb9c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
import spaces
import subprocess
import os
import string
import random
from pypdf import PdfReader
import ocrmypdf


def random_word(length):
    """Return a random word made of ``length`` lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    # One random.choice draw per character, in order.
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)


def convert_pdf(input_file):
    """Extract text and metadata from a PDF, running OCR when text is sparse.

    The document is OCR'd with ``ocrmypdf`` when it contains at least one
    image and fewer than 1000 characters of extractable text; the text is
    then re-extracted from the OCR output.

    Args:
        input_file: Path to the PDF file.

    Returns:
        A ``(text, metadata)`` tuple: the text produced by
        ``extract_text_from_pdf`` and the dict produced by
        ``extract_metadata_from_pdf``.
    """
    reader = PdfReader(input_file)
    metadata = extract_metadata_from_pdf(reader)
    text = extract_text_from_pdf(reader)

    # Only look for images when the text is sparse; stop at the first page
    # that has one, since a single image is enough to trigger OCR.
    needs_ocr = len(text) < 1000 and any(
        len(page.images) > 0 for page in reader.pages
    )

    if needs_ocr:
        # splitext only touches the final extension, so a ".pdf" elsewhere in
        # the path is left alone and the output never aliases the input.
        root, _ext = os.path.splitext(input_file)
        out_pdf_file = f"{root}_ocr.pdf"
        try:
            ocrmypdf.ocr(input_file, out_pdf_file, force_ocr=True)

            # Re-extract from the OCR output, not the original file, so the
            # recognised text is what gets returned.
            text = extract_text_from_pdf(PdfReader(out_pdf_file))
        finally:
            # Remove the temporary OCR file even if OCR or extraction failed.
            if os.path.exists(out_pdf_file):
                os.remove(out_pdf_file)

    return text, metadata


def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF reader.

    Each page that yields text is emitted as a ``---- Page N ----`` header
    line (``N`` is the zero-based page index) followed by the page text and a
    blank line. Pages without text are skipped.

    Args:
        reader: Object exposing ``pages``, an iterable of page objects with an
            ``extract_text()`` method.

    Returns:
        The combined text with leading and trailing whitespace stripped, or an
        empty string when no page has text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page and reuse the result for the output.
        text = page.extract_text()
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")

    return "".join(sections).strip()


def extract_metadata_from_pdf(reader):
    """Collect the descriptive metadata fields of a PDF reader into a dict.

    Args:
        reader: Object exposing a ``metadata`` attribute that is either
            ``None`` or an object with ``author``, ``creator``, ``producer``,
            ``subject`` and ``title`` attributes.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject`` and ``title``. Every value is ``None`` when the reader
        has no metadata.
    """
    field_names = ("author", "creator", "producer", "subject", "title")
    info = reader.metadata

    # A document without an information dictionary has no metadata object;
    # report every field as missing instead of raising AttributeError.
    if info is None:
        return dict.fromkeys(field_names)

    return {name: getattr(info, name) for name in field_names}


def convert_pandoc(input_file):
    """Convert a document to Markdown with the ``pandoc`` command-line tool.

    Args:
        input_file: Path to the document to convert.

    Returns:
        The Markdown text that pandoc wrote to standard output.

    Raises:
        FileNotFoundError: If the ``pandoc`` executable cannot be found.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    # Pass the command as an argument list: a single command string without
    # shell=True is taken as the program name on POSIX, and a list keeps paths
    # with spaces or shell metacharacters intact.
    # Without -o pandoc writes to stdout, so no temporary file is needed.
    completed = subprocess.run(
        ["pandoc", input_file, "-t", "markdown"],
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=True,
    )
    return completed.stdout


@spaces.GPU
def convert(input_file):
    """Convert the file at ``input_file`` to text based on its name suffix.

    Files ending in ``.txt``, ``.csv``, ``.tsv`` or ``.md`` are returned
    verbatim. Files ending in ``.pdf`` return the result of ``convert_pdf``.
    Anything else returns the result of ``convert_pandoc``.
    """
    # These are already plain text and would not benefit from pandoc.
    passthrough_suffixes = (".txt", ".csv", ".tsv", ".md")
    if input_file.endswith(passthrough_suffixes):
        with open(input_file, "r") as handle:
            contents = handle.read()
        return contents

    is_pdf = input_file.endswith(".pdf")
    return convert_pdf(input_file) if is_pdf else convert_pandoc(input_file)


# Build the upload-a-file UI around convert() and start serving it.
demo = gr.Interface(
    fn=convert,
    inputs=gr.File(label="Upload File", type="filepath"),
    outputs=gr.Text(label="Markdown"),
)
demo.launch()