File size: 2,749 Bytes
2342442
 
e1bd857
 
 
 
 
 
 
 
 
 
 
26072cc
f9aff1d
26072cc
185443c
2342442
 
e1bd857
 
 
 
 
c91ff76
 
 
 
 
 
 
 
 
 
 
e1bd857
 
 
 
 
5abf32d
2342442
26072cc
 
 
1c8ed3e
2e6fb2c
1c8ed3e
 
 
 
95e9b45
ea69e31
 
e787464
f9aff1d
 
 
 
 
 
1c8ed3e
26072cc
 
 
 
 
1c8ed3e
e1bd857
 
 
2342442
1c8ed3e
ea69e31
fa29ba8
dd899a3
e1bd857
ea69e31
e1bd857
2342442
e1bd857
dd899a3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
import tempfile
import re
import os 
import spacy
import pytesseract
import pdf2image
import subprocess
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)
import fitz  # PyMuPDF
from PIL import Image, UnidentifiedImageError
import io
import base64

def clean_text(text):
    """Collapse all whitespace in *text* and trim both ends.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, then leading and trailing whitespace is
    removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an empty or whitespace-only input yields "".
    """
    # A single pass is enough: r'\s+' already matches newline runs, so no
    # separate newline-collapsing step is required beforehand.
    return re.sub(r'\s+', ' ', text).strip()

def safe_base64_decode(s):
    """Decode a base64 value, tolerating missing ``=`` padding.

    Args:
        s: The base64 payload, either as ``str`` or as ``bytes``/``bytearray``.

    Returns:
        The decoded bytes, or ``None`` when the payload cannot be decoded
        (the error is printed rather than raised).
    """
    # add missing padding if necessary
    missing_padding = len(s) % 4
    if missing_padding:
        # Pad with the same type as the input so bytes payloads work too.
        pad_char = b'=' if isinstance(s, (bytes, bytearray)) else '='
        s = s + pad_char * (4 - missing_padding)
    try:
        return base64.b64decode(s)
    except ValueError as e:
        # binascii.Error (raised for malformed base64) is a ValueError
        # subclass; catching ValueError also covers non-ASCII str input.
        print("Error decoding base64 string:", e)
        return None

def image_to_latex(image):
    """Run the ``pix2tex`` command-line tool on *image* and return its output.

    Args:
        image: An object with a ``save(path)`` method (a PIL image in this
            file) that can be written out as a PNG file.

    Returns:
        Whatever ``pix2tex`` wrote to standard output, as text. The exit
        status and standard error of the tool are not inspected.
    """
    # Each call gets its own private directory, so concurrent conversions
    # never overwrite one another's image file, and the file is removed
    # once the tool has finished.
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = os.path.join(work_dir, "equation.png")
        image.save(image_path)
        result = subprocess.run(
            ["pix2tex", image_path], capture_output=True, text=True
        )
    return result.stdout

def _page_image_latex(doc, page, page_number):
    """Return the ``image_to_latex`` output for each image embedded in *page*.

    Images that cannot be extracted or opened are reported on stdout and
    skipped.
    """
    snippets = []
    for img in page.get_images(full=True):
        # get_images() items only describe the image; the first entry is its
        # xref, which is used to pull the actual encoded bytes out of the PDF.
        xref = img[0]
        extracted = doc.extract_image(xref)
        image_bytes = extracted.get("image") if extracted else None
        if not image_bytes:
            print(f"Could not identify image on page {page_number}")
            continue
        try:
            image = Image.open(io.BytesIO(image_bytes))
            snippets.append(image_to_latex(image))
        except UnidentifiedImageError:
            print(f"Could not identify image on page {page_number}")
    return snippets


def pdf_to_text(file):
    """Convert a PDF into a cleaned text file and return that file's name.

    For every page the extracted text is combined with the ``image_to_latex``
    output of each embedded image, normalised with ``clean_text`` and, when
    more than five words remain, written under a
    ``## Metadata: Page Number N`` heading.

    Args:
        file: An object whose ``name`` attribute is the path of the PDF.

    Returns:
        The name of the ``.txt`` file that was written. It is created in the
        current working directory and named after the PDF's base name.
    """
    pages = []
    # The context manager closes the document even if a page fails.
    with fitz.open(file.name) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Extract text
            page_text = page.get_text()

            # Extract images and convert to LaTeX
            for latex_code in _page_image_latex(doc, page, page_number):
                page_text += "\n" + latex_code  # Add LaTeX code to page text

            page_text = clean_text(page_text)
            # Pages with five words or fewer are left out of the result.
            if len(page_text.split()) > 5:
                pages.append(
                    "## Metadata: Page Number " + str(page_number) + "\n"
                    + page_text + "\n\n"
                )

    base_name = os.path.splitext(os.path.basename(file.name))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII text is written regardless of the
    # platform's default encoding.
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write("".join(pages))

    return output_file_name

# Web front end: one file upload in, one downloadable text file out,
# with pdf_to_text doing the conversion. The server starts on import/run.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=gr.inputs.File(label="Your PDF"),
    outputs=gr.outputs.File(label="Download TXT"),
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)
iface.launch()