Spaces:
Runtime error
Runtime error
File size: 1,861 Bytes
2342442 e1bd857 2342442 e1bd857 2342442 e1bd857 2342442 e1bd857 2342442 f8b90c3 e1bd857 2342442 e1bd857 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import gradio as gr
import tempfile
import re
from PyPDF2 import PdfReader, PdfFileReader
import os
import spacy
import pytesseract
import pdf2image
import subprocess
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
def clean_text(text):
    """Collapse all whitespace in *text* to single spaces and trim the ends.

    Args:
        text: Raw text extracted from a PDF page. ``None`` or an empty
            string is accepted and yields ``""``.

    Returns:
        The text with every run of whitespace (spaces, tabs, newlines)
        replaced by one space, without leading or trailing whitespace.
    """
    if not text:
        return ""
    # The cleaning is purely regex-based, so no spaCy pipeline is loaded
    # here. One substitution is enough: r'\s+' also matches newline runs.
    return re.sub(r'\s+', ' ', text).strip()
def image_to_latex(image):
    """Run the ``pix2tex`` command on *image* and return what it prints.

    Args:
        image: An object with a ``save(path)`` method that writes the image
            to disk (presumably a PIL image from pdf2image; verify against
            the caller).

    Returns:
        The captured standard output of the ``pix2tex`` process as a string.

    Raises:
        FileNotFoundError: If the ``pix2tex`` executable is not on ``PATH``.
    """
    # A private scratch directory per call keeps concurrent requests from
    # overwriting each other's image and removes the file when we are done.
    with tempfile.TemporaryDirectory() as scratch_dir:
        image_path = os.path.join(scratch_dir, "equation.png")
        image.save(image_path)
        result = subprocess.run(
            ["pix2tex", image_path], capture_output=True, text=True
        )
    return result.stdout
def pdf_to_text(file):
    """Extract the text of a PDF and write it to a ``.txt`` file.

    Each page is read with ``PdfReader``. When a page yields no text, that
    single page is rendered with ``pdf2image`` and the rendering is passed to
    ``image_to_latex`` instead. The page text is normalised with
    ``clean_text``; pages with more than five words are written out under a
    ``## Metadata: Page Number N`` header.

    Args:
        file: The PDF to convert, either a filesystem path or an object
            whose ``name`` attribute is that path.

    Returns:
        The name of the text file that was written: the PDF's base name with
        a ``.txt`` extension, created in the current working directory.
    """
    pdf_path = file if isinstance(file, (str, os.PathLike)) else file.name

    page_chunks = []
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        for i, page in enumerate(reader.pages):
            page_number = i + 1
            page_text = page.extract_text()
            # Treat an empty or whitespace-only result like a missing one so
            # that image-only pages reach the fallback.
            if not (page_text or "").strip():
                # first_page/last_page are both set to this page so exactly
                # one page is rendered, not this page and the next.
                images = pdf2image.convert_from_path(
                    pdf_path, first_page=page_number, last_page=page_number
                )
                page_text = "\n".join(image_to_latex(image) for image in images)
            page_text = clean_text(page_text)
            # NOTE(review): the source's indentation was lost; this assumes
            # pages with five or fewer words are skipped entirely - confirm.
            if len(page_text.split()) > 5:
                page_chunks.append(
                    "## Metadata: Page Number " + str(page_number) + "\n"
                    + page_text + "\n\n"
                )

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII characters do not depend on the platform's
    # default encoding.
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write("".join(page_chunks))
    return output_file_name
# Gradio UI: one PDF upload in, one downloadable text file out.
# gr.File replaces the deprecated gr.inputs.File / gr.outputs.File namespaces.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=gr.File(label="Your PDF"),
    outputs=gr.File(label="Download TXT"),
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)
iface.launch()
|