pdftotext / app.py
HAOUARI Noureddine
test
119e740
raw
history blame
1.31 kB
import gradio as gr
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor
def convert_pdf_to_text(pdf_file):
    """Extract the text of one PDF and wrap it between ``---`` delimiter lines.

    Args:
        pdf_file: Object exposing a ``name`` attribute that holds the path of
            the PDF on disk (presumably the temp-file wrapper handed over by
            the Gradio upload widget; confirm against the caller).

    Returns:
        A string made of a ``---`` delimiter line, a ``file name: ...`` line,
        a `` content: `` line, the text of every page concatenated in page
        order, and a closing ``---`` delimiter line.

    Raises:
        ValueError: If the file name does not end in ``.pdf`` (the check is
            case-insensitive, so ``.PDF`` is accepted as well).
    """
    path = pdf_file.name
    print(path)

    # Compare case-insensitively: "REPORT.PDF" is still a PDF upload.
    if not path.lower().endswith(".pdf"):
        raise ValueError("Invalid file format. Please upload PDF files only.")

    with open(path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Pull the text out while the file handle is still open.
        # `or ""` is defensive: a page that yields no text must not break
        # the join below.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Assemble the result in one pass instead of repeated `+=`.
    return "".join(
        [
            "\n---\n",
            f"file name: {path}\n content: \n",
            *page_texts,
            "\n---\n",
        ]
    )
def pdf_to_text(pdf_files):
    """Convert several PDFs to text in parallel and join the results.

    Args:
        pdf_files: Iterable of file objects accepted by
            ``convert_pdf_to_text``. May be ``None`` or empty when nothing
            was submitted.

    Returns:
        The per-file texts joined with a newline, in the same order as
        ``pdf_files``; an empty string when there is nothing to convert.

    Raises:
        ValueError: Propagated from ``convert_pdf_to_text`` when one of the
            files is not a PDF.
    """
    # Nothing submitted: skip the thread pool and avoid iterating over None.
    if not pdf_files:
        return ""

    # One task per file; `map` yields results in input order.
    with ThreadPoolExecutor() as executor:
        # Materialise inside the `with` so worker errors surface here.
        results = list(executor.map(convert_pdf_to_text, pdf_files))

    return "\n".join(results)
# Upload widget that accepts several files in a single submission.
# NOTE(review): the `gr.inputs` namespace is reported as deprecated in newer
# Gradio releases — confirm against the Gradio version this app is pinned to.
_pdf_upload = gr.inputs.File(
    file_count="multiple",
    label="Upload a PDF file",
    type="file",
)

# Web UI: uploaded files are passed to pdf_to_text and the returned string is
# shown in a plain text output.
iface = gr.Interface(
    title="PDF to Text Converter",
    description="Upload PDF files and get their content in text format.",
    fn=pdf_to_text,
    inputs=_pdf_upload,
    outputs="text",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()