|
import gradio as gr |
|
from PyPDF2 import PdfReader |
|
from concurrent.futures import ThreadPoolExecutor |
|
|
|
|
|
def convert_pdf_to_text(pdf_file):
    """Extract the text of one uploaded PDF and wrap it in a delimited section.

    Args:
        pdf_file: Object exposing a ``name`` attribute holding the path of
            the uploaded file on disk.

    Returns:
        A string made of a ``---`` separator line, a ``file name: <name>``
        header, the concatenated text of every page, and a closing ``---``
        separator line.

    Raises:
        ValueError: If the file name does not end in ``.pdf`` (checked
            case-insensitively).
    """
    path = pdf_file.name
    print(path)

    # Compare case-insensitively so uploads such as "REPORT.PDF" are accepted.
    if not path.lower().endswith(".pdf"):
        raise ValueError("Invalid file format. Please upload PDF files only.")

    with open(path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Extract while the file is still open so the reader never touches a
        # closed stream. ``or ""`` keeps the join safe if a page yields no
        # text object (NOTE(review): whether extract_text can return None
        # depends on the installed PyPDF2 version).
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    header = f"file name: {path}\n content: \n"
    return "".join(["\n---\n", header, *page_texts, "\n---\n"])
|
|
|
|
|
def pdf_to_text(pdf_files):
    """Convert several uploaded PDFs to text concurrently and join the results.

    Args:
        pdf_files: Iterable of uploaded file objects, each passed to
            ``convert_pdf_to_text``. May be ``None`` or empty when nothing
            was uploaded.

    Returns:
        The per-file text sections joined with a newline, in input order,
        or an empty string when there is nothing to convert.
    """
    # Nothing uploaded: return an empty result instead of letting
    # ``executor.map`` fail on a non-iterable ``None``.
    if not pdf_files:
        return ""

    with ThreadPoolExecutor() as executor:
        # ``map`` yields results in input order; materialising it here
        # surfaces any worker exception before the pool is torn down.
        sections = list(executor.map(convert_pdf_to_text, pdf_files))

    return "\n".join(sections)
|
|
|
|
|
# Gradio front end: a multi-file upload widget wired to ``pdf_to_text``,
# with the combined result shown as plain text.
# NOTE(review): ``gr.inputs`` appears to be a legacy namespace in newer
# Gradio releases; confirm the installed version still provides it.
iface = gr.Interface(
    title="PDF to Text Converter",
    description="Upload PDF files and get their content in text format.",
    fn=pdf_to_text,
    inputs=gr.inputs.File(
        file_count="multiple",
        type="file",
        label="Upload a PDF file",
    ),
    outputs="text",
)


if __name__ == "__main__":
    # Launch the interface only when run as a script, not on import.
    iface.launch()
|
|