Spaces:

ShayanRl
/

pdf2text

Running

File size: 2,448 Bytes

cadae78
 
 
 
ffd8879
cadae78
 
 
c45f030
ffd8879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cadae78
 
 
a673cfb
6ef59e8
ffd8879
cadae78
ffd8879
cadae78

import streamlit as st
import io
import requests
import pdfplumber
import os

def fextractURL(pdf_path, timeout=30):
    """Download the PDF at ``pdf_path`` and return its text and table content.

    The document is fetched over HTTP and parsed entirely in memory. For each
    page, the page text is emitted first (followed by a newline), then every
    row of every table found on the page as one tab-separated line.

    Args:
        pdf_path: URL of the PDF document. The URL does not need to end in
            ``.pdf``; every URL is handled the same way.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        The extracted text. If an error occurs, it is reported through
        ``st.error`` and whatever was extracted before the failure is
        returned (an empty string if nothing was).
    """
    parts = []

    try:
        # One code path for every URL: parsing from an in-memory buffer avoids
        # a fixed-name temporary file, which concurrent sessions could
        # overwrite and which was left on disk whenever parsing failed.
        response = requests.get(pdf_path, timeout=timeout)
        # Fail with a clear HTTP error instead of feeding an error page to the
        # PDF parser.
        response.raise_for_status()

        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # Guard against extract_text() returning None for a page with
                # no extractable text (seen in some pdfplumber versions), which
                # would otherwise abort the whole extraction with a TypeError.
                parts.append((page.extract_text() or "") + "\n")
                for table in page.extract_tables():
                    for row in table:
                        # Empty table cells come back as None; emit them as
                        # empty fields rather than the literal text "None".
                        cells = ("" if cell is None else str(cell) for cell in row)
                        parts.append("\t".join(cells) + "\n")
    except Exception as e:
        # UI boundary: surface any download/parse failure to the user and fall
        # through so the partial result is still returned.
        st.error(f"An error occurred: {str(e)}")

    return "".join(parts)


# --- Page layout -----------------------------------------------------------
# Raw HTML spacer that pushes the form down from the top of the page.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

# Only non-default arguments are passed; the previously spelled-out keyword
# arguments all matched Streamlit's documented defaults.
pdfURL = st.text_input(label="PDF URL", value="")
button = st.button(label='Extract')
# Placeholder that is filled with the extracted text once it is available.
extractedText = st.empty()

# --- Action ----------------------------------------------------------------
if button:
    # Pasted URLs often carry stray surrounding whitespace.
    url = pdfURL.strip()
    if not url:
        # Nothing to download: ask for input instead of surfacing a cryptic
        # "invalid URL" error from the HTTP layer.
        st.warning("Please enter a PDF URL.")
    else:
        try:
            text = fextractURL(url)
            extractedText.text(text)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")