# pdf2text / app.py
# ShayanRl's picture
# Update app.py
# ffd8879 verified
import streamlit as st
import io
import requests
import pdfplumber
import os
def fextractURL(pdf_path):
    """Download the PDF at ``pdf_path`` and return its text and tables.

    The document is fetched with ``requests`` and parsed by ``pdfplumber``
    from an in-memory buffer, regardless of whether the URL ends in
    ``.pdf``. No file is written to disk, so there is nothing to clean up
    and concurrent sessions cannot clobber each other's download.

    Args:
        pdf_path: URL of the PDF document to fetch.

    Returns:
        A string with, for every page in order, the page text followed by a
        newline and then each row of each extracted table as tab-separated
        cells, one row per line. If an error occurs, whatever was extracted
        before the failure is returned (possibly the empty string).

    Errors are not raised to the caller: any exception is reported to the
    user through ``st.error``.
    """
    chunks = []
    try:
        # A timeout keeps a stalled server from hanging the session forever.
        response = requests.get(pdf_path, timeout=30)
        # Fail with a clear HTTP error instead of handing an error page to
        # the PDF parser.
        response.raise_for_status()

        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # extract_text() may yield None for a page without a text
                # layer (e.g. a scanned image); treat that as empty text
                # rather than aborting the remaining pages.
                chunks.append((page.extract_text() or "") + "\n")
                for table in page.extract_tables():
                    for row in table:
                        chunks.append(
                            "\t".join(str(cell) for cell in row) + "\n"
                        )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
    # Joined outside the try so partial results survive a mid-document error.
    return "".join(chunks)
# ---------------------------------------------------------------------------
# Page layout
# ---------------------------------------------------------------------------
# An empty padded <div>, rendered as raw HTML, used only as vertical spacing
# above the form.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

# ---------------------------------------------------------------------------
# Input widgets
# ---------------------------------------------------------------------------
# Text box for the URL of the PDF to process.
pdfURL = st.text_input(
    label="PDF URL",
    value="",
    max_chars=None,
    key=None,
    type="default",
    help=None,
    autocomplete=None,
    on_change=None,
    args=None,
    kwargs=None,
    placeholder=None,
    disabled=False,
    label_visibility="visible",
)

# Button that triggers the extraction.
button = st.button(
    label="Extract",
    key=None,
    help=None,
    on_click=None,
    args=None,
    kwargs=None,
    type="secondary",
    disabled=False,
    use_container_width=False,
)

# Placeholder that receives the extracted text once the button is pressed.
extractedText = st.empty()

# ---------------------------------------------------------------------------
# Action
# ---------------------------------------------------------------------------
if button:
    try:
        text = fextractURL(pdfURL)
        extractedText.text(text)
    except Exception as e:
        # Covers anything raised while extracting or rendering the result.
        st.error(f"An error occurred: {str(e)}")