|
import streamlit as st |
|
import io |
|
import requests |
|
import pdfplumber |
|
import os |
|
|
|
def fextractURL(pdf_path):
    """Download the PDF at *pdf_path* and return its text and table content.

    Every page's text layer is appended followed by a newline, then each
    table row is appended as a tab-separated line.  Works entirely in
    memory via io.BytesIO, so no temporary file is written (the previous
    implementation leaked ``downloaded_document.pdf`` on errors).

    Args:
        pdf_path: HTTP(S) URL of the PDF document.

    Returns:
        The extracted text as a single string.  On any download or parse
        error the error is reported through ``st.error`` and whatever was
        extracted so far (possibly ``""``) is returned.
    """
    parts = []
    try:
        # Timeout prevents the Streamlit worker from hanging forever;
        # raise_for_status stops us from feeding an HTML 404 page to
        # pdfplumber.
        response = requests.get(pdf_path, timeout=30)
        response.raise_for_status()

        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for pages with no text
                # layer (e.g. scanned images) -- guard against it.
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text + "\n")
                for table in page.extract_tables():
                    for row in table:
                        parts.append("\t".join(str(cell) for cell in row) + "\n")
    except Exception as e:
        # Best-effort UI reporting: surface the problem to the user and
        # fall through to return whatever was collected.
        st.error(f"An error occurred: {str(e)}")

    return "".join(parts)
|
|
|
|
|
# --- Page layout: vertical spacer, title, input widgets ------------------
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

# Rely on Streamlit's defaults instead of spelling out every keyword
# argument -- less noise and robust against upstream signature changes.
pdfURL = st.text_input(label="PDF URL")
button = st.button(label='Extract')
extractedText = st.empty()

if button:
    # Guard against an empty submission before making a network request.
    if not pdfURL.strip():
        st.warning("Please enter a PDF URL.")
    else:
        try:
            text = fextractURL(pdfURL)
            extractedText.text(text)
        except Exception as e:
            # fextractURL already reports its own errors; this catches
            # anything raised while rendering the result.
            st.error(f"An error occurred: {str(e)}")