Spaces:

ShayanRl
/

pdf2text

Running

App Files Community

pdf2text / app.py

ShayanRl

Update app.py

ffd8879 verified about 1 year ago

raw

history blame contribute delete

2.45 kB

	import streamlit as st
	import io
	import requests
	import pdfplumber
	import os

	def fextractURL(pdf_path, timeout=30):
	    """Download the PDF at *pdf_path* and return its text and table contents.

	    The document is fetched with ``requests`` and parsed entirely in memory
	    with ``pdfplumber``; nothing is written to disk, so concurrent sessions
	    cannot clobber each other's download and no temporary file can be left
	    behind when parsing fails.

	    For every page the extracted text is emitted first (followed by a
	    newline), then each table row as tab-separated cells, one row per line.
	    Empty table cells (``None``) are rendered as empty strings.

	    Args:
	        pdf_path: URL of the PDF document. It is fetched the same way
	            whether or not it ends with ``.pdf``.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The concatenated text of the document. If an error occurs, it is
	        reported through ``st.error`` and whatever was extracted before the
	        failure (possibly the empty string) is returned.
	    """
	    parts = []

	    try:
	        # A timeout keeps a stalled server from hanging the app indefinitely.
	        response = requests.get(pdf_path, timeout=timeout)
	        # Surface HTTP failures (404, 500, ...) directly instead of handing an
	        # error page to the PDF parser.
	        response.raise_for_status()

	        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
	            for page in pdf.pages:
	                # Guard against a page with no extractable text yielding None
	                # (behaviour varies by pdfplumber version - TODO confirm);
	                # concatenating None would abort the whole extraction.
	                parts.append((page.extract_text() or "") + "\n")
	                for table in page.extract_tables():
	                    for row in table:
	                        cells = ("" if cell is None else str(cell) for cell in row)
	                        parts.append("\t".join(cells) + "\n")
	    except Exception as e:
	        # UI boundary: show the problem to the user and fall through so the
	        # partial result collected so far is still returned.
	        st.error(f"An error occurred: {str(e)}")

	    return "".join(parts)


	# Push the UI down from the top of the page with an empty padded block.
	vert_space = '<div style="padding: 3rem 1rem;"></div>'
	st.markdown(vert_space, unsafe_allow_html=True)
	st.write("Extract full text from PDF URL")

	# Only the label is passed: every other argument previously spelled out
	# merely restated Streamlit's defaults.
	pdfURL = st.text_input(label="PDF URL")
	button = st.button(label='Extract')
	# Placeholder that is filled with the extracted text once it is available.
	extractedText = st.empty()

	if button:
	    # Pasted URLs often carry stray whitespace; a blank value would only
	    # produce a cryptic "No scheme supplied" error from requests.
	    url = pdfURL.strip()
	    if not url:
	        st.warning("Please enter a PDF URL.")
	    else:
	        try:
	            text = fextractURL(url)
	            extractedText.text(text)
	        except Exception as e:
	            # Top-level boundary: report anything unexpected in the UI
	            # rather than showing a raw traceback.
	            st.error(f"An error occurred: {str(e)}")