import streamlit as st
import numpy as np
import pandas as pd
import subprocess
from subprocess import STDOUT, check_call
import os
import base64
import camelot
# Cached by Streamlit so the install command is not re-run on every script rerun.
@st.cache
def ghostscript():
    """Install ghostscript on the Linux host via ``apt-get``.

    Runs ``apt-get install -y ghostscript`` and discards its output
    (stderr is merged into stdout, which goes to the null device).

    Returns:
        The exit status of ``apt-get``, or ``None`` if the command could not
        be launched at all (for example, ``apt-get`` is not on this system).
    """
    try:
        # Argument list instead of a shell string: no shell is needed for a
        # fixed command. subprocess.DEVNULL replaces a manually opened
        # os.devnull handle that was never closed.
        completed = subprocess.run(
            ["apt-get", "install", "-y", "ghostscript"],
            stdin=None,
            stdout=subprocess.DEVNULL,
            stderr=STDOUT,
            check=False,
        )
    except OSError:
        # Keep the install best-effort: a missing apt-get must not crash the app.
        return None
    return completed.returncode
# Run the (cached) ghostscript install before any upload is handled.
ghostscript()

# heading
html_temp = """
PDF Table Extractor WebApp
"""
st.markdown(html_temp, unsafe_allow_html=True)

# File uploader in the sidebar; the result is None until a PDF is uploaded.
input_pdf = st.sidebar.file_uploader(label="Upload PDF files here", type='pdf')

# Persist the upload to disk: the extraction step reads 'input.pdf' by path.
if input_pdf is not None:
    # getvalue() returns the full buffer regardless of the current read
    # position, so a rerun that reuses the same upload object still writes the
    # whole file (read() would return b"" once the buffer has been consumed).
    # The bytes are written as-is; a base64 encode/decode round-trip is a no-op.
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())
# Display the uploaded PDF inline on the page.
def show_pdf(file_path, width=700, height=1000):
    """Embed the PDF stored at *file_path* in the Streamlit page.

    The file is read as bytes, base64-encoded and placed in an ``<iframe>``
    as a ``data:application/pdf`` URI, rendered below an "Uploaded PDF"
    heading.

    Args:
        file_path: Path of the PDF file to display.
        width: Width of the embedded viewer, in pixels.
        height: Height of the embedded viewer, in pixels.
    """
    with open(file_path, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')

    # The encoded payload must actually be embedded in the markup; an empty
    # string here renders nothing.
    # NOTE(review): the iframe tag and default size are a reconstruction of
    # markup that appears to have been lost -- confirm against the intended UI.
    pdf_display = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        f'width="{width}" height="{height}" type="application/pdf"></iframe>'
    )

    st.markdown('## Uploaded PDF')
    # unsafe_allow_html is required, otherwise the iframe tag is escaped.
    st.markdown(pdf_display, unsafe_allow_html=True)
# Optional sidebar control to preview the uploaded PDF (currently disabled):
# st.sidebar.markdown('Display Uploaded PDF')
# if st.sidebar.button('Show'):
#     show_pdf("input.pdf")

# Parsing, display and download all need an uploaded PDF, so one guard
# covers the three steps.
if input_pdf is not None:
    # Read 'input.pdf' and parse it with camelot's stream flavor.
    tables = camelot.read_pdf(
        'input.pdf',
        flavor='stream',
        split_text=True,
        layout_kwargs={'detect_vertical': True},
        backend='poppler',
    )

    if len(tables) == 0:
        # Indexing tables[0] on an empty result would raise IndexError.
        st.warning('No tables were found in the uploaded PDF.')
    else:
        # Only the first detected table is used. Work on a copy so the
        # relabelling below does not modify camelot's own frame.
        df = tables[0].df.copy()

        # Relabel the 'Alerted' column: a cell equal to 'o' becomes 'NO',
        # every other value becomes 'YES'. The whole column is assigned at
        # once; per-cell chained assignment (df[col][i] = ...) is unreliable
        # and has no effect under pandas copy-on-write.
        # NOTE(review): the relabel is skipped when no column is labelled
        # 'Alerted' -- confirm the parsed table really carries that label.
        if 'Alerted' in df.columns:
            df['Alerted'] = [
                'NO' if value == 'o' else 'YES' for value in df['Alerted']
            ]

        st.sidebar.markdown('Extract tables from PDF')
        if st.sidebar.button('Extract Table'):
            st.markdown('## Extracted table from PDF')
            st.dataframe(df)

        st.sidebar.markdown('Download Extracted Table as CSV file')
        st.sidebar.download_button(
            "Download",
            df.to_csv(),
            file_name='extracted_table.csv',
            mime='text/csv',
        )