"""Streamlit app that extracts the first table from an uploaded PDF with Camelot."""

import base64
import os
import subprocess
from subprocess import STDOUT, check_call

import camelot
import numpy as np
import pandas as pd
import streamlit as st

# Path the uploaded PDF is written to so Camelot can read it from disk.
INPUT_PDF_PATH = "input.pdf"


# Cached so the install is attempted once rather than on every script rerun.
# NOTE(review): ``st.cache`` is deprecated in newer Streamlit releases in
# favour of ``st.cache_resource`` -- confirm the targeted Streamlit version
# before switching.
@st.cache
def ghostscript():
    """Install ghostscript on the Linux machine via ``apt-get``.

    The command's output is discarded and its exit status is not checked, so
    a failed install does not raise here.
    """
    subprocess.run(
        "apt-get install -y ghostscript",
        shell=True,  # constant command string; no user input is interpolated
        stdin=None,
        stdout=subprocess.DEVNULL,  # avoids leaking an open os.devnull handle
        stderr=STDOUT,
        executable="/bin/bash",
        check=False,
    )


def show_pdf(file_path):
    """Render the "Uploaded PDF" section for the PDF stored at ``file_path``.

    NOTE(review): ``pdf_display`` is an empty string, so only the heading is
    rendered and ``base64_pdf`` is unused. The embedding markup was presumably
    lost from this literal -- confirm against the original source. This helper
    is not called anywhere in the visible script.
    """
    with open(file_path, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    pdf_display = f''
    st.markdown('## Uploaded PDF')
    st.markdown(pdf_display, unsafe_allow_html=True)


ghostscript()

# Page heading.
# NOTE(review): rendered with unsafe_allow_html=True, yet the string contains
# no markup -- the HTML tags were presumably lost; confirm.
html_temp = """

PDF Table Extractor WebApp

"""
st.markdown(html_temp, unsafe_allow_html=True)

# File uploader in the sidebar.
input_pdf = st.sidebar.file_uploader(label="Upload PDF files here", type='pdf')

# Everything below runs only once a PDF has been uploaded.
if input_pdf is not None:
    # Persist the upload to disk. getvalue() returns the whole buffer
    # regardless of the current stream position.
    with open(INPUT_PDF_PATH, "wb") as f:
        f.write(input_pdf.getvalue())

    # Read the PDF and parse it using the "stream" flavor.
    tables = camelot.read_pdf(
        INPUT_PDF_PATH,
        flavor='stream',
        split_text=True,
        layout_kwargs={'detect_vertical': True},
        backend='poppler',
    )

    if len(tables) == 0:
        # Indexing an empty result would raise IndexError.
        st.warning('No tables were found in the uploaded PDF.')
    else:
        # Work on a copy so the frame held by Camelot is not modified.
        df = tables[0].df.copy()

        # Map the 'Alerted' column to NO/YES: the exact value 'o' becomes
        # 'NO', every other value becomes 'YES'. Done as a single column
        # assignment rather than element-wise chained indexing, which pandas
        # does not guarantee to write through.
        # NOTE(review): the guard skips tables that have no column labelled
        # 'Alerted' instead of raising KeyError -- confirm that the extracted
        # frame actually carries this label for the PDFs of interest.
        if 'Alerted' in df.columns:
            df['Alerted'] = np.where(df['Alerted'] == 'o', 'NO', 'YES')

        st.sidebar.markdown('Extract tables from PDF')
        if st.sidebar.button('Extract Table'):
            st.markdown('## Extracted table from PDF')
            st.dataframe(df)

        st.sidebar.markdown('Download Extracted Table as CSV file')
        st.sidebar.download_button(
            "Download",
            df.to_csv(),
            file_name='extracted_table.csv',
            mime='text/csv',
        )