Spaces:
Build error
Build error
import streamlit as st | |
import numpy as np | |
import pandas as pd | |
import subprocess | |
from subprocess import STDOUT, check_call | |
import os | |
import base64 | |
import camelot | |
# Install Ghostscript for the PDF tooling used below.
# NOTE: nothing in this function caches the result; the installer is launched
# on every call.
def ghostscript():
    """Install ghostscript on the Linux machine via ``apt-get`` (best effort).

    Runs ``apt-get install -y ghostscript`` and blocks until it finishes.
    Installer output (stdout and stderr) is discarded and the exit status is
    not inspected, so a failed installation does not raise.

    Returns:
        None
    """
    try:
        subprocess.run(
            # An argv list avoids going through a shell for a fixed command.
            ['apt-get', 'install', '-y', 'ghostscript'],
            stdin=None,
            # DEVNULL replaces a manually opened os.devnull handle that was
            # never closed.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
            check=False,
        )
    except OSError:
        # apt-get is missing or cannot be executed. The shell-based version
        # silently produced a non-zero exit status in this situation, so the
        # installation stays best effort instead of crashing the app.
        pass
ghostscript()

# Page heading, rendered as raw HTML.
html_temp = """
<div style="background-color:tomato;padding:10px">
<h2 style="color:white;text-align:center;">PDF Table Extractor WebApp </h2>
</div>
"""
st.markdown(html_temp, unsafe_allow_html=True)

# File uploader in the sidebar; only PDF files are accepted.
input_pdf = st.sidebar.file_uploader(label="Upload PDF files here", type='pdf')

# Run this only when a PDF is uploaded.
if input_pdf is not None:
    # Persist the uploaded bytes as "input.pdf" so later steps can open the
    # document by path. The bytes are written as-is: encoding them to base64
    # and decoding them straight back yields the same bytes, so that round
    # trip is omitted. getvalue() returns the whole buffer regardless of the
    # current stream position.
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())
# Display a PDF stored on disk inside the page.
def show_pdf(file_path):
    """Embed the PDF at ``file_path`` in the page as an inline iframe.

    The file is read as bytes, base64-encoded, and placed in a ``data:`` URI
    that is used as the source of an 800x800 iframe. A "## Uploaded PDF"
    heading is rendered just before the iframe.

    Args:
        file_path: Path of the PDF file to display.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    encoded_pdf = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_markup = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="800" height="800" type="application/pdf"></iframe>'

    st.markdown('## Uploaded PDF')
    # Raw HTML has to be allowed explicitly for the iframe to be rendered.
    st.markdown(iframe_markup, unsafe_allow_html=True)
#st.sidebar.markdown('Display Uploaded PDF') | |
#if st.sidebar.button('Show'): | |
#show_pdf("input.pdf") | |
# Read the saved PDF, parse it using camelot's stream flavor, and offer the
# first detected table for on-page display and CSV download.
def _normalize_alerted(frame):
    """Rewrite the 'Alerted' column: 'o' becomes 'NO', anything else 'YES'.

    The whole column is replaced in a single assignment rather than cell by
    cell through chained indexing (``frame['Alerted'][i] = ...``), which
    pandas does not guarantee will write back to the frame.

    Args:
        frame: DataFrame of the extracted table; modified in place.

    Returns:
        The same DataFrame. A frame without an 'Alerted' column is returned
        unchanged instead of raising ``KeyError``.
    """
    # NOTE(review): camelot tables appear to come back with positional column
    # labels unless a header row is promoted -- confirm that an 'Alerted'
    # column is actually present for the PDFs this app targets.
    if 'Alerted' in frame.columns:
        # Presumably 'o' is what the parser leaves of a "No" cell -- TODO
        # confirm against a sample document.
        is_o = frame['Alerted'] == 'o'
        frame['Alerted'] = is_o.map({True: 'NO', False: 'YES'})
    return frame


# Everything below needs the parsed table, so it all lives under one guard;
# this keeps `df` defined wherever it is used.
if input_pdf is not None:
    table = camelot.read_pdf('input.pdf', flavor='stream', split_text=True, layout_kwargs={'detect_vertical': True}, backend='poppler')
    if len(table) == 0:
        # No table detected: indexing table[0] would raise IndexError.
        st.warning('No tables were found in the uploaded PDF.')
    else:
        df = _normalize_alerted(table[0].df)

        st.sidebar.markdown('Extract tables from PDF')
        if st.sidebar.button('Extract Table'):
            st.markdown('## Extracted table from PDF')
            st.dataframe(df)

        st.sidebar.markdown('Download Extracted Table as CSV file')
        st.sidebar.download_button("Download", df.to_csv(), file_name='extracted_table.csv', mime='text/csv')