File size: 2,204 Bytes
d317fef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import streamlit as st
import numpy as np 
import pandas as pd 
import subprocess 
from subprocess import STDOUT, check_call 
import os 
import base64  
import camelot 

# Wrapped in st.cache so the install is attempted once rather than on every
# script rerun.
@st.cache
def ghostscript():
    """Install ghostscript on the Linux machine with apt-get.

    The command runs through /bin/bash and this function blocks until it
    finishes. All output is discarded and the exit status is deliberately
    not checked, so a failed install does not stop the app from starting.

    Returns:
        None
    """
    # subprocess.DEVNULL replaces a manually opened os.devnull handle that
    # was never closed.
    subprocess.run(
        'apt-get install -y ghostscript',
        shell=True,
        stdin=None,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        executable="/bin/bash",
        check=False,
    )


ghostscript()

# Page heading, rendered as raw HTML.
html_temp = """
    <div style="background-color:tomato;padding:10px">
    <h2 style="color:white;text-align:center;">PDF Table Extractor WebApp </h2>
    </div>
    """
st.markdown(html_temp, unsafe_allow_html=True)


# File uploader in the sidebar; input_pdf is None until a file is uploaded.
input_pdf = st.sidebar.file_uploader(label="Upload PDF files here", type='pdf')

# Persist the upload to disk only when a PDF has actually been uploaded.
if input_pdf is not None:
    # getvalue() returns the whole buffer regardless of the current read
    # position, so a rerun cannot write an empty or truncated file the way a
    # bare read() could. The bytes are written as-is; no encoding round trip
    # is needed.
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())

#To print uploaded pdf    
def show_pdf(file_path):
    """Embed the PDF stored at file_path in the page.

    The file is read as bytes, base64-encoded, and placed in an <iframe>
    as a data URI, under an "Uploaded PDF" heading.

    Args:
        file_path: Path of the PDF file to display.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    # The iframe source is a data URI, so the bytes must be base64 text.
    base64_pdf = base64.b64encode(raw_bytes).decode('utf-8')
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf"></iframe>'

    st.markdown('## Uploaded PDF')
    st.markdown(pdf_display, unsafe_allow_html=True)

#st.sidebar.markdown('Display Uploaded PDF')    
#if st.sidebar.button('Show'):
    #show_pdf("input.pdf")

# Parse the saved PDF with camelot's "stream" flavor and keep the first table.
# csv_table stays None until an upload has yielded at least one table, so the
# widgets below can report "nothing to show" instead of raising NameError
# (button clicked before any upload) or IndexError (no tables detected).
csv_table = None
if input_pdf is not None:
    table = camelot.read_pdf(
        'input.pdf',
        flavor='stream',
        layout_kwargs={'detect_vertical': True},
        backend='poppler',
    )
    if len(table) > 0:
        csv_table = table[0].df
    else:
        st.warning('No tables were found in the uploaded PDF.')

st.sidebar.markdown('Extract tables from PDF')
if st.sidebar.button('Extract Table'):
    if csv_table is None:
        st.warning('Upload a PDF containing a table before extracting.')
    else:
        st.markdown('## Extracted table from PDF')
        st.dataframe(csv_table)

# Offer the CSV download only when there is a table to export.
if csv_table is not None:
    st.sidebar.markdown('Download Extracted Table as CSV file')
    st.sidebar.download_button(
        "Download",
        csv_table.to_csv(),
        file_name='extracted_table.csv',
        mime='text/csv',
    )