ashcodes committed on
Commit
5e09655
1 Parent(s): 77a3937

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +64 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import subprocess
5
+ from subprocess import STDOUT, check_call
6
+ import os
7
+ import base64
8
+ import camelot
9
+
10
# Cached so the install is attempted once instead of on every script rerun.
# NOTE(review): newer Streamlit releases deprecate st.cache in favour of
# st.cache_resource / st.cache_data -- confirm against the deployed version.
@st.cache
def ghostscript():
    """Install Ghostscript on the Linux host via ``apt-get`` (best effort).

    All output from ``apt-get`` is discarded and its exit status is ignored,
    so a failed install does not stop the app from starting.

    Returns:
        None.
    """
    try:
        # An argument list avoids spawning a shell, and subprocess.DEVNULL
        # avoids leaking a manually opened handle to os.devnull.
        subprocess.run(
            ["apt-get", "install", "-y", "ghostscript"],
            stdin=None,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
            check=False,
        )
    except OSError:
        # apt-get is missing or not executable (e.g. a non-Debian host).
        # The install is deliberately best effort, so carry on without it.
        pass
16
+
17
# Attempt the Ghostscript install before any of the page is built.
ghostscript()

# Page banner, injected as raw HTML so the inline styling is honoured.
html_temp = """
<div style="background-color:tomato;padding:10px">
<h2 style="color:white;text-align:center;">PDF Table Extractor WebApp </h2>
</div>
"""
st.markdown(html_temp, unsafe_allow_html=True)

# Sidebar uploader restricted to the 'pdf' file type.
input_pdf = st.sidebar.file_uploader(
    label="Upload PDF files here",
    type='pdf',
)
31
+
32
# Only runs once a PDF has been uploaded.
if input_pdf is not None:
    # Persist the upload as "input.pdf" so later steps can open it by path.
    # getvalue() returns the complete buffer regardless of the current read
    # position, and the bytes are written as-is (no base64 round trip).
    # The with-block closes the file, so no explicit close() is needed.
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())
39
+
40
# Inline preview of a PDF stored on disk.
def show_pdf(file_path):
    """Render the PDF at ``file_path`` inline on the page.

    The file is read as bytes, base64-encoded, and embedded as a data URI in
    an 800x800 ``<iframe>`` that is emitted through ``st.markdown`` beneath an
    "Uploaded PDF" heading.

    Args:
        file_path: Path of the PDF file to display.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    encoded_pdf = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_html = (
        f'<iframe src="data:application/pdf;base64,{encoded_pdf}" '
        'width="800" height="800" type="application/pdf"></iframe>'
    )

    st.markdown('## Uploaded PDF')
    st.markdown(iframe_html, unsafe_allow_html=True)
47
+
48
+ #st.sidebar.markdown('Display Uploaded PDF')
49
+ #if st.sidebar.button('Show'):
50
+ #show_pdf("input.pdf")
51
+
52
# Parse the saved upload with Camelot's "stream" flavor and keep the first
# detected table. csv_table stays None when there is nothing to show, so the
# widgets below never touch an undefined name.
csv_table = None
if input_pdf is not None:
    table = camelot.read_pdf(
        'input.pdf',
        flavor='stream',
        layout_kwargs={'detect_vertical': True},
        backend='poppler',
    )
    # Indexing an empty result with [0] would raise IndexError, so check first.
    if len(table) > 0:
        csv_table = table[0].df
    else:
        st.warning('No tables were detected in the uploaded PDF.')

st.sidebar.markdown('Extract tables from PDF')
if st.sidebar.button('Extract Table'):
    if csv_table is not None:
        st.markdown('## Extracted table from PDF')
        st.dataframe(csv_table)
    elif input_pdf is None:
        # Button pressed before any upload: explain instead of crashing.
        st.warning('Please upload a PDF file first.')

# Offer the CSV download only when a table was actually extracted.
if csv_table is not None:
    st.sidebar.markdown('Download Extracted Table as CSV file')
    st.sidebar.download_button(
        "Download",
        csv_table.to_csv(),
        file_name='extracted_table.csv',
        mime='text/csv',
    )
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ opencv-python
4
+ streamlit
5
+ camelot-py