Spaces:

ashcodes
/

pdf-table-extractor-tabula

Build error

App Files Files Community

ashcodes committed on Oct 25, 2022

Commit

5e09655

1 Parent(s): 77a3937

Upload 2 files

Browse files

Files changed (2) hide show

app.py +64 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import streamlit as st
+import numpy as np
+import pandas as pd
+import subprocess
+from subprocess import STDOUT, check_call
+import os
+import base64
+import camelot
+# to run this only once and it's cached
+@st.cache
+def ghostscript():
+    """install ghostscript on the linux machine"""
+    proc = subprocess.Popen('apt-get install -y ghostscript', shell=True, stdin=None, stdout=open(os.devnull,"wb"), stderr=STDOUT, executable="/bin/bash")
+    proc.wait()
+ghostscript()
+#heading
+html_temp = """
+    <div style="background-color:tomato;padding:10px">
+    <h2 style="color:white;text-align:center;">PDF Table Extractor WebApp </h2>
+    </div>
+    """
+st.markdown(html_temp,unsafe_allow_html=True)
+# file uploader on streamlit
+#st.sidebar.markdown('Upload PDF files')
+input_pdf = st.sidebar.file_uploader(label = "Upload PDF files here", type = 'pdf')
+# run this only when a PDF is uploaded
+if input_pdf is not None:
+    # byte object into a PDF file
+    with open("input.pdf", "wb") as f:
+        base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
+        f.write(base64.b64decode(base64_pdf))
+    f.close()
+#To print uploaded pdf
+def show_pdf(file_path):
+    with open(file_path,"rb") as f:
+        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+        pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf"></iframe>'
+    st.markdown('## Uploaded PDF')
+    st.markdown(pdf_display, unsafe_allow_html=True)
+#st.sidebar.markdown('Display Uploaded PDF')
+#if st.sidebar.button('Show'):
+    #show_pdf("input.pdf")
+# read the pdf and parse it using stream
+if input_pdf is not None:
+    table = camelot.read_pdf('input.pdf', flavor='stream',layout_kwargs={'detect_vertical':True},backend='poppler')
+    csv_table = table[0].df
+st.sidebar.markdown('Extract tables from PDF')
+if st.sidebar.button('Extract Table'):
+    st.markdown('## Extracted table from PDF')
+    st.dataframe(csv_table)
+if input_pdf is not None:
+    st.sidebar.markdown('Download Extracted Table as CSV file')
+    st.sidebar.download_button("Download",csv_table.to_csv(),file_name = 'extracted_table.csv', mime = 'text/csv')

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+numpy
+pandas
+opencv-python
+streamlit
+camelot-py