|
import streamlit as st |
|
import numpy as np |
|
import pandas as pd |
|
import subprocess |
|
from subprocess import STDOUT, check_call |
|
import os |
|
import base64 |
|
import camelot |
|
|
|
|
|
@st.cache
def ghostscript():
    """Install Ghostscript on the Linux host via ``apt-get``.

    Runs ``apt-get install -y ghostscript`` through ``/bin/bash``, discards
    the command's stdout and stderr, and blocks until the command exits.
    The exit status is not inspected, so a failed install passes silently.

    Returns:
        None.
    """
    # NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases
    # (``st.cache_resource`` is the suggested replacement) -- confirm against
    # the pinned Streamlit version before switching decorators.
    proc = subprocess.Popen(
        'apt-get install -y ghostscript',
        shell=True,
        stdin=None,
        # subprocess.DEVNULL replaces open(os.devnull, "wb"), which left an
        # open file object behind that nothing ever closed.
        stdout=subprocess.DEVNULL,
        stderr=STDOUT,
        executable="/bin/bash",
    )
    proc.wait()
|
|
|
# Run the Ghostscript installer once at script start.
ghostscript()

# Page banner: raw HTML, so it has to be rendered with unsafe_allow_html.
html_temp = """
<div style="background-color:tomato;padding:10px">
<h2 style="color:white;text-align:center;">PDF Table Extractor WebApp </h2>
</div>
"""
st.markdown(html_temp, unsafe_allow_html=True)

# Sidebar uploader restricted to PDF files. The rest of the script treats a
# value of None as "nothing uploaded yet".
input_pdf = st.sidebar.file_uploader(
    label="Upload PDF files here",
    type='pdf',
)
|
|
|
|
|
if input_pdf is not None:
    # Persist the upload to the fixed path "input.pdf" in the working
    # directory so it can be opened by file name afterwards.
    with open("input.pdf", "wb") as f:
        # Write the uploaded bytes directly: the previous base64
        # encode-then-decode round trip produced the same bytes while copying
        # the whole payload twice. getvalue() returns the full buffer
        # regardless of the stream's read position, so a script rerun cannot
        # end up writing an empty file from an already-consumed stream.
        # The ``with`` block closes the file; no explicit close() is needed.
        f.write(input_pdf.getvalue())
|
|
|
|
|
def show_pdf(file_path):
    """Embed the PDF stored at ``file_path`` in the page as an inline viewer.

    The file is read in binary mode, base64-encoded, and placed in the
    ``src`` attribute of an 800x800 ``<iframe>`` as a
    ``data:application/pdf`` URI, rendered under an "Uploaded PDF" heading.

    Args:
        file_path: Path of the PDF file to display.
    """
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    # The iframe markup is raw HTML, so it must be rendered with
    # unsafe_allow_html enabled.
    iframe_html = (
        f'<iframe src="data:application/pdf;base64,{encoded}" '
        'width="800" height="800" type="application/pdf"></iframe>'
    )

    st.markdown('## Uploaded PDF')
    st.markdown(iframe_html, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
if input_pdf is not None:
    # Parse the saved upload with camelot's "stream" flavor.
    table = camelot.read_pdf(
        'input.pdf',
        flavor='stream',
        layout_kwargs={'detect_vertical': True},
        backend='poppler',
    )

    if len(table) == 0:
        # Nothing was detected: indexing table[0] would raise IndexError and
        # the download button would have no data to offer, so report it to
        # the user instead of crashing the app.
        st.warning('No tables were detected in the uploaded PDF.')
    else:
        # Only the first detected table is shown and offered for download.
        csv_table = table[0].df

        st.sidebar.markdown('Extract tables from PDF')
        if st.sidebar.button('Extract Table'):
            st.markdown('## Extracted table from PDF')
            st.dataframe(csv_table)

        st.sidebar.markdown('Download Extracted Table as CSV file')
        st.sidebar.download_button(
            "Download",
            csv_table.to_csv(),
            file_name='extracted_table.csv',
            mime='text/csv',
        )