# pdf-to-excel / app.py
# Joe Hare
# add ml-based version of this
# ba05b5a
import streamlit as st
import pandas as pd
from transformers import pipeline
import base64
def upload_and_extract_text():
    """Prompt for a PDF upload and run the text2text model over its contents.

    Renders a Streamlit file uploader restricted to the ``pdf`` type. Once a
    file is available, its bytes are passed to a Hugging Face
    ``text2text-generation`` pipeline (``google/t5-v1_1-large``) with
    ``max_length=1024`` and sampling disabled, and the ``generated_text`` of
    the first result is returned.

    Returns:
        The generated text, or ``None`` when no file has been uploaded yet.
    """
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if uploaded_file is None:
        # Return an explicit None so the caller's ``is not None`` check works
        # instead of hitting an unbound local on the first page render.
        return None

    # NOTE: the pipeline is constructed on every call that has an upload.
    ocr_pipeline = pipeline("text2text-generation", model="google/t5-v1_1-large")

    # NOTE(review): the pipeline is handed the PDF's raw bytes, not text
    # extracted from it. A text2text-generation pipeline presumably expects
    # ``str`` input, and nothing here parses the PDF format -- confirm, and
    # add a real PDF text-extraction step before the model call.
    pdf_bytes = uploaded_file.read()
    outputs = ocr_pipeline(pdf_bytes, max_length=1024, do_sample=False)
    return outputs[0]["generated_text"]
def text_to_dataframe(text: str) -> pd.DataFrame:
    """Return a one-column DataFrame holding one row per line of *text*.

    Lines are separated with ``str.splitlines()``, so ``\\n``, ``\\r\\n`` and
    the other line boundaries Python recognises are all handled and no
    terminator characters leak into the cell values. A trailing line break
    does not produce an extra empty row; blank lines inside the text are kept.

    Args:
        text: The text to break into rows.

    Returns:
        A DataFrame with a single ``"Text"`` column. It has zero rows when
        *text* is empty.
    """
    lines = text.splitlines()
    return pd.DataFrame({"Text": lines})
def main():
    """Render the page: upload a PDF, show the extracted text, offer a CSV.

    Shows the title and instructions, then calls ``upload_and_extract_text``.
    If that yields text, the text is displayed, converted with
    ``text_to_dataframe``, previewed, and exposed as a download link to
    ``extracted_text.csv``. Nothing further is rendered until text is
    available.
    """
    st.title("PDF to Spreadsheet Converter")
    st.write("Upload a PDF file to extract text and save it as a spreadsheet.")

    extracted_text = upload_and_extract_text()
    if extracted_text is None:
        return

    st.write("### Extracted Text")
    st.write(extracted_text)

    df = text_to_dataframe(extracted_text)
    st.write("### Spreadsheet Preview")
    st.write(df)

    # Embed the CSV in the page as a base64 data URI so the anchor's
    # ``download`` attribute can save it directly from the rendered HTML.
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # ``text/csv`` is the registered media type for CSV (RFC 4180);
    # the previous ``file/csv`` is not a registered type.
    href = (
        f'<a href="data:text/csv;base64,{b64}" '
        'download="extracted_text.csv">Download Extracted Text as CSV</a>'
    )
    st.markdown(href, unsafe_allow_html=True)
# Run the app only when this file is executed as the main module.
if __name__ == "__main__":
    main()