Joe Hare committed on
Commit
ba05b5a
1 Parent(s): 4d331dd

add ml-based version of this

Browse files
Files changed (2) hide show
  1. app.py +42 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from transformers import pipeline
4
+ import base64
5
+
6
def upload_and_extract_text():
    """Show a PDF upload widget and run the uploaded bytes through a T5 pipeline.

    Returns:
        The ``generated_text`` field of the first pipeline result, or
        ``None`` when no file has been uploaded yet.
    """
    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if pdf_file is None:
        return None

    # NOTE(review): the raw PDF bytes are handed straight to a
    # text2text-generation pipeline. T5 is a text-to-text model, not an OCR
    # model, and these pipelines appear to expect ``str`` input -- confirm
    # this call works at all; a PDF text-extraction step is probably needed
    # before the model sees the content.
    # NOTE(review): the pipeline is constructed on every call that has an
    # upload, which presumably reloads the model each time -- verify.
    generator = pipeline("text2text-generation", model="google/t5-v1_1-large")
    raw_bytes = pdf_file.read()
    results = generator(raw_bytes, max_length=1024, do_sample=False)
    return results[0]["generated_text"]
13
+
14
def text_to_dataframe(text):
    """Build a one-column DataFrame with one row per newline-separated line.

    Args:
        text: The string to split on ``"\\n"``.

    Returns:
        A ``pandas.DataFrame`` with a single ``"Text"`` column. Because the
        split is on ``"\\n"`` only, an empty string yields one empty row and
        a trailing newline yields a trailing empty row.
    """
    rows = [[segment] for segment in text.split("\n")]
    return pd.DataFrame(rows, columns=["Text"])
21
+
22
def main():
    """Render the PDF-to-spreadsheet page.

    Shows the title and instructions, asks for a PDF upload, and -- once
    text has been extracted -- displays the text, a DataFrame preview, and
    a CSV download control. Returns ``None``; all output goes to the
    Streamlit page.
    """
    st.title("PDF to Spreadsheet Converter")
    st.write("Upload a PDF file to extract text and save it as a spreadsheet.")

    extracted_text = upload_and_extract_text()
    if extracted_text is None:
        # Nothing uploaded yet: leave only the title and uploader on screen.
        return

    st.write("### Extracted Text")
    st.write(extracted_text)

    df = text_to_dataframe(extracted_text)
    st.write("### Spreadsheet Preview")
    st.write(df)

    # Use Streamlit's native download widget instead of a hand-built
    # base64 data-URI link: it avoids unsafe_allow_html and serves the file
    # with the registered CSV media type ("text/csv", not "file/csv").
    st.download_button(
        label="Download Extracted Text as CSV",
        data=df.to_csv(index=False).encode("utf-8"),
        file_name="extracted_text.csv",
        mime="text/csv",
    )


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit==1.3.0
2
+ transformers==4.13.0
3
+ pandas==1.4.1