Md919191 committed on
Commit
f3f2c2f
·
verified ·
1 Parent(s): a0dcf2b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -47
app.py CHANGED
@@ -1,52 +1,53 @@
1
  import streamlit as st
2
- import pandas as pd
3
  import pdfplumber
4
- import os
 
5
 
6
- # Function to extract text from a PDF file
7
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_file: The PDF to read; passed straight to ``pdfplumber.open``
            (the app hands in the file returned by ``st.file_uploader``).

    Returns:
        The text of all pages that produced any text, joined with newlines.
        An empty string is returned when no page produced text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # A page without extractable text can come back as None or "" --
        # NOTE(review): the None case depends on the pdfplumber version.
        # Skipping such pages avoids `str + None` and stray blank rows.
        # Joining with "\n" keeps the last line of one page from being fused
        # with the first line of the next page.
        return "\n".join(text for text in page_texts if text)
13
-
14
- # Function to convert extracted text to Excel
15
def convert_text_to_excel(text, output_file):
    """Tabulate whitespace-separated text and save it as an Excel workbook.

    Every line of ``text`` becomes one spreadsheet row, and every
    whitespace-delimited token on that line becomes one cell.

    Args:
        text: The text to tabulate.
        output_file: Destination handed to ``DataFrame.to_excel``.
    """
    cells_by_line = list(map(str.split, text.split("\n")))
    pd.DataFrame(cells_by_line).to_excel(output_file, index=False)
20
-
21
- # Main function to build the Streamlit app
22
def main():
    """Render the Streamlit page: upload a PDF, preview its text, offer Excel.

    The page shows a file uploader; once a PDF is supplied its text is
    extracted and displayed, and a "Convert to Excel" button produces a
    download button for the generated workbook.
    """
    # Local import so this function does not depend on the file's import block.
    from io import BytesIO

    st.title("PDF to Excel Converter")

    # File uploader
    uploaded_pdf = st.file_uploader("Upload a PDF file", type="pdf")
    if not uploaded_pdf:
        return

    # Extract and display the text of the uploaded PDF
    text = extract_text_from_pdf(uploaded_pdf)
    st.success("Text extracted from PDF!")
    st.text_area("Extracted Text", text, height=300)

    # Button to convert and download Excel
    if st.button("Convert to Excel"):
        # Build the workbook in memory instead of a fixed on-disk file name:
        # a shared "converted_file.xlsx" would be overwritten by concurrent
        # sessions and left behind whenever a later step raised before the
        # cleanup ran.
        workbook = BytesIO()
        convert_text_to_excel(text, workbook)
        st.download_button(
            label="Download Excel File",
            data=workbook.getvalue(),
            file_name="converted_file.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )


# Entry point of the script
if __name__ == "__main__":
    main()
 
 
 
1
  import streamlit as st
 
2
  import pdfplumber
3
+ import pandas as pd
4
+ from io import BytesIO
5
 
6
def _unique_headers(first_row):
    """Turn a table's first row into unique, non-blank string column labels."""
    labels = []
    used = set()
    for position, cell in enumerate(first_row, start=1):
        text = "" if pd.isna(cell) else str(cell)
        # Blank or missing header cells get a positional placeholder.
        base = text if text.strip() else f"Column {position}"
        label = base
        suffix = 1
        while label in used:
            suffix += 1
            label = f"{base}_{suffix}"
        used.add(label)
        labels.append(label)
    return labels


def extract_tables_from_pdf(pdf_file):
    """Collect every table found in a PDF into a single DataFrame.

    Tables from all pages are stacked in page order. The first row of the
    stacked result is used as the column headers and removed from the data.

    Args:
        pdf_file: The PDF to read; passed straight to ``pdfplumber.open``.

    Returns:
        A DataFrame with the combined table rows, or ``None`` when the PDF
        contains no table rows at all.
    """
    tables = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                frame = pd.DataFrame(table)
                # Ignore tables without any cells; they would otherwise make
                # the header lookup below fail.
                if not frame.empty:
                    tables.append(frame)

    if not tables:
        return None

    # Concatenate all tables into one DataFrame
    final_df = pd.concat(tables, ignore_index=True)

    # Set first row as column headers. A plain list is assigned (not the row
    # Series) so the row label does not become the name of the columns index,
    # and labels are made non-blank and unique so the preview and the export
    # never receive null or duplicate column names.
    final_df.columns = _unique_headers(final_df.iloc[0])
    return final_df[1:].reset_index(drop=True)
25
+
26
def convert_to_excel(dataframe):
    """Serialize a DataFrame to an in-memory Excel workbook.

    Args:
        dataframe: The DataFrame to write; it goes to a sheet named
            ``Sheet1`` without the index column.

    Returns:
        A ``BytesIO`` holding the workbook, rewound to the start so it can be
        read immediately.
    """
    buffer = BytesIO()
    writer = pd.ExcelWriter(buffer, engine='xlsxwriter')
    # Leaving the context closes the writer, which flushes the workbook
    # into the buffer.
    with writer:
        dataframe.to_excel(writer, sheet_name='Sheet1', index=False)
    buffer.seek(0)
    return buffer
32
+
33
# Streamlit UI: page heading and instructions
st.title("PDF to Excel Converter")
st.write("Upload a PDF file with tabular data, and it will be converted into an Excel file.")

uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

# Nothing below runs until a file has been supplied.
if uploaded_file is not None:
    with st.spinner("Extracting tables from PDF..."):
        df = extract_tables_from_pdf(uploaded_file)

    if df is None:
        # The extractor signals "no tables" by returning None.
        st.error("No tables found in the PDF.")
    else:
        st.write("### Extracted Table Preview")
        st.dataframe(df)

        # Offer the extracted table as a downloadable workbook.
        excel_data = convert_to_excel(df)
        st.download_button(
            label="Download Excel File",
            data=excel_data,
            file_name="converted.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )