Spaces:

Md919191
/

pdf_to_excel_app

Running

App Files Community

Md919191 committed 29 days ago

Commit

f3f2c2f

verified ·

1 Parent(s): a0dcf2b

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -47

app.py CHANGED Viewed

@@ -1,52 +1,53 @@
 import streamlit as st
-import pandas as pd
 import pdfplumber
-import os
-# Function to extract text from a PDF file
-def extract_text_from_pdf(pdf_file):
     with pdfplumber.open(pdf_file) as pdf:
-        text = ""
         for page in pdf.pages:
-            text += page.extract_text()
-        return text
-# Function to convert extracted text to Excel
-def convert_text_to_excel(text, output_file):
-    rows = text.split("\n")
-    data = [row.split() for row in rows]
-    df = pd.DataFrame(data)
-    df.to_excel(output_file, index=False)
-# Main function to build the Streamlit app
-def main():
-    st.title("PDF to Excel Converter")
-    # File uploader
-    uploaded_pdf = st.file_uploader("Upload a PDF file", type="pdf")
-    # Check if a file has been uploaded
-    if uploaded_pdf:
-        # Extract text from the PDF
-        text = extract_text_from_pdf(uploaded_pdf)
-        st.success("Text extracted from PDF!")
-        # Display the extracted text
-        st.text_area("Extracted Text", text, height=300)
-        # Button to convert and download Excel
-        if st.button("Convert to Excel"):
-            output_file = "converted_file.xlsx"
-            convert_text_to_excel(text, output_file)
-            with open(output_file, "rb") as f:
-                st.download_button(
-                    label="Download Excel File",
-                    data=f,
-                    file_name="converted_file.xlsx",
-                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-                )
-            os.remove(output_file)
-# Entry point of the script
-if __name__ == "__main__":
-    main()

 import streamlit as st
 import pdfplumber
+import pandas as pd
+from io import BytesIO
+def extract_tables_from_pdf(pdf_file):
+    tables = []
     with pdfplumber.open(pdf_file) as pdf:
         for page in pdf.pages:
+            extracted_tables = page.extract_tables()
+            for table in extracted_tables:
+                tables.append(pd.DataFrame(table))
+    if not tables:
+        return None
+    # Concatenate all tables into one DataFrame
+    final_df = pd.concat(tables, ignore_index=True)
+    # Set first row as column headers (if applicable)
+    final_df.columns = final_df.iloc[0]  # First row as headers
+    final_df = final_df[1:].reset_index(drop=True)
+    return final_df
+def convert_to_excel(dataframe):
+    output = BytesIO()
+    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
+        dataframe.to_excel(writer, index=False, sheet_name='Sheet1')
+    output.seek(0)
+    return output
+# Streamlit UI
+st.title("PDF to Excel Converter")
+st.write("Upload a PDF file with tabular data, and it will be converted into an Excel file.")
+uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+if uploaded_file is not None:
+    with st.spinner("Extracting tables from PDF..."):
+        df = extract_tables_from_pdf(uploaded_file)
+        if df is not None:
+            st.write("### Extracted Table Preview")
+            st.dataframe(df)
+            excel_data = convert_to_excel(df)
+            st.download_button(label="Download Excel File",
+                               data=excel_data,
+                               file_name="converted.xlsx",
+                               mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+        else:
+            st.error("No tables found in the PDF.")