Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pymupdf4llm | |
| import tempfile | |
| import pathlib | |
| import markdown2 | |
| from docx import Document | |
| from bs4 import BeautifulSoup | |
def pdf_to_markdown(pdf_file):
    """Convert an uploaded PDF to Markdown and save the result as ``Output.md``.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; in this app it
            is the value returned by ``st.file_uploader``.

    Returns:
        A ``(pdf_text, md_file_path)`` tuple: the Markdown string returned by
        ``pymupdf4llm.to_markdown`` and the ``pathlib.Path`` of the file it
        was written to. The path is the fixed relative name ``Output.md``, so
        each call overwrites the previous output in the working directory.
    """
    # The converter is handed a filesystem path, so the upload is spooled to
    # disk first. delete=False keeps the file around after the handle is
    # closed; it is removed by hand once the conversion is done.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(pdf_file.read())
        temp_file_path = temp_file.name

    # Convert only after the handle is closed so all bytes are flushed.
    try:
        pdf_text = pymupdf4llm.to_markdown(temp_file_path)
    finally:
        # Best-effort cleanup: a temp file that cannot be removed (already
        # gone, or still locked on some platforms) must not mask the result
        # or the original conversion error.
        try:
            pathlib.Path(temp_file_path).unlink()
        except OSError:
            pass

    # Save the Markdown content to a file (bytes, to avoid newline translation).
    md_file_path = pathlib.Path("Output.md")
    md_file_path.write_bytes(pdf_text.encode())
    return pdf_text, md_file_path
# HTML heading tags mapped to the python-docx heading level used for them.
_HEADING_LEVELS = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}


def _append_runs(paragraph, strings):
    """Add each string to *paragraph* as a run, bold/italic per its enclosing tags."""
    for text in strings:
        run = paragraph.add_run(str(text))
        if text.find_parent(["strong", "b"]) is not None:
            run.bold = True
        if text.find_parent(["em", "i"]) is not None:
            run.italic = True


def _add_list(doc, list_tag):
    """Add every item of an HTML list, nested items included, as a list paragraph."""
    for item in list_tag.find_all("li"):
        owner = item.find_parent(["ul", "ol"])
        is_numbered = owner is not None and owner.name == "ol"
        # Keep only the strings owned by this item; text of nested items is
        # emitted when the loop reaches those items themselves.
        own_text = "".join(s for s in item.strings if s.find_parent("li") is item)
        # Collapse the newlines/indentation the HTML carries between tags.
        own_text = " ".join(own_text.split())
        if own_text:
            doc.add_paragraph(
                own_text,
                style="List Number" if is_numbered else "List Bullet",
            )


def create_docx_from_markdown(md_content):
    """Render Markdown into a Word document saved as ``Output.docx``.

    The Markdown is converted to HTML with ``markdown2`` and the top-level
    HTML blocks are mapped onto python-docx content:

    * ``h1``-``h6`` become headings of the matching level;
    * ``p`` becomes a paragraph whose bold/italic spans are kept as runs;
    * ``ul``/``ol`` items become ``List Bullet``/``List Number`` paragraphs;
    * a bare top-level ``strong``/``em`` keeps its previous rendering
      (intense-quote paragraph / italic paragraph);
    * any other block with text (code blocks, blockquotes, ...) is added as a
      plain paragraph so that no content is silently dropped.

    Args:
        md_content: Markdown source text.

    Returns:
        ``pathlib.Path`` of the saved document. The path is the fixed
        relative name ``Output.docx``, so each call overwrites the last one.
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(md_content)

    # Create a new Document
    doc = Document()
    doc.add_heading('Converted PDF Content', level=1)

    # Use BeautifulSoup to parse the HTML and walk its top-level blocks
    soup = BeautifulSoup(html_content, "html.parser")

    for element in soup.children:
        tag = getattr(element, "name", None)
        if tag is None:
            # Bare strings between blocks (the newlines separating tags).
            continue
        if tag in _HEADING_LEVELS:
            doc.add_heading(element.get_text(), level=_HEADING_LEVELS[tag])
        elif tag == 'p':
            _append_runs(doc.add_paragraph(), element.strings)
        elif tag in ('ul', 'ol'):
            _add_list(doc, element)
        elif tag == 'strong':
            # Styles are looked up by name; the name contains a space.
            doc.add_paragraph(element.get_text(), style='Intense Quote')
        elif tag == 'em':
            doc.add_paragraph().add_run(element.get_text()).italic = True
        else:
            fallback_text = element.get_text().strip()
            if fallback_text:
                doc.add_paragraph(fallback_text)

    # Save the document
    docx_file_path = pathlib.Path("Output.docx")
    doc.save(docx_file_path)
    return docx_file_path
# Streamlit application: upload a PDF, preview the extracted Markdown, and
# offer the Markdown and Word versions for download.
st.title("📄 Structured PDF Data Extractor")
st.subheader("Upload a PDF file, preview the structured content, and download it if needed.")

# File uploader for PDF
pdf_input = st.file_uploader("Upload PDF", type="pdf")

if pdf_input is not None:
    # Convert to Markdown when the PDF is uploaded
    with st.spinner("Converting PDF to Markdown..."):
        try:
            pdf_text, md_file_path = pdf_to_markdown(pdf_input)

            # Display the Markdown content. The text is extracted from an
            # arbitrary uploaded PDF, so it is rendered as plain Markdown:
            # raw HTML embedded in the document must not reach the page.
            st.markdown("### Markdown Content Preview:")
            st.markdown(pdf_text)

            # Create a download button for the Markdown file
            st.markdown("### Download Markdown File:")
            with open(md_file_path, "rb") as file:
                st.download_button(
                    label="Download Markdown",
                    data=file,
                    file_name=md_file_path.name,
                    mime="text/markdown"
                )

            # Create the .docx file from rendered Markdown content
            docx_file_path = create_docx_from_markdown(pdf_text)

            # Create a download button for the .docx file
            st.markdown("### Download Word Document:")
            with open(docx_file_path, "rb") as file:
                st.download_button(
                    label="Download Word Document",
                    data=file,
                    file_name=docx_file_path.name,
                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                )
        except Exception as e:
            # Top-level UI boundary: report any conversion failure to the
            # user instead of letting the script crash.
            st.error(f"An error occurred during conversion: {e}")
# Add some styling to make it visually appealing.
# The page's only buttons are download buttons, which Streamlit renders inside
# a .stDownloadButton container, so the button rules target that container as
# well as .stButton.
st.markdown(
    """
<style>
body {
    font-family: 'Arial', sans-serif;
    line-height: 1.6;
    font-size: 16px;
    color: #333;
}
.stButton>button,
.stDownloadButton>button {
    background-color: #4CAF50; /* Green */
    border: none;
    color: white;
    padding: 10px 20px;
    text-align: center;
    text-decoration: none;
    display: inline-block;
    font-size: 16px;
    margin: 4px 2px;
    cursor: pointer;
    border-radius: 5px;
    transition: background-color 0.3s;
}
.stButton>button:hover,
.stDownloadButton>button:hover {
    background-color: #45a049; /* Darker green */
}
</style>
""",
    unsafe_allow_html=True
)