Spaces:
Sleeping
Sleeping
import fitz # PyMuPDF | |
from bs4 import BeautifulSoup | |
import io | |
import re | |
import requests | |
import fitz # PyMuPDF | |
from PyPDF2 import PdfReader, PdfWriter | |
def pdf_to_markdown(pdf_path): | |
headers = { | |
'User-Agent': 'Mozilla/5.0 (X11; Windows; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36'} | |
response = requests.get(url=pdf_path, headers=headers, timeout=120) | |
on_fly_mem_obj = io.BytesIO(response.content) | |
pdf_document = PdfReader(on_fly_mem_obj) | |
# Initialize the Markdown content | |
markdown_content = "" | |
# Iterate through each page of the PDF | |
for page_number in range(len(pdf_document.pages)): | |
page = pdf_document.pages[page_number] | |
# Extract text from the page | |
text = page.extract_text("text") | |
# Process the text (you can customize this part based on your needs) | |
soup = BeautifulSoup(text, "html.parser") | |
formatted_text = soup.get_text(separator="\n") | |
# Append the processed text to the Markdown content | |
markdown_content += f"\n# Page {page_number + 1}\n\n{formatted_text}\n" | |
return markdown_content | |
# Example usage | |
pdf_path = "" | |
markdown_content = pdf_to_markdown(pdf_path) | |
# Save the Markdown content to a file | |
with open("output.md", "w", encoding="utf-8") as markdown_file: | |
markdown_file.write(markdown_content) |