chatBotBasket / pdfToMarkdown.py
aleegr10's picture
subir proyecto
b928387
raw
history blame contribute delete
No virus
1.38 kB
import fitz # PyMuPDF
from bs4 import BeautifulSoup
import io
import re
import requests
import fitz # PyMuPDF
from PyPDF2 import PdfReader, PdfWriter
def pdf_to_markdown(pdf_path: str) -> str:
    """Download a PDF over HTTP and render its text as simple Markdown.

    The document is fetched into memory, read with PyPDF2, and each page's
    extracted text is emitted under a ``# Page N`` heading (1-based).

    Args:
        pdf_path: URL of the PDF. Despite the name, this value is passed to
            ``requests.get`` as the ``url``, so it is not a local file path.

    Returns:
        The concatenated Markdown for every page; an empty string when the
        document has no pages.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On other request failures (invalid URL,
            connection error, timeout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Windows; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36'}
    response = requests.get(url=pdf_path, headers=headers, timeout=120)
    # Fail here on an HTTP error instead of handing an error page to the PDF
    # parser, where it would surface as a confusing parse failure.
    response.raise_for_status()

    # Parse straight from memory; nothing is written to disk.
    pdf_document = PdfReader(io.BytesIO(response.content))

    sections = []
    for page_number, page in enumerate(pdf_document.pages, start=1):
        # Called without arguments: the former "text" positional argument
        # looks like a leftover from PyMuPDF's get_text("text") and is not an
        # extraction mode in PyPDF2. `or ""` guards against a falsy result.
        text = page.extract_text() or ""

        # Process the text (customize this part based on your needs).
        # NOTE(review): the extracted text is plain text, not HTML; this pass
        # is kept to preserve existing output, but it will drop anything that
        # resembles a tag and decode HTML entities - confirm it is wanted.
        soup = BeautifulSoup(text, "html.parser")
        formatted_text = soup.get_text(separator="\n")

        sections.append(f"\n# Page {page_number}\n\n{formatted_text}\n")

    return "".join(sections)
# Example usage: run this file directly, optionally passing the PDF URL as the
# first command-line argument. The guard keeps importing this module free of
# network and file-system side effects.
if __name__ == "__main__":
    import sys

    # Fall back to the hard-coded value when no argument is supplied.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else ""
    if not pdf_path:
        # An empty URL can only fail inside the HTTP client; stop with a clear
        # message instead.
        raise SystemExit("usage: pdfToMarkdown.py <pdf-url>")

    markdown_content = pdf_to_markdown(pdf_path)

    # Save the Markdown content to a file
    with open("output.md", "w", encoding="utf-8") as markdown_file:
        markdown_file.write(markdown_content)