Spaces:

aleegr10
/

chatBotBasket

Sleeping

File size: 1,376 Bytes

b928387

import fitz  # PyMuPDF
from bs4 import BeautifulSoup
import io
import re
import requests
import fitz  # PyMuPDF
from PyPDF2 import PdfReader, PdfWriter


def pdf_to_markdown(pdf_path: str) -> str:
    """Download a PDF and return its text as Markdown, one section per page.

    Args:
        pdf_path: URL of the PDF to convert. Despite the name, the value is
            passed to ``requests.get``, so a local filesystem path will not
            work.

    Returns:
        The extracted text of every page, each preceded by a ``# Page N``
        heading (1-based). An empty string if the document has no pages.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other download failures (invalid
            URL, timeout, connection error).
    """
    # Browser-like User-Agent; presumably needed for servers that reject the
    # default client identification -- confirm before changing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Windows; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36'}

    response = requests.get(url=pdf_path, headers=headers, timeout=120)
    # Fail on HTTP errors here instead of handing an error page to the PDF
    # parser, which would only produce a confusing parse failure.
    response.raise_for_status()

    # Parse straight from memory; nothing is written to disk.
    pdf_document = PdfReader(io.BytesIO(response.content))

    sections = []
    for page_number, page in enumerate(pdf_document.pages, start=1):
        # PyPDF2's extract_text() has no output-mode argument (that is the
        # PyMuPDF get_text("text") API); a positional string is taken as the
        # legacy Tj_sep separator, so it must not be passed.
        text = page.extract_text()

        # Kept from the original pipeline: run the text through the HTML
        # parser and read it back as plain text.
        formatted_text = BeautifulSoup(text, "html.parser").get_text(separator="\n")

        sections.append(f"\n# Page {page_number}\n\n{formatted_text}\n")

    # Join once instead of growing a string with += on every page.
    return "".join(sections)


# Example usage. Guarded so that importing this module does not trigger a
# download or write a file; the conversion only runs when executed as a script.
if __name__ == "__main__":
    # URL of the PDF to convert; fill this in before running.
    pdf_path = ""
    if not pdf_path:
        # An empty URL would otherwise fail inside requests with a less
        # obvious error message.
        raise SystemExit("Set pdf_path to the URL of a PDF before running this example.")

    markdown_content = pdf_to_markdown(pdf_path)

    # Save the Markdown content to a file
    with open("output.md", "w", encoding="utf-8") as markdown_file:
        markdown_file.write(markdown_content)