Spaces:

aleegr10
/

chatBotBasket

Sleeping

chatBotBasket / pdfToMarkdown.py

subir proyecto

b928387 4 months ago

No virus

1.38 kB

	import fitz # PyMuPDF
	from bs4 import BeautifulSoup
	import io
	import re
	import requests
	import fitz # PyMuPDF
	from PyPDF2 import PdfReader, PdfWriter


	def pdf_to_markdown(pdf_path):
	    """Download a PDF and return its text as a Markdown string.

	    Args:
	        pdf_path: Despite the name, this is a URL: it is passed straight to
	            ``requests.get`` and the response body is parsed in memory.

	    Returns:
	        A string with one section per page, each of the form
	        ``"\\n# Page <n>\\n\\n<page text>\\n"``, in page order. The result is
	        an empty string when the PDF has no pages.

	    Raises:
	        requests.RequestException: If the download fails or the server
	            answers with an HTTP error status.
	    """
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (X11; Windows; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36'}

	    response = requests.get(url=pdf_path, headers=headers, timeout=120)
	    # Fail on 4xx/5xx here instead of handing an HTML error page to the
	    # PDF parser, which would surface as a confusing parse error.
	    response.raise_for_status()

	    # Parse the downloaded bytes directly from memory; nothing is written
	    # to disk.
	    pdf_document = PdfReader(io.BytesIO(response.content))

	    sections = []
	    for page_number, page in enumerate(pdf_document.pages, start=1):
	        # PyPDF2's extract_text() has no output-mode argument (that is
	        # PyMuPDF's get_text("text")); a positional string is taken as the
	        # deprecated Tj_sep separator, so call it without arguments.
	        text = page.extract_text() or ""

	        # The extracted text is run through the HTML parser and reduced to
	        # its text content, so anything parsed as markup is dropped.
	        soup = BeautifulSoup(text, "html.parser")
	        formatted_text = soup.get_text(separator="\n")

	        sections.append(f"\n# Page {page_number}\n\n{formatted_text}\n")

	    # Join once at the end rather than growing a string inside the loop.
	    return "".join(sections)


	# Example usage. Guarded so that importing this module from other code
	# does not trigger a download or write output.md as a side effect.
	if __name__ == "__main__":
	    pdf_path = ""  # URL of the PDF to convert
	    if not pdf_path:
	        # An empty URL would only fail later inside requests with a less
	        # helpful error, so stop early with an explicit message.
	        raise SystemExit("Set pdf_path to the URL of a PDF before running this script.")
	    markdown_content = pdf_to_markdown(pdf_path)

	    # Save the Markdown content to a file
	    with open("output.md", "w", encoding="utf-8") as markdown_file:
	        markdown_file.write(markdown_content)