Spaces:

Zeta611
/

easyword-translator

Sleeping

Oh Gyuhyeok

Implement PDF Upload (#2)

59e8fda unverified 5 months ago

900 Bytes

	import pymupdf # PyMuPDF


	def read_pdf(file_path):
	    """Extract the text of every page of a PDF as a single string.

	    Args:
	        file_path: Path of the PDF file; passed unchanged to ``pymupdf.open``.

	    Returns:
	        The ``get_text()`` output of each page, concatenated in page order
	        with no extra separator between pages.
	    """
	    # The context manager closes the document even when extraction raises
	    # part-way through, which a trailing close() call would not do.
	    with pymupdf.open(file_path) as document:
	        # Join once instead of growing a string page by page.
	        return "".join(page.get_text() for page in document)


	def remove_line_breaks(text):
	    """Replace single line breaks with spaces, keeping paragraph breaks.

	    A newline becomes a space only when it sits strictly inside the string,
	    the character before it is neither a period nor another newline, and
	    the character after it is not a newline. Runs of two or more newlines
	    (paragraph breaks), newlines that directly follow a period, and a
	    newline at the very start or end of the text are left untouched.

	    Args:
	        text: The string to clean up.

	    Returns:
	        A new string of the same length with the qualifying newlines
	        replaced by spaces.
	    """
	    chars = list(text)
	    # Start at 1 and stop before the last index so both neighbours exist.
	    for i in range(1, len(text) - 1):
	        if text[i] != "\n":
	            continue
	        # Also rejecting a preceding "\n" keeps the last newline of a
	        # blank-line run from being turned into a space.
	        if text[i - 1] not in (".", "\n") and text[i + 1] != "\n":
	            chars[i] = " "
	    # Collect replacements in a list and join once: linear time, whereas
	    # re-slicing the string per replacement is quadratic.
	    return "".join(chars)


	if __name__ == "__main__":
	    # Demo run: extract the sample PDF and print it after passing the text
	    # through remove_line_breaks.
	    print(remove_line_breaks(read_pdf("example3.pdf")))