Oh Gyuhyeok
Implement PDF Upload (#2)
59e8fda unverified
raw
history blame
900 Bytes
import pymupdf # PyMuPDF
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``pymupdf.open``.

    Returns:
        The ``get_text()`` output of each page, concatenated in page order
        with no separator added between pages.
    """
    # Using the document as a context manager closes it even when text
    # extraction raises; a trailing close() call would be skipped in that case.
    with pymupdf.open(file_path) as document:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in document)
def remove_line_breaks(text):
    """Replace single line breaks with spaces while keeping paragraph breaks.

    A newline is turned into a space only when all of the following hold:

    * it is neither the first nor the last character of ``text``;
    * the character before it is not a period (a line ending in "." keeps
      its break);
    * neither neighbour is itself a newline, so runs of two or more newlines
      (paragraph breaks) are left completely intact.

    Args:
        text: The text to clean up.

    Returns:
        A string of the same length as ``text`` with the qualifying newlines
        replaced by single spaces.
    """
    last_index = len(text) - 1
    # Work on a list of characters and join once at the end; re-slicing the
    # string for every replacement would copy the whole text each time.
    chars = list(text)
    for i, char in enumerate(text):
        if char != "\n" or i == 0 or i == last_index:
            continue
        # Decisions are made against the unmodified input so an earlier
        # replacement can never influence a later one. Checking the previous
        # character for "\n" as well keeps the final newline of a paragraph
        # break from being collapsed.
        if text[i - 1] not in (".", "\n") and text[i + 1] != "\n":
            chars[i] = " "
    return "".join(chars)
if __name__ == "__main__":
    # Script entry point: extract the sample PDF's text, unwrap its single
    # line breaks, and print the result to standard output.
    print(remove_line_breaks(read_pdf("example3.pdf")))