Spaces:
Sleeping
Sleeping
import pymupdf # PyMuPDF | |
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, passed straight to ``pymupdf.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        together with no extra separator between pages.
    """
    # Using the document as a context manager guarantees it is closed even
    # when text extraction fails part-way through the file.
    with pymupdf.open(file_path) as document:
        # Join once instead of growing a string with repeated ``+=``.
        return "".join(page.get_text() for page in document)
def remove_line_breaks(text):
    """Replace soft line wraps with spaces while keeping paragraph breaks.

    A newline is turned into a single space only when all of these hold:

    * it is neither the first nor the last character of ``text``;
    * the character before it is not a period (a line ending a sentence
      keeps its break);
    * neither neighbour is itself a newline, so a run of two or more
      newlines (a paragraph break) is left completely intact.

    Args:
        text: The text to unwrap.

    Returns:
        A new string of the same length with the qualifying newlines
        replaced by spaces.
    """
    # Work on a mutable list so each replacement is O(1); rebuilding the
    # string by slicing for every newline would be quadratic overall.
    chars = list(text)

    # Starting at 1 and stopping before the last index means both
    # neighbours always exist; empty and one-character inputs skip the loop.
    for index in range(1, len(text) - 1):
        if text[index] != "\n":
            continue
        previous_char = text[index - 1]
        next_char = text[index + 1]
        # Checking the previous character for a newline as well as the next
        # one keeps the *second* newline of a blank-line separator from
        # being collapsed into a space.
        if previous_char in (".", "\n") or next_char == "\n":
            continue
        chars[index] = " "

    return "".join(chars)
if __name__ == "__main__":
    # Script entry point: extract the text of the sample PDF, unwrap its
    # soft line breaks, and print the result to standard output.
    print(remove_line_breaks(read_pdf("example3.pdf")))