Spaces:
Sleeping
Sleeping
import fitz | |
import re | |
def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.

    Opens the document with PyMuPDF (``fitz``), calls ``get_text()`` on each
    page in iteration order and concatenates the results with no separator.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF (empty string when the document
            yields no pages).
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids repeatedly re-allocating a growing string.
        return "".join(page.get_text() for page in doc)
def clean_text(text):
    """
    Clean and normalize text.

    Steps, in order:
      1. Remove every period together with at most one whitespace character
         on each side of it (this also removes dot-leader runs such as
         ". . . .").
      2. Collapse every run of whitespace into a single space.
      3. Remove all non-ASCII characters.
      4. Collapse any run of spaces left behind by step 3.
      5. Strip leading and trailing whitespace.

    NOTE(review): step 1 also deletes ordinary sentence periods and decimal
    points ("Hello. World" -> "HelloWorld", "3.14" -> "314"). Kept as-is
    because callers may rely on it -- confirm whether only dot leaders were
    meant to be removed.

    Args:
        text (str): Raw text.

    Returns:
        str: Cleaned text.
    """
    text = re.sub(r'(\s?\.\s?)+', '', text)
    # Collapse whitespace BEFORE dropping non-ASCII so that Unicode whitespace
    # (e.g. a non-breaking space) is kept as a word separator, not deleted.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Deleting a non-ASCII token that sat between two spaces leaves them
    # adjacent ("a é b" -> "a  b"); collapse again so the result is normalized.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()