import fitz
import re


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from a PDF file.

    Opens the document with PyMuPDF, concatenates the result of
    ``page.get_text()`` for every page in page order, and closes the
    document before returning.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF (pages concatenated with no
            separator added between them).
    """
    # The context manager guarantees the document is closed even if a page
    # fails to extract; the original left the handle open.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def clean_text(text: str) -> str:
    """
    Clean and normalize text.

    Steps, in order:
      1. Remove every period together with at most one whitespace character
         on each side of it.
      2. Collapse each run of whitespace into a single space.
      3. Remove all non-ASCII characters.
      4. Collapse any runs of spaces left behind by step 3.
      5. Strip leading and trailing whitespace.

    Args:
        text (str): Raw text.

    Returns:
        str: Cleaned text.
    """
    # NOTE(review): this removes *all* periods (sentence ends, decimals),
    # not only dot leaders such as "Chapter 1 ..... 5" -- confirm intended.
    text = re.sub(r'(\s?\.\s?)+', '', text)
    # Runs before the non-ASCII strip so that Unicode whitespace (e.g. a
    # non-breaking space) becomes a plain space instead of being deleted.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Dropping a non-ASCII token that sat between two spaces leaves a double
    # space; collapse again so the output is fully normalized.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()