Spaces:
Sleeping
Sleeping
| import fitz | |
| import re | |
def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.

    The document is opened with ``fitz.open`` and the result of
    ``page.get_text()`` for every page is concatenated in page order,
    with no separator inserted between pages.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF.
    """
    # Use the document as a context manager so the underlying file handle
    # is released even when text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text() for page in doc)
def clean_text(text):
    """
    Clean and normalize text.

    Steps, in order:
      1. Remove every run of periods, together with at most one whitespace
         character on each side of each period.
      2. Collapse each whitespace run into a single space.
      3. Drop all non-ASCII characters.
      4. Collapse spaces again, then trim both ends.

    NOTE(review): step 1 also removes ordinary sentence-ending periods and
    decimal points, fusing the neighbouring tokens ("Hello. World" becomes
    "HelloWorld"). That behaviour is preserved here unchanged; confirm with
    the callers whether only dot leaders were meant to be removed.

    Args:
        text (str): Raw text.

    Returns:
        str: Cleaned text.
    """
    text = re.sub(r'(\s?\.\s?)+', '', text)
    # Collapse whitespace BEFORE stripping non-ASCII so that Unicode
    # whitespace (e.g. a non-breaking space) becomes a plain space instead
    # of being deleted and gluing two words together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Removing a non-ASCII token that sat between two spaces leaves a
    # double space behind; collapse once more so the output is normalized.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()