RAG_NLP / text_processing.py
DenysPetro's picture
changed structure
2fa7106
raw
history blame contribute delete
638 Bytes
import fitz
import re
def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Text of every page, concatenated in page order.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # A single join avoids repeatedly re-copying the growing string.
        return "".join(page.get_text() for page in doc)
def clean_text(text):
    """
    Clean and normalize text.

    Steps, in order:
      1. Replace runs of two or more dots (optionally separated by single
         whitespace characters, e.g. table-of-contents leaders or ellipses)
         with a single space. Isolated periods are kept so sentence
         boundaries and decimal numbers survive.
      2. Collapse every run of whitespace into one space.
      3. Drop all non-ASCII characters.
      4. Collapse whitespace again, since step 3 can leave adjacent spaces.
      5. Strip leading and trailing whitespace.

    Args:
        text (str): Raw text.

    Returns:
        str: Cleaned text.
    """
    # Only dot *runs* are noise; a lone '.' is real punctuation. Replacing the
    # run with a space keeps the words on either side from being fused.
    text = re.sub(r'(?:\s?\.\s?){2,}', ' ', text)
    # Done before the ASCII filter so that non-ASCII whitespace (e.g. NBSP)
    # is turned into a plain space rather than deleted outright.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Removing a non-ASCII token between two spaces leaves a double space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()