SpiritualChatBot / RAG_BOT /document_processor.py
bk-anupam
feat: Implement document indexing and processing for multilingual support
7361b6f
import re
import os
import sys
from datetime import datetime
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from langchain_core.documents import Document
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)
from RAG_BOT.logger import logger
class DocumentProcessor:
    """
    Base class for processing documents (PDF, HTM, etc.) to extract text,
    metadata, and split content into chunks.
    """

    # Translation table turning Devanagari digits (०-९) into ASCII digits (0-9).
    _DEVANAGARI_DIGITS = str.maketrans("०१२३४५६७८९", "0123456789")

    # (compiled pattern, strptime format) pairs, tried in priority order.
    #
    # * In Python ``str`` patterns ``\d`` matches every Unicode decimal digit,
    #   Devanagari included, so separate Devanagari patterns are not needed.
    # * Each pattern is wrapped in digit-boundary lookarounds so that a slice
    #   of a longer digit run (e.g. "21-1-05" inside "2021-1-05") is never
    #   mistaken for a date.
    # * Four-digit-year formats come before two-digit-year formats so the
    #   more specific interpretation wins.
    _DATE_PATTERNS = tuple(
        (re.compile(r"(?<!\d)" + body + r"(?!\d)"), date_format)
        for body, date_format in (
            (r"(\d{4})-(\d{2})-(\d{2})", "%Y-%m-%d"),      # YYYY-MM-DD
            (r"(\d{2})/(\d{2})/(\d{4})", "%d/%m/%Y"),      # DD/MM/YYYY
            (r"(\d{2})\.(\d{2})\.(\d{4})", "%d.%m.%Y"),    # DD.MM.YYYY
            (r"(\d{1,2})\.(\d{1,2})\.(\d{4})", "%d.%m.%Y"),  # D.M.YYYY
            (r"(\d{1,2})/(\d{1,2})/(\d{4})", "%d/%m/%Y"),    # D/M/YYYY
            (r"(\d{1,2})-(\d{1,2})-(\d{4})", "%d-%m-%Y"),    # D-M-YYYY
            (r"(\d{2})\.(\d{2})\.(\d{2})", "%d.%m.%y"),    # DD.MM.YY
            (r"(\d{2})/(\d{2})/(\d{2})", "%d/%m/%y"),      # DD/MM/YY
            (r"(\d{2})-(\d{2})-(\d{2})", "%d-%m-%y"),      # DD-MM-YY
            (r"(\d{1,2})\.(\d{1,2})\.(\d{2})", "%d.%m.%y"),  # D.M.YY
            (r"(\d{1,2})/(\d{1,2})/(\d{2})", "%d/%m/%y"),    # D/M/YY
            (r"(\d{1,2})-(\d{1,2})-(\d{2})", "%d-%m-%y"),    # D-M-YY
            # Add other common formats if needed (e.g., "January 21, 1969")
        )
    )

    def _devanagari_to_ascii_digits(self, devanagari_string: str) -> str:
        """Converts Devanagari numerals in a string to ASCII digits.

        Characters that are not Devanagari digits are returned unchanged.
        """
        return devanagari_string.translate(self._DEVANAGARI_DIGITS)

    def extract_date_from_text(self, text):
        """
        Attempts to extract a date from the given text and returns it in YYYY-MM-DD format.

        Patterns are tried in priority order (see ``_DATE_PATTERNS``). For each
        pattern every occurrence in the text is tried, so an unparseable
        date-like token (e.g. "45.67.8901") does not hide a valid date that
        appears later in the text.

        Args:
            text (str): The text to search for a date. Dates may be written
                with ASCII or Devanagari digits.

        Returns:
            str or None: The extracted date in YYYY-MM-DD format if found,
            otherwise None (also for empty or None input).
        """
        if not text:
            logger.info("No text provided for date extraction")
            return None

        for pattern, date_format in self._DATE_PATTERNS:
            for match in pattern.finditer(text):
                matched_date_str = match.group(0)
                # strptime only understands ASCII digits.
                ascii_date_str = self._devanagari_to_ascii_digits(matched_date_str)
                try:
                    date_obj = datetime.strptime(ascii_date_str, date_format)
                    return date_obj.strftime("%Y-%m-%d")
                except ValueError as e:
                    # Looks like a date but is not one (e.g. month 13);
                    # keep looking at later matches and other patterns.
                    logger.warning(f"Date format '{date_format}' matched for '{matched_date_str}' (converted to '{ascii_date_str}'), but couldn't parse. Error: {e}")

        logger.info(f"No date pattern matched in text: '{text[:100]}...'")
        return None  # No pattern matched or every candidate failed to parse

    def get_murli_type(self, text):
        """
        Determines if the text indicates an 'Avyakt' Murli.

        Args:
            text (str): The text to check.

        Returns:
            bool: True if 'avyakt' (Roman script, case-insensitive) or
            'अव्यक्त' (Devanagari) is found, False otherwise (including for
            empty or None input).
        """
        if not text:
            return False
        return 'avyakt' in text.lower() or 'अव्यक्त' in text

    def split_text(self, documents, chunk_size=1000, chunk_overlap=200):
        """Splits the documents into chunks using RecursiveCharacterTextSplitter.

        Args:
            documents (list): A list of LangChain Document objects.
            chunk_size (int): Maximum chunk size passed to the splitter.
            chunk_overlap (int): Overlap between consecutive chunks passed to
                the splitter.

        Returns:
            list: The chunked Document objects produced by the splitter.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts = text_splitter.split_documents(documents)
        logger.info(f"Split documents into {len(texts)} chunks using RecursiveCharacterTextSplitter")
        return texts

    def semantic_chunking(self, documents, model_name="sentence-transformers/all-MiniLM-L6-v2",
                          chunk_size=1000, chunk_overlap=0):
        """
        Performs semantic chunking on the input documents using a sentence transformer model.

        Args:
            documents (list): A list of LangChain Document objects.
            model_name (str): The name of the sentence transformer model to use.
            chunk_size (int): The desired maximum size of each chunk in tokens.
            chunk_overlap (int): Number of tokens shared between consecutive
                chunks. Defaults to 0 (no overlap).

        Returns:
            list: A list of LangChain Document objects representing the
            semantically chunked text, or an empty list if chunking failed.
        """
        logger.info(f"Performing semantic chunking using model: {model_name} with chunk size : {chunk_size} tokens")
        try:
            # NOTE(review): the splitter is believed to reject a
            # tokens_per_chunk larger than the model's own token limit; if so,
            # the default chunk_size may always land in the except branch for
            # small models - confirm against the splitter documentation.
            splitter = SentenceTransformersTokenTextSplitter(
                model_name=model_name,
                chunk_overlap=chunk_overlap,
                tokens_per_chunk=chunk_size,
            )
            chunks = splitter.split_documents(documents)
            logger.info(f"Split documents into {len(chunks)} chunks using semantic chunking")
            return chunks
        except Exception as e:
            # Deliberate best-effort: report the failure and let the caller
            # continue with an empty result instead of aborting the pipeline.
            logger.error(f"Error during semantic chunking: {e}")
            return []