Spaces:

bk-anupam
/

SpiritualChatBot

Building

SpiritualChatBot / RAG_BOT /document_processor.py

bk-anupam

feat: Implement document indexing and processing for multilingual support

7361b6f about 1 month ago

7.25 kB

	import re
	import os
	import sys
	from datetime import datetime
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_text_splitters import SentenceTransformersTokenTextSplitter
	from langchain_core.documents import Document
	# Add the project root to the Python path.
	# project_root is the parent of the directory containing this file; it is
	# placed first on sys.path so the absolute `RAG_BOT.*` import that follows
	# resolves even when this module is run from inside the package directory.
	project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
	sys.path.insert(0, project_root)
	from RAG_BOT.logger import logger


	class DocumentProcessor:
	    """
	    Base class for processing documents (PDF, HTM, etc.) to extract text,
	    metadata, and split content into chunks.
	    """

	    # Translation table mapping Devanagari numerals to their ASCII equivalents.
	    _DEVANAGARI_DIGITS = str.maketrans("०१२३४५६७८९", "0123456789")

	    # (compiled regex, strptime format) pairs, tried in this priority order:
	    # four-digit-year formats first, strict two-digit day/month before the
	    # relaxed one-or-two-digit variants, two-digit-year formats last.
	    #
	    # In a str pattern `\d` matches any Unicode decimal digit, so the same
	    # pattern covers ASCII and Devanagari numerals. The `(?<!\d)` / `(?!\d)`
	    # guards keep a pattern from matching a slice of a longer digit run
	    # (e.g. "23-02-30" inside "2023-02-30", or "29.02.20" inside "29.02.2023").
	    _DATE_PATTERNS = tuple(
	        (re.compile(r"(?<!\d)(?:" + body + r")(?!\d)"), date_format)
	        for body, date_format in (
	            (r"\d{4}-\d{2}-\d{2}", "%Y-%m-%d"),        # YYYY-MM-DD
	            (r"\d{2}/\d{2}/\d{4}", "%d/%m/%Y"),        # DD/MM/YYYY
	            (r"\d{2}\.\d{2}\.\d{4}", "%d.%m.%Y"),      # DD.MM.YYYY
	            (r"\d{1,2}\.\d{1,2}\.\d{4}", "%d.%m.%Y"),  # D.M.YYYY, DD.M.YYYY, D.MM.YYYY
	            (r"\d{1,2}/\d{1,2}/\d{4}", "%d/%m/%Y"),    # D/M/YYYY, DD/M/YYYY, D/MM/YYYY
	            (r"\d{1,2}-\d{1,2}-\d{4}", "%d-%m-%Y"),    # D-M-YYYY, DD-M-YYYY, D-MM-YYYY
	            (r"\d{2}\.\d{2}\.\d{2}", "%d.%m.%y"),      # DD.MM.YY
	            (r"\d{2}/\d{2}/\d{2}", "%d/%m/%y"),        # DD/MM/YY
	            (r"\d{2}-\d{2}-\d{2}", "%d-%m-%y"),        # DD-MM-YY
	            (r"\d{1,2}\.\d{1,2}\.\d{2}", "%d.%m.%y"),  # D.M.YY, DD.M.YY, D.MM.YY
	            (r"\d{1,2}/\d{1,2}/\d{2}", "%d/%m/%y"),    # D/M/YY, DD/M/YY, D/MM/YY
	            (r"\d{1,2}-\d{1,2}-\d{2}", "%d-%m-%y"),    # D-M-YY, DD-M-YY, D-MM-YY
	            # Add other common formats if needed (e.g., "January 21, 1969")
	        )
	    )

	    def _devanagari_to_ascii_digits(self, devanagari_string: str) -> str:
	        """Converts Devanagari numerals in a string to ASCII digits.

	        Characters that are not Devanagari numerals are returned unchanged.
	        """
	        return devanagari_string.translate(self._DEVANAGARI_DIGITS)

	    def extract_date_from_text(self, text):
	        """
	        Attempts to extract a date from the given text and returns it in YYYY-MM-DD format.

	        Formats are tried in the priority order of ``_DATE_PATTERNS``; within one
	        format every occurrence in the text is tried from left to right, so an
	        unparseable token (e.g. "99.99.9999") does not hide a valid date that
	        follows it. Dates written with Devanagari numerals are supported.

	        Args:
	            text (str): The text to search for a date. ``None`` or an empty
	                string yields ``None``.
	        Returns:
	            str or None: The extracted date in YYYY-MM-DD format if found, otherwise None.
	        """
	        # Treat a missing text like an empty one instead of raising TypeError.
	        text = text or ""

	        for pattern, date_format in self._DATE_PATTERNS:
	            for match in pattern.finditer(text):
	                matched_date_str = match.group(0)
	                ascii_date_str = self._devanagari_to_ascii_digits(matched_date_str)
	                try:
	                    # Attempt to parse the date using the specified format
	                    date_obj = datetime.strptime(ascii_date_str, date_format)
	                except ValueError as e:
	                    # Looked like a date but is not a real calendar date
	                    # (e.g. month 13); keep looking at later candidates.
	                    logger.warning(f"Date format '{date_format}' matched for '{matched_date_str}' (converted to '{ascii_date_str}'), but couldn't parse. Error: {e}")
	                    continue
	                return date_obj.strftime("%Y-%m-%d")

	        logger.info(f"No date pattern matched in text: '{text[:100]}...'")
	        return None  # Return None if no pattern matched or parsing failed

	    def get_murli_type(self, text):
	        """
	        Determines if the text indicates an 'Avyakt' Murli.

	        Args:
	            text (str): The text to check. ``None`` or an empty string yields False.
	        Returns:
	            bool: True if 'avyakt' or 'अव्यक्त' is found, False otherwise.
	        """
	        if not text:
	            return False
	        # Check for both Roman script (case-insensitive) and Devanagari script
	        return 'avyakt' in text.lower() or 'अव्यक्त' in text

	    def split_text(self, documents, chunk_size=1000, chunk_overlap=200):
	        """Splits the documents into chunks using RecursiveCharacterTextSplitter.

	        Args:
	            documents (list): A list of LangChain Document objects.
	            chunk_size (int): Chunk size passed to the splitter.
	            chunk_overlap (int): Chunk overlap passed to the splitter.
	        Returns:
	            list: The chunks returned by the splitter's ``split_documents``.
	        """
	        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	        texts = text_splitter.split_documents(documents)
	        logger.info(f"Split documents into {len(texts)} chunks using RecursiveCharacterTextSplitter")
	        return texts

	    def semantic_chunking(self, documents, model_name="sentence-transformers/all-MiniLM-L6-v2",
	                          chunk_size=1000, chunk_overlap=0):
	        """
	        Performs semantic chunking on the input documents using a sentence transformer model.

	        Args:
	            documents (list): A list of LangChain Document objects.
	            model_name (str): The name of the sentence transformer model to use.
	            chunk_size (int): The desired maximum size of each chunk in tokens.
	            chunk_overlap (int): Overlap between consecutive chunks, passed to
	                the splitter. Defaults to 0.
	        Returns:
	            list: A list of LangChain Document objects representing the semantically
	            chunked text, or an empty list if chunking failed for any reason.
	        """
	        logger.info(f"Performing semantic chunking using model: {model_name} with chunk size : {chunk_size} tokens")
	        # NOTE(review): the splitter may reject a tokens_per_chunk larger than the
	        # model's maximum sequence length; confirm that the default of 1000 is
	        # valid for the default model, otherwise this always returns [].
	        try:
	            # Initialize the sentence transformer text splitter
	            splitter = SentenceTransformersTokenTextSplitter(
	                model_name=model_name,
	                chunk_overlap=chunk_overlap,
	                tokens_per_chunk=chunk_size,
	            )
	            # Split the documents into semantically meaningful chunks
	            chunks = splitter.split_documents(documents)
	            logger.info(f"Split documents into {len(chunks)} chunks using semantic chunking")
	            return chunks
	        except Exception as e:
	            # Deliberate best-effort: report the failure and return an empty
	            # list so the caller can continue instead of crashing.
	            logger.error(f"Error during semantic chunking: {e}")
	            return []