# InsightFlowAI_test/tests/test_rag_utils.py
# suh4s
# Working AIE midterm InsightFlow AI
# 31add3b
import pytest
import os
from unittest.mock import patch, MagicMock
# Make sure 'utils' is discoverable, or adjust path.
# This might require __init__.py in 'utils' and 'tests' and correct pythonpath.
from utils.rag_utils import load_and_split_documents, get_embedding_model, EMBEDDING_MODEL_NAME
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
# --- Tests for load_and_split_documents ---
def test_load_and_split_documents_no_directory(tmp_path):
    """A persona whose data directory is missing yields an empty list."""
    # Nothing is created under tmp_path, so no sub-directory exists for
    # this persona id.
    missing_persona = "non_existent_persona"
    base_dir = str(tmp_path)

    assert load_and_split_documents(missing_persona, data_path=base_dir) == []
def test_load_and_split_documents_no_txt_files(tmp_path):
    """An existing persona directory with no .txt files yields an empty list."""
    persona_id = "empty_persona"

    target_dir = tmp_path / persona_id
    target_dir.mkdir()

    # The only file present is a non-.txt file.
    markdown_file = target_dir / "other_file.md"
    markdown_file.write_text("some markdown content")

    loaded = load_and_split_documents(persona_id, data_path=str(tmp_path))

    assert loaded == []
def test_load_and_split_documents_loads_and_splits_txt_files(tmp_path):
    """Test successful loading and splitting of .txt files.

    This is a light integration test: nothing is patched here, so
    ``load_and_split_documents`` runs unmocked against two small .txt files
    created under a temporary ``data_sources/<persona_id>`` directory.
    It verifies that:

    * at least one chunk is returned,
    * every returned chunk is a ``Document``,
    * text from *both* source files appears in the returned chunks.

    The exact arguments passed to the loader and splitter are covered by a
    separate, fully mocked test.
    """
    persona_id = "test_persona"
    data_sources_path = tmp_path / "data_sources"  # Simulate the data_sources structure
    persona_dir = data_sources_path / persona_id
    persona_dir.mkdir(parents=True)

    # Create dummy .txt files. Both are far shorter than the expected chunk
    # size, so the marker phrases below are not expected to be broken across
    # chunk boundaries (assumption based on the splitter settings asserted
    # elsewhere in this module).
    (persona_dir / "doc1.txt").write_text("This is the first document. It has some text.")
    (persona_dir / "doc2.txt").write_text("Another document here with more words to ensure splitting might occur if long enough.")

    split_docs = load_and_split_documents(persona_id, data_path=str(data_sources_path))

    # Expect at least one chunk per document if short, or more if split.
    assert len(split_docs) > 0
    assert all(isinstance(doc, Document) for doc in split_docs)

    # Content from *each* original file must be present. Requiring only one
    # of them would let a silently dropped file go unnoticed.
    for marker in ("first document", "Another document"):
        assert any(marker in doc.page_content for doc in split_docs), (
            f"No returned chunk contains the expected text {marker!r}"
        )
def test_load_and_split_documents_uses_correct_loader_and_splitter_params():
    """Check the arguments handed to DirectoryLoader and RecursiveCharacterTextSplitter.

    Both classes are replaced (as looked up inside ``utils.rag_utils``) with
    mocks, and ``os.path.isdir`` is forced to return True, so no real files
    are read.
    """
    persona_id = "params_test_persona"
    data_path = "dummy_data_path"
    expected_dir = os.path.join(data_path, persona_id)

    # Stand-in loader that yields exactly one document.
    loaded_doc = Document(page_content="Test content from loader.")
    fake_loader = MagicMock()
    fake_loader.load.return_value = [loaded_doc]

    # Stand-in splitter that yields exactly one chunk.
    fake_splitter = MagicMock()
    fake_splitter.split_documents.return_value = [Document(page_content="Split chunk 1")]

    with patch('os.path.isdir', return_value=True), \
            patch('utils.rag_utils.DirectoryLoader', return_value=fake_loader) as loader_cls, \
            patch('utils.rag_utils.RecursiveCharacterTextSplitter', return_value=fake_splitter) as splitter_cls:
        load_and_split_documents(persona_id, data_path=data_path)

    # The mocks keep their call records after the patches are undone.
    loader_cls.assert_called_once_with(
        expected_dir,
        glob="**/*.txt",
        loader_cls=UnstructuredFileLoader,
        show_progress=True,
        use_multithreading=True,
        silent_errors=True,
    )
    fake_loader.load.assert_called_once()

    splitter_cls.assert_called_once_with(
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len,
        is_separator_regex=False,
    )
    # The splitter must receive exactly what the loader produced.
    fake_splitter.split_documents.assert_called_once_with([loaded_doc])
# --- Tests for get_embedding_model ---
def test_get_embedding_model_default():
    """get_embedding_model() with no arguments uses EMBEDDING_MODEL_NAME."""
    fake_embeddings = MagicMock(spec=HuggingFaceEmbeddings)

    # Swap out the constructor as seen by utils.rag_utils so that no real
    # model is loaded or downloaded.
    with patch('utils.rag_utils.HuggingFaceEmbeddings', return_value=fake_embeddings) as hf_ctor:
        result = get_embedding_model()

    hf_ctor.assert_called_once_with(model_name=EMBEDDING_MODEL_NAME)
    assert result == fake_embeddings
def test_get_embedding_model_custom_name():
    """get_embedding_model() forwards an explicitly supplied model name."""
    custom_model = "sentence-transformers/paraphrase-MiniLM-L3-v2"
    fake_embeddings = MagicMock(spec=HuggingFaceEmbeddings)

    # The constructor is patched inside utils.rag_utils, so the named model
    # is never actually loaded.
    with patch('utils.rag_utils.HuggingFaceEmbeddings', return_value=fake_embeddings) as hf_ctor:
        result = get_embedding_model(model_name=custom_model)

    hf_ctor.assert_called_once_with(model_name=custom_model)
    assert result == fake_embeddings