Spaces:

suh4s
/

InsightFlowAI_test

Running

InsightFlowAI_test / tests /test_rag_utils.py

suh4s

Working AIE midterm InsightFlow AI

31add3b 26 days ago

6.63 kB

	import pytest
	import os
	from unittest.mock import patch, MagicMock

	# Make sure 'utils' is discoverable, or adjust path.
	# This might require __init__.py in 'utils' and 'tests' and correct pythonpath.
	from utils.rag_utils import load_and_split_documents, get_embedding_model, EMBEDDING_MODEL_NAME
	from langchain_community.document_loaders import UnstructuredFileLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_core.documents import Document
	from langchain_community.embeddings import HuggingFaceEmbeddings

	# --- Tests for load_and_split_documents ---

	def test_load_and_split_documents_no_directory(tmp_path):
	"""Test behavior when the persona data directory does not exist."""
	persona_id = "non_existent_persona"
	result_docs = load_and_split_documents(persona_id, data_path=str(tmp_path))
	assert result_docs == []

	def test_load_and_split_documents_no_txt_files(tmp_path):
	"""Test behavior when directory exists but contains no .txt files."""
	persona_id = "empty_persona"
	persona_dir = tmp_path / persona_id
	persona_dir.mkdir()
	(persona_dir / "other_file.md").write_text("some markdown content")

	result_docs = load_and_split_documents(persona_id, data_path=str(tmp_path))
	assert result_docs == []

	def test_load_and_split_documents_loads_and_splits_txt_files(tmp_path):
	"""Test successful loading and splitting of .txt files."""
	persona_id = "test_persona"
	data_sources_path = tmp_path / "data_sources" # Simulate the data_sources structure
	data_sources_path.mkdir()
	persona_dir = data_sources_path / persona_id
	persona_dir.mkdir()

	# Create dummy .txt files
	(persona_dir / "doc1.txt").write_text("This is the first document. It has some text.")
	(persona_dir / "doc2.txt").write_text("Another document here with more words to ensure splitting might occur if long enough.")

	# Mocking DirectoryLoader.load() to control what it returns,
	# as testing the actual loader behavior deeply is out of scope for this unit test.
	# We are more interested in the interaction with text_splitter.
	# However, for this test, let's allow it to run to verify basic integration.
	# For more complex scenarios or to avoid actual file loading, mocking loader.load() would be better.

	# For simplicity, we'll assume RecursiveCharacterTextSplitter works as expected.
	# We are mainly testing that documents are loaded and passed to the splitter.

	split_docs = load_and_split_documents(persona_id, data_path=str(data_sources_path))

	assert len(split_docs) > 0 # Expecting at least one chunk per document if short, or more if split
	assert isinstance(split_docs[0], Document)

	# Check if content from original docs is present (simplified check)
	content_doc1_present = any("first document" in doc.page_content for doc in split_docs)
	content_doc2_present = any("Another document" in doc.page_content for doc in split_docs)
	assert content_doc1_present or content_doc2_present # At least one should be found if files are small

	# A more robust test would mock text_splitter.split_documents and verify it's called with loaded docs.


	def test_load_and_split_documents_uses_correct_loader_and_splitter_params():
	"""Test that DirectoryLoader and RecursiveCharacterTextSplitter are called with expected parameters."""
	persona_id = "params_test_persona"
	data_path = "dummy_data_path"
	dummy_persona_path = os.path.join(data_path, persona_id)

	# Mock os.path.isdir to simulate directory existence
	with patch('os.path.isdir', return_value=True):
	# Mock DirectoryLoader
	mock_doc_instance = Document(page_content="Test content from loader.")
	mock_loader_instance = MagicMock()
	mock_loader_instance.load.return_value = [mock_doc_instance] # Simulate loader returning one doc

	with patch('utils.rag_utils.DirectoryLoader', return_value=mock_loader_instance) as mock_directory_loader:
	# Mock RecursiveCharacterTextSplitter
	mock_splitter_instance = MagicMock()
	mock_splitter_instance.split_documents.return_value = [Document(page_content="Split chunk 1")] # Simulate splitter returning one chunk

	with patch('utils.rag_utils.RecursiveCharacterTextSplitter', return_value=mock_splitter_instance) as mock_text_splitter:

	load_and_split_documents(persona_id, data_path=data_path)

	# Assert DirectoryLoader was called correctly
	mock_directory_loader.assert_called_once_with(
	dummy_persona_path,
	glob="*/.txt",
	loader_cls=UnstructuredFileLoader,
	show_progress=True,
	use_multithreading=True,
	silent_errors=True
	)
	mock_loader_instance.load.assert_called_once()

	# Assert RecursiveCharacterTextSplitter was called correctly
	mock_text_splitter.assert_called_once_with(
	chunk_size=1000,
	chunk_overlap=150,
	length_function=len,
	is_separator_regex=False
	)
	mock_splitter_instance.split_documents.assert_called_once_with([mock_doc_instance])


	# --- Tests for get_embedding_model ---

	def test_get_embedding_model_default():
	"""Test that get_embedding_model returns a HuggingFaceEmbeddings instance with the default model."""
	# Patching the HuggingFaceEmbeddings constructor to avoid actual model loading/download
	with patch('utils.rag_utils.HuggingFaceEmbeddings') as mock_hf_embeddings:
	mock_instance = MagicMock(spec=HuggingFaceEmbeddings)
	mock_hf_embeddings.return_value = mock_instance

	embedding_model = get_embedding_model()

	mock_hf_embeddings.assert_called_once_with(model_name=EMBEDDING_MODEL_NAME)
	assert embedding_model == mock_instance

	def test_get_embedding_model_custom_name():
	"""Test get_embedding_model with a custom model name."""
	custom_model = "sentence-transformers/paraphrase-MiniLM-L3-v2"
	with patch('utils.rag_utils.HuggingFaceEmbeddings') as mock_hf_embeddings:
	mock_instance = MagicMock(spec=HuggingFaceEmbeddings)
	mock_hf_embeddings.return_value = mock_instance

	embedding_model = get_embedding_model(model_name=custom_model)

	mock_hf_embeddings.assert_called_once_with(model_name=custom_model)
	assert embedding_model == mock_instance