# agentbee/test/test_file_parser.py
# Last committed by: mangubee
# Commit message: fix: correct author name formatting in multiple files
# Commit: e7b4937
"""
Tests for file parser tool
Author: @mangubee
Date: 2026-01-02
Tests cover:
- PDF parsing
- Excel parsing
- Word document parsing
- Text/CSV parsing
- Retry logic
- Error handling
"""
import pytest
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
from src.tools.file_parser import (
parse_pdf,
parse_excel,
parse_word,
parse_text,
parse_file,
)
# ============================================================================
# Test Fixtures
# ============================================================================
# Directory named "fixtures" that sits next to this test module.
FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures")
@pytest.fixture
def sample_text_file() -> str:
    """Return the string path of ``sample.txt`` inside the fixtures directory."""
    text_path = FIXTURES_DIR / "sample.txt"
    return str(text_path)
@pytest.fixture
def sample_csv_file() -> str:
    """Return the string path of ``sample.csv`` inside the fixtures directory."""
    csv_path = FIXTURES_DIR / "sample.csv"
    return str(csv_path)
@pytest.fixture
def sample_excel_file() -> str:
    """Return the string path of ``sample.xlsx`` inside the fixtures directory."""
    workbook_path = FIXTURES_DIR / "sample.xlsx"
    return str(workbook_path)
@pytest.fixture
def sample_word_file() -> str:
    """Return the string path of ``sample.docx`` inside the fixtures directory."""
    document_path = FIXTURES_DIR / "sample.docx"
    return str(document_path)
@pytest.fixture
def mock_pdf_reader() -> Mock:
    """Build a stand-in for a PyPDF2 ``PdfReader`` that exposes two pages.

    Each page is a ``Mock`` whose ``extract_text()`` returns a fixed string,
    and the reader's ``pages`` attribute is the list of those two pages.
    """
    page_texts = (
        "Test PDF page 1 content",
        "Test PDF page 2 content",
    )

    fake_pages = []
    for text in page_texts:
        page = Mock()
        page.extract_text.return_value = text
        fake_pages.append(page)

    reader = Mock()
    reader.pages = fake_pages
    return reader
# ============================================================================
# PDF Parser Tests
# ============================================================================
def test_parse_pdf_success(mock_pdf_reader) -> None:
    """Check that ``parse_pdf`` reports file type, page count and page text.

    ``PyPDF2.PdfReader`` is patched to return the ``mock_pdf_reader`` fixture,
    and the parser module's ``Path`` is patched so the file appears to exist.
    """
    with patch('PyPDF2.PdfReader') as reader_cls, \
            patch('src.tools.file_parser.Path') as path_cls:
        # Any Path(...) built inside the parser claims the file exists.
        fake_path = Mock()
        fake_path.exists.return_value = True
        path_cls.return_value = fake_path

        # Instantiating PdfReader hands back the prepared fixture.
        reader_cls.return_value = mock_pdf_reader

        parsed = parse_pdf("test.pdf")

    assert parsed["file_type"] == "PDF"
    assert parsed["pages"] == 2

    # Content is compared case-insensitively.
    lowered = parsed["content"].lower()
    assert "page 1 content" in lowered
    assert "page 2 content" in lowered
def test_parse_pdf_file_not_found() -> None:
    """Check that ``parse_pdf`` raises ``FileNotFoundError`` for a missing file.

    The parser module's ``Path`` is patched so ``exists()`` returns False.
    """
    with patch('src.tools.file_parser.Path') as path_cls:
        absent_path = Mock()
        absent_path.exists.return_value = False
        path_cls.return_value = absent_path

        with pytest.raises(FileNotFoundError):
            parse_pdf("nonexistent.pdf")
def test_parse_pdf_io_error_retry() -> None:
    """Check that ``parse_pdf`` retries on ``IOError`` before re-raising it.

    ``PyPDF2.PdfReader`` is patched to raise on every call; the test expects
    the error to propagate after exactly three construction attempts.
    """
    with patch('PyPDF2.PdfReader') as reader_cls, \
            patch('src.tools.file_parser.Path') as path_cls:
        # The file must appear to exist so parsing reaches the reader.
        fake_path = Mock()
        fake_path.exists.return_value = True
        path_cls.return_value = fake_path

        # Every attempt to build a reader fails with an IO error.
        reader_cls.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_pdf("test.pdf")

        # Checked after the raises block so the assertion actually runs.
        # The value 3 presumably mirrors the parser's MAX_RETRIES setting —
        # confirm against src.tools.file_parser.
        attempts = reader_cls.call_count
        assert attempts == 3
# ============================================================================
# Excel Parser Tests
# ============================================================================
def test_parse_excel_success(sample_excel_file) -> None:
    """Check ``parse_excel`` against the real ``sample.xlsx`` fixture.

    Expects two sheets named "Data" and "Summary", and content that mentions
    both "Apple" and "Banana".
    """
    parsed = parse_excel(sample_excel_file)

    assert parsed["file_type"] == "Excel"

    sheets = parsed["sheets"]
    assert len(sheets) == 2
    for sheet_name in ("Data", "Summary"):
        assert sheet_name in sheets

    content = parsed["content"]
    for fruit in ("Apple", "Banana"):
        assert fruit in content
def test_parse_excel_file_not_found() -> None:
    """Check that ``parse_excel`` raises ``FileNotFoundError`` for a missing file."""
    missing_workbook = "nonexistent.xlsx"
    with pytest.raises(FileNotFoundError):
        parse_excel(missing_workbook)
def test_parse_excel_io_error_retry() -> None:
    """Check that ``parse_excel`` retries on ``IOError`` before re-raising it.

    ``openpyxl.load_workbook`` is patched to raise on every call; the test
    expects the error to propagate after exactly three load attempts.
    """
    with patch('openpyxl.load_workbook') as load_workbook, \
            patch('src.tools.file_parser.Path') as path_cls:
        # The file must appear to exist so parsing reaches the loader.
        fake_path = Mock()
        fake_path.exists.return_value = True
        path_cls.return_value = fake_path

        # Every attempt to load the workbook fails with an IO error.
        load_workbook.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_excel("test.xlsx")

        # Checked after the raises block so the assertion actually runs.
        attempts = load_workbook.call_count
        assert attempts == 3
# ============================================================================
# Word Document Parser Tests
# ============================================================================
def test_parse_word_success(sample_word_file) -> None:
    """Check ``parse_word`` against the real ``sample.docx`` fixture.

    Expects a positive paragraph count and content containing the title
    "Test Word Document" and the phrase "first paragraph".
    """
    parsed = parse_word(sample_word_file)

    assert parsed["file_type"] == "Word"
    assert parsed["paragraphs"] > 0

    content = parsed["content"]
    for expected in ("Test Word Document", "first paragraph"):
        assert expected in content
def test_parse_word_file_not_found() -> None:
    """Check that ``parse_word`` raises ``FileNotFoundError`` for a missing file."""
    missing_document = "nonexistent.docx"
    with pytest.raises(FileNotFoundError):
        parse_word(missing_document)
def test_parse_word_io_error_retry() -> None:
    """Check that ``parse_word`` retries on ``IOError`` before re-raising it.

    ``docx.Document`` is patched to raise on every call; the test expects
    the error to propagate after exactly three construction attempts.
    """
    with patch('docx.Document') as document_cls, \
            patch('src.tools.file_parser.Path') as path_cls:
        # The file must appear to exist so parsing reaches Document().
        fake_path = Mock()
        fake_path.exists.return_value = True
        path_cls.return_value = fake_path

        # Every attempt to open the document fails with an IO error.
        document_cls.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_word("test.docx")

        # Checked after the raises block so the assertion actually runs.
        attempts = document_cls.call_count
        assert attempts == 3
# ============================================================================
# Text/CSV Parser Tests
# ============================================================================
def test_parse_text_success(sample_text_file) -> None:
    """Check ``parse_text`` against the real ``sample.txt`` fixture.

    Expects the "Text" file type, a positive line count, and content that
    contains "test text file" when compared case-insensitively.
    """
    parsed = parse_text(sample_text_file)

    assert parsed["file_type"] == "Text"
    assert parsed["lines"] > 0

    lowered = parsed["content"].lower()
    assert "test text file" in lowered
def test_parse_csv_success(sample_csv_file) -> None:
    """Check ``parse_text`` against the real ``sample.csv`` fixture.

    Expects the "CSV" file type, a positive line count, and content that
    includes the header row "Name,Age,City" and the value "Alice".
    """
    parsed = parse_text(sample_csv_file)

    assert parsed["file_type"] == "CSV"
    assert parsed["lines"] > 0

    content = parsed["content"]
    for expected in ("Name,Age,City", "Alice"):
        assert expected in content
def test_parse_text_file_not_found() -> None:
    """Check that ``parse_text`` raises ``FileNotFoundError`` for a missing file."""
    missing_text = "nonexistent.txt"
    with pytest.raises(FileNotFoundError):
        parse_text(missing_text)
def test_parse_text_io_error_retry() -> None:
    """Check that ``parse_text`` retries on ``IOError`` before re-raising it.

    ``builtins.open`` is patched to raise on every call; the test expects
    the error to propagate after exactly three open attempts.

    NOTE(review): ``builtins.open`` is replaced for the whole ``with`` block,
    so any other ``open()`` call made while parsing is counted as well.
    """
    with patch('builtins.open') as patched_open, \
            patch('src.tools.file_parser.Path') as path_cls:
        # The fake path exists and carries a plain-text suffix.
        fake_path = Mock()
        fake_path.exists.return_value = True
        fake_path.suffix = '.txt'
        path_cls.return_value = fake_path

        # Every attempt to open the file fails with an IO error.
        patched_open.side_effect = IOError("Disk error")

        with pytest.raises(IOError):
            parse_text("test.txt")

        # Checked after the raises block so the assertion actually runs.
        attempts = patched_open.call_count
        assert attempts == 3
# ============================================================================
# Unified Parser Tests
# ============================================================================
def test_parse_file_pdf() -> None:
    """Check that ``parse_file`` hands a ``.pdf`` path to ``parse_pdf``.

    ``parse_pdf`` is patched with a canned result; the test expects that
    result to be returned and the patched parser to be called exactly once.
    """
    canned = {"file_type": "PDF"}
    with patch('src.tools.file_parser.parse_pdf', return_value=canned) as pdf_parser:
        outcome = parse_file("test.pdf")

        assert outcome["file_type"] == "PDF"
        pdf_parser.assert_called_once()
def test_parse_file_excel() -> None:
    """Check that ``parse_file`` hands a ``.xlsx`` path to ``parse_excel``.

    ``parse_excel`` is patched with a canned result; the test expects that
    result to be returned and the patched parser to be called exactly once.
    """
    canned = {"file_type": "Excel"}
    with patch('src.tools.file_parser.parse_excel', return_value=canned) as excel_parser:
        outcome = parse_file("test.xlsx")

        assert outcome["file_type"] == "Excel"
        excel_parser.assert_called_once()
def test_parse_file_word() -> None:
    """Check that ``parse_file`` hands a ``.docx`` path to ``parse_word``.

    ``parse_word`` is patched with a canned result; the test expects that
    result to be returned and the patched parser to be called exactly once.
    """
    canned = {"file_type": "Word"}
    with patch('src.tools.file_parser.parse_word', return_value=canned) as word_parser:
        outcome = parse_file("test.docx")

        assert outcome["file_type"] == "Word"
        word_parser.assert_called_once()
def test_parse_file_text() -> None:
    """Check that ``parse_file`` hands a ``.txt`` path to ``parse_text``.

    ``parse_text`` is patched with a canned result; the test expects that
    result to be returned and the patched parser to be called exactly once.
    """
    canned = {"file_type": "Text"}
    with patch('src.tools.file_parser.parse_text', return_value=canned) as text_parser:
        outcome = parse_file("test.txt")

        assert outcome["file_type"] == "Text"
        text_parser.assert_called_once()
def test_parse_file_unsupported_extension() -> None:
    """Check that ``parse_file`` rejects an extension it does not handle.

    A ``.mp4`` path must raise ``ValueError`` whose message matches
    "Unsupported file type".
    """
    unsupported = "test.mp4"
    expected_message = "Unsupported file type"
    with pytest.raises(ValueError, match=expected_message):
        parse_file(unsupported)
def test_parse_file_xls_extension() -> None:
    """Check that ``parse_file`` hands a legacy ``.xls`` path to ``parse_excel``.

    ``parse_excel`` is patched with a canned result; the test expects that
    result to be returned and the patched parser to be called exactly once.
    """
    canned = {"file_type": "Excel"}
    with patch('src.tools.file_parser.parse_excel', return_value=canned) as excel_parser:
        outcome = parse_file("test.xls")

        assert outcome["file_type"] == "Excel"
        excel_parser.assert_called_once()