"""Tests for fetch functionality.""" import pytest from unittest.mock import AsyncMock, patch, MagicMock from app.fetch.fetcher import get_paragraphs_for_url @pytest.mark.asyncio async def test_get_paragraphs_success(): """Test successful content fetching and extraction.""" # Mock HTML content mock_html = """

This is the first paragraph of the main content.

This is the second paragraph with important information.

This is the third paragraph continuing the story.

""" with patch('httpx.AsyncClient.get') as mock_get: mock_response = MagicMock() mock_response.text = mock_html mock_response.headers = {"content-type": "text/html"} mock_response.raise_for_status = MagicMock() mock_get.return_value = mock_response paragraphs = await get_paragraphs_for_url("https://example.com/article") assert len(paragraphs) >= 1 # Should contain meaningful content assert any("paragraph" in p.lower() for p in paragraphs) @pytest.mark.asyncio async def test_get_paragraphs_trafilatura_fallback(): """Test that trafilatura extraction works.""" # Test with a simple HTML structure that trafilatura can handle mock_html = """ Test Article

Test Article Title

This is a test paragraph that should be extracted by trafilatura.

This is another paragraph with substantive content for testing.

""" with patch('httpx.AsyncClient.get') as mock_get: mock_response = MagicMock() mock_response.text = mock_html mock_response.headers = {"content-type": "text/html"} mock_response.raise_for_status = MagicMock() mock_get.return_value = mock_response paragraphs = await get_paragraphs_for_url("https://example.com/test") # Should extract at least some content assert len(paragraphs) >= 1 # Content should be meaningful (not just whitespace) assert any(len(p.strip()) > 10 for p in paragraphs) @pytest.mark.asyncio async def test_get_paragraphs_min_length_filter(): """Test that short paragraphs are filtered out.""" mock_html = """

Short.

This is a longer paragraph that should be included in the results.

OK

Another substantial paragraph with enough content to be useful for analysis.

""" with patch('httpx.AsyncClient.get') as mock_get: mock_response = MagicMock() mock_response.text = mock_html mock_response.headers = {"content-type": "text/html"} mock_response.raise_for_status = MagicMock() mock_get.return_value = mock_response paragraphs = await get_paragraphs_for_url("https://example.com/test") # Should filter out very short paragraphs for p in paragraphs: assert len(p.strip()) >= 10 @pytest.mark.asyncio async def test_get_paragraphs_http_error(): """Test handling of HTTP errors.""" with patch('httpx.AsyncClient.get') as mock_get: mock_get.side_effect = Exception("HTTP 404") paragraphs = await get_paragraphs_for_url("https://example.com/notfound") # Should return empty list on error assert paragraphs == [] @pytest.mark.asyncio async def test_get_paragraphs_empty_content(): """Test handling of empty or minimal content.""" mock_html = "" with patch('httpx.AsyncClient.get') as mock_get: mock_response = MagicMock() mock_response.text = mock_html mock_response.headers = {"content-type": "text/html"} mock_response.raise_for_status = MagicMock() mock_get.return_value = mock_response paragraphs = await get_paragraphs_for_url("https://example.com/empty") # Should handle empty content gracefully assert isinstance(paragraphs, list)