# NOTE: hosting-page residue captured with this file ("Spaces: No application file"); not part of the test module.
import hashlib
from unittest.mock import Mock, patch

import pytest
from requests import Response

from embedchain.loaders.docs_site_loader import DocsSiteLoader
@pytest.fixture
def mock_requests_get():
    """Patch ``requests.get`` for one test and yield the replacement mock.

    Tests set ``return_value`` on the yielded mock to control the response
    object seen by code that calls ``requests.get``. The patch is undone when
    the ``with`` block exits, i.e. after the requesting test has finished.

    Yields:
        The mock object installed in place of ``requests.get``.
    """
    with patch("requests.get") as mock_get:
        yield mock_get
@pytest.fixture
def docs_site_loader():
    """Provide a newly constructed ``DocsSiteLoader`` to the requesting test."""
    return DocsSiteLoader()
def test_get_child_links_recursive(mock_requests_get, docs_site_loader):
    """Check that crawling a page with two relative links records both URLs.

    ``requests.get`` is patched to return the same 200 response (a page with
    links to ``/page1`` and ``/page2``) for every call. After the crawl,
    ``visited_links`` must have two entries and contain both links resolved
    against ``https://example.com``.
    """
    mock_response = Mock()
    mock_response.status_code = 200
    # Written line by line so the markup stays byte-exact while the code
    # itself remains normally indented.
    mock_response.text = (
        "\n"
        "<html>\n"
        '<a href="/page1">Page 1</a>\n'
        '<a href="/page2">Page 2</a>\n'
        "</html>\n"
    )
    mock_requests_get.return_value = mock_response

    docs_site_loader._get_child_links_recursive("https://example.com")

    assert len(docs_site_loader.visited_links) == 2
    assert "https://example.com/page1" in docs_site_loader.visited_links
    assert "https://example.com/page2" in docs_site_loader.visited_links
def test_get_child_links_recursive_status_not_200(mock_requests_get, docs_site_loader):
    """Check that a 404 response leaves ``visited_links`` empty after a crawl."""
    mock_response = Mock()
    mock_response.status_code = 404
    mock_requests_get.return_value = mock_response

    docs_site_loader._get_child_links_recursive("https://example.com")

    assert len(docs_site_loader.visited_links) == 0
def test_get_all_urls(mock_requests_get, docs_site_loader):
    """Check that ``_get_all_urls`` returns every link found on the page.

    The patched response contains two relative links and one absolute link
    (``https://example.com/external``). The result must have three entries:
    the two relative links resolved against ``https://example.com`` plus the
    absolute one.
    """
    mock_response = Mock()
    mock_response.status_code = 200
    # Written line by line so the markup stays byte-exact while the code
    # itself remains normally indented.
    mock_response.text = (
        "\n"
        "<html>\n"
        '<a href="/page1">Page 1</a>\n'
        '<a href="/page2">Page 2</a>\n'
        '<a href="https://example.com/external">External</a>\n'
        "</html>\n"
    )
    mock_requests_get.return_value = mock_response

    all_urls = docs_site_loader._get_all_urls("https://example.com")

    assert len(all_urls) == 3
    assert "https://example.com/page1" in all_urls
    assert "https://example.com/page2" in all_urls
    assert "https://example.com/external" in all_urls
def test_load_data_from_url(mock_requests_get, docs_site_loader):
    """Check that a page load returns one record holding the article text.

    The patched response body has a ``<nav>`` section and an
    ``<article class="bd-article">`` section. The single returned record must
    have ``"Article Content"`` as its content (so the navigation heading is
    not part of it) and carry the requested URL in its metadata.
    """
    mock_response = Mock()
    mock_response.status_code = 200
    # ``content`` is bytes, hence the trailing ``.encode()``. The markup is
    # written line by line to keep it byte-exact under normal indentation.
    mock_response.content = (
        "\n"
        "<html>\n"
        "<nav>\n"
        "<h1>Navigation</h1>\n"
        "</nav>\n"
        '<article class="bd-article">\n'
        "<p>Article Content</p>\n"
        "</article>\n"
        "</html>\n"
    ).encode()
    mock_requests_get.return_value = mock_response

    data = docs_site_loader._load_data_from_url("https://example.com/page1")

    assert len(data) == 1
    assert data[0]["content"] == "Article Content"
    assert data[0]["meta_data"]["url"] == "https://example.com/page1"
def test_load_data_from_url_status_not_200(mock_requests_get, docs_site_loader):
    """Check that a 404 response makes ``_load_data_from_url`` return ``[]``."""
    mock_response = Mock()
    mock_response.status_code = 404
    mock_requests_get.return_value = mock_response

    data = docs_site_loader._load_data_from_url("https://example.com/page1")

    # Equality with the empty list already implies a length of zero, so no
    # separate length assertion is needed.
    assert data == []
def test_load_data(mock_requests_get, docs_site_loader):
    """Check ``load_data`` output for a page with two child links.

    A real ``requests.Response`` (status 200, body set via ``_content``) is
    returned for every patched ``requests.get`` call. The result must hold
    two data records, and its ``doc_id`` must equal the SHA-256 hex digest of
    the space-joined ``visited_links`` followed by the start URL.
    """
    mock_response = Response()
    mock_response.status_code = 200
    # NOTE(review): this markup has no closing </html> tag; it is reproduced
    # as found. Written line by line to stay byte-exact under indentation.
    mock_response._content = (
        "\n"
        "<html>\n"
        '<a href="/page1">Page 1</a>\n'
        '<a href="/page2">Page 2</a>\n'
    ).encode()
    mock_requests_get.return_value = mock_response

    url = "https://example.com"
    data = docs_site_loader.load_data(url)

    # Computed after the load, because it depends on the links the loader
    # recorded while crawling.
    joined_links = " ".join(docs_site_loader.visited_links)
    expected_doc_id = hashlib.sha256((joined_links + url).encode()).hexdigest()

    assert len(data["data"]) == 2
    assert data["doc_id"] == expected_doc_id
def test_if_response_status_not_200(mock_requests_get, docs_site_loader):
    """Check ``load_data`` output when every request returns a 404.

    The result must contain no data records, and its ``doc_id`` must still
    equal the SHA-256 hex digest of the space-joined ``visited_links``
    followed by the start URL.
    """
    mock_response = Response()
    mock_response.status_code = 404
    mock_requests_get.return_value = mock_response

    url = "https://example.com"
    data = docs_site_loader.load_data(url)

    # Computed after the load, from whatever links the loader recorded.
    joined_links = " ".join(docs_site_loader.visited_links)
    expected_doc_id = hashlib.sha256((joined_links + url).encode()).hexdigest()

    assert len(data["data"]) == 0
    assert data["doc_id"] == expected_doc_id