Spaces:
No application file
No application file
import pytest | |
import responses | |
from bs4 import BeautifulSoup | |
def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker):
    """Check that only the selected element's heading and paragraph are loaded.

    A child page embedding ``selectee`` (with ``ignored_tag`` spliced into
    it) and a root page linking to that child are registered as mocked HTTP
    responses. The loader is then expected to return the mocked document id
    and one entry whose content is the ``h2`` text and ``p`` text of
    ``selectee`` joined by a single space.

    NOTE(review): ``selectee`` and ``ignored_tag`` are presumably supplied by
    a ``pytest.mark.parametrize`` decorator that is not visible here; confirm.
    """
    root_url = "https://docs.embedchain.ai/"
    quickstart_url = "https://docs.embedchain.ai/quickstart"

    # Child page: the element under test wrapped in a minimal HTML document.
    selectee = selectee.format(ignored_tag=ignored_tag)
    child_template = """
<!DOCTYPE html>
<html lang="en">
<body>
{selectee}
</body>
</html>
"""
    mocked_responses.get(
        quickstart_url,
        body=child_template.format(selectee=selectee),
        status=200,
        content_type="text/html",
    )

    # Root page: contains only the link that leads to the child page.
    root_page = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/quickstart">Quickstart</a></li>
</body>
</html>
"""
    mocked_responses.get(root_url, body=root_page, status=200, content_type="text/html")

    # Pin the hash so the returned doc_id is predictable.
    doc_id = "mocked_hash"
    sha256_patch = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    sha256_patch.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(root_url)

    # Derive the expected text from the same markup the loader received.
    parsed_selectee = BeautifulSoup(selectee, "html.parser")
    heading_text = parsed_selectee.select_one("h2").get_text()
    paragraph_text = parsed_selectee.select_one("p").get_text()
    expected_entry = {
        "content": " ".join([heading_text, paragraph_text]),
        "meta_data": {"url": "https://docs.embedchain.ai/quickstart"},
    }

    assert result["doc_id"] == doc_id
    assert result["data"] == [expected_entry]
def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker):
    """Check that every child page linked from the root page is loaded.

    Three mocked pages are registered: the root page, which links to
    ``/quickstart`` and ``/introduction``, and those two child pages, each of
    which links back to ``/`` and to itself. The loader is expected to return
    the mocked document id and exactly one entry per child page.
    """
    child_url = "https://docs.embedchain.ai/quickstart"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/">..</a></li>
<li><a href="/quickstart">.</a></li>
</body>
</html>
"""
    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")

    child_url = "https://docs.embedchain.ai/introduction"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/">..</a></li>
<li><a href="/introduction">.</a></li>
</body>
</html>
"""
    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/quickstart">Quickstart</a></li>
<li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    # Pin the hash so the returned doc_id is predictable.
    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)

    assert result["doc_id"] == doc_id
    expected_data = [
        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}},
        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}},
    ]

    # Compare order-independently but exactly. A bare
    # ``all(item in expected_data for item in result["data"])`` is vacuously
    # true for an empty result and also accepts missing or duplicated entries.
    def by_url(item):
        return item["meta_data"]["url"]

    assert sorted(result["data"], key=by_url) == sorted(expected_data, key=by_url)
def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker):
    """Check that a child page answering 404 contributes no data.

    The root page links to ``/introduction``, whose mocked response has
    status 404. The loader is expected to still return the mocked document
    id, with an empty ``data`` list.
    """
    child_url = "https://docs.embedchain.ai/introduction"
    mocked_responses.get(child_url, status=404)

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    # Pin the hash so the returned doc_id is predictable.
    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)

    # Compare by value: string identity (``is``) only holds by accident of the
    # mock handing back the very same object, and is not what is being tested.
    assert result["doc_id"] == doc_id
    assert result["data"] == []
@pytest.fixture
def loader():
    """Provide a fresh ``DocsSiteLoader`` instance to each test.

    The import is done inside the fixture, so the loader module is only
    imported when a test actually requests this fixture.
    """
    from embedchain.loaders.docs_site_loader import DocsSiteLoader

    return DocsSiteLoader()
@pytest.fixture
def mocked_responses():
    """Yield a ``responses.RequestsMock`` that is active while the test runs.

    The mock is used as a context manager around the ``yield``, so it is
    exited once the requesting test has finished.
    """
    with responses.RequestsMock() as rsps:
        yield rsps