import pytest
import requests_mock
from bs4 import BeautifulSoup

from ankigen_core.crawler import WebCrawler

BASE_URL = "http://example.com"
SUB_PAGE_URL = f"{BASE_URL}/subpage"
EXTERNAL_URL = "http://anotherdomain.com"
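
# These tests assume the WebCrawler surface implied by the assertions below
# (not read from crawler.py itself): a constructor taking start_url,
# max_depth, include_patterns, and exclude_patterns; a crawl() method,
# optionally accepting progress_callback, that returns page objects exposing
# url, title, text_content, meta_description, meta_keywords, crawl_depth, and
# parent_url; a visited_urls collection; and the internal helpers
# _is_valid_url, _extract_links, and _extract_text.
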

@pytest.fixture
def crawler_fixture():
    return WebCrawler(start_url=BASE_URL, max_depth=1)


@pytest.fixture
def crawler_with_patterns_fixture():
    return WebCrawler(
        start_url=BASE_URL,
        max_depth=1,
        include_patterns=[r"http://example\.com/docs/.*"],
        exclude_patterns=[r"http://example\.com/docs/v1/.*"],
    )

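
# --- URL validation (_is_valid_url) ---
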

def test_is_valid_url_valid(crawler_fixture):
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/page1")
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/another/page")


def test_is_valid_url_different_domain(crawler_fixture):
    assert not crawler_fixture._is_valid_url("http://otherdomain.com/page")


def test_is_valid_url_different_scheme(crawler_fixture):
    assert not crawler_fixture._is_valid_url("ftp://example.com/page")
    assert not crawler_fixture._is_valid_url("mailto:user@example.com")


def test_is_valid_url_malformed(crawler_fixture):
    assert not crawler_fixture._is_valid_url("htp://example.com/page")
    assert not crawler_fixture._is_valid_url("http:///page")


def test_is_valid_url_include_patterns_match(crawler_with_patterns_fixture):
    assert crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/page1")
    assert crawler_with_patterns_fixture._is_valid_url(
        f"{BASE_URL}/docs/topic/subtopic"
    )


def test_is_valid_url_include_patterns_no_match(crawler_with_patterns_fixture):
    assert not crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/blog/page1")


def test_is_valid_url_exclude_patterns_match(crawler_with_patterns_fixture):
    assert not crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/v1/page1")


def test_is_valid_url_exclude_patterns_no_match(crawler_with_patterns_fixture):
    assert crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}/docs/v2/page1")


def test_is_valid_url_no_patterns_defined(crawler_fixture):
    assert crawler_fixture._is_valid_url(f"{BASE_URL}/any/path")

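
# --- Link extraction (_extract_links) ---
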

@pytest.mark.parametrize(
    "html_content, base_url, expected_links",
    [
        # Relative and absolute same-domain links are both resolved.
        (
            """<a href="/page1">1</a> <a href="http://example.com/page2">2</a>""",
            BASE_URL,
            [f"{BASE_URL}/page1", f"{BASE_URL}/page2"],
        ),
        # Fragment-only and javascript: links are skipped.
        (
            """<a href="#section">S</a> <a href="javascript:void(0)">JS</a> <a href="/page3">3</a>""",
            BASE_URL,
            [f"{BASE_URL}/page3"],
        ),
        # Links to other domains are skipped.
        (
            """<a href="http://anotherdomain.com">Ext</a> <a href="/page4">4</a>""",
            BASE_URL,
            [f"{BASE_URL}/page4"],
        ),
        # Anchors without an href contribute nothing.
        ("""<a>No Href</a> <a href="/page5">5</a>""", BASE_URL, [f"{BASE_URL}/page5"]),
        # Anchors with an empty href contribute nothing.
        (
            """<a href="">Empty Href</a> <a href="/page6">6</a>""",
            BASE_URL,
            [f"{BASE_URL}/page6"],
        ),
        # Relative links resolve against the page's own base URL.
        (
            """<a href="sub/page7">7</a>""",
            f"{BASE_URL}/path/",
            [f"{BASE_URL}/path/sub/page7"],
        ),
    ],
)
def test_extract_links(crawler_fixture, html_content, base_url, expected_links):
    soup = BeautifulSoup(html_content, "html.parser")
    actual_links = crawler_fixture._extract_links(soup, base_url)
    assert sorted(actual_links) == sorted(expected_links)


def test_extract_links_with_filtering(crawler_with_patterns_fixture):
    html = """
    <a href="http://example.com/docs/pageA">Allowed Doc</a>
    <a href="http://example.com/docs/v1/pageB">Excluded Doc v1</a>
    <a href="http://example.com/blog/pageC">Non-Doc Page</a>
    <a href="http://example.com/docs/v2/pageD">Allowed Doc v2</a>
    """
    soup = BeautifulSoup(html, "html.parser")

    expected = [f"{BASE_URL}/docs/pageA", f"{BASE_URL}/docs/v2/pageD"]
    actual_links = crawler_with_patterns_fixture._extract_links(soup, BASE_URL)
    assert sorted(actual_links) == sorted(expected)

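
# --- Text extraction (_extract_text) ---
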

@pytest.mark.parametrize(
    "html_content, expected_text",
    [
        # <script> and <style> content is dropped, while the <title> text is kept.
        (
            "<html><head><title>T</title><script>alert('x');</script><style>.c{}</style></head><body><p>Hello</p><div>World</div></body></html>",
            "T Hello World",
        ),
        ("<body>Just text</body>", "Just text"),
        # Layout elements such as <nav> and <footer> still contribute text.
        (
            "<body><nav>Menu</nav><main><p>Main content</p></main><footer>Foot</footer></body>",
            "Menu Main content Foot",
        ),
    ],
)
def test_extract_text(crawler_fixture, html_content, expected_text):
    soup = BeautifulSoup(html_content, "html.parser")
    assert crawler_fixture._extract_text(soup) == expected_text

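
# --- End-to-end crawl behavior ---
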

def test_crawl_single_page_no_links(crawler_fixture):
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text="<html><head><title>Test Title</title></head><body>No links here.</body></html>",
        )

        pages = crawler_fixture.crawl()

        assert len(pages) == 1
        page = pages[0]
        assert page.url == BASE_URL
        assert page.title == "Test Title"
        assert "No links here" in page.text_content
        assert page.meta_description is None
        assert page.meta_keywords == []


def test_crawl_with_links_and_depth(crawler_fixture):
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title><meta name="description" content="Main page desc"><meta name="keywords" content="main, test"></head>
            <body><a href="{SUB_PAGE_URL}">Subpage</a> <a href="{EXTERNAL_URL}">External</a></body></html>""",
        )
        m.get(
            SUB_PAGE_URL,
            text="""<html><head><title>Sub</title></head><body>Subpage content. <a href="http://example.com/another_sub">Deeper</a></body></html>""",
        )
        m.get(EXTERNAL_URL, text="External content")

        pages = crawler_fixture.crawl()

        assert len(pages) == 2

        main_page = next(p for p in pages if p.url == BASE_URL)
        sub_page = next(p for p in pages if p.url == SUB_PAGE_URL)

        assert main_page.title == "Main"
        assert main_page.meta_description == "Main page desc"
        assert sorted(main_page.meta_keywords) == sorted(["main", "test"])
        assert "Subpage" in main_page.text_content

        assert sub_page.title == "Sub"
        assert "Subpage content" in sub_page.text_content
        assert sub_page.crawl_depth == 1
        assert sub_page.parent_url == BASE_URL

        # Only the start page and the subpage are on the crawled domain, so
        # exactly two URLs should be recorded as visited; EXTERNAL_URL is skipped.
        assert len(crawler_fixture.visited_urls) == 2


def test_crawl_respects_max_depth_zero(crawler_fixture):
    crawler_fixture.max_depth = 0
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Depth Zero</title></head>
            <body><a href="{SUB_PAGE_URL}">Link</a></body></html>""",
        )

        pages = crawler_fixture.crawl()
        assert len(pages) == 1
        assert pages[0].url == BASE_URL
        assert pages[0].title == "Depth Zero"
        assert len(crawler_fixture.visited_urls) == 1


def test_crawl_handles_http_error(crawler_fixture):
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title></head><body><a href="{SUB_PAGE_URL}">Subpage</a></body></html>""",
        )
        m.get(SUB_PAGE_URL, status_code=404, text="Not Found")

        pages = crawler_fixture.crawl()

        # The 404 page yields no page object, but its URL is still recorded as visited.
        assert len(pages) == 1
        assert pages[0].url == BASE_URL
        assert SUB_PAGE_URL in crawler_fixture.visited_urls


def test_crawl_include_exclude_patterns(crawler_with_patterns_fixture):
    page_docs_allowed = f"{BASE_URL}/docs/allowed"
    page_docs_v1_excluded = f"{BASE_URL}/docs/v1/excluded"
    page_docs_v2_allowed = f"{BASE_URL}/docs/v2/allowed_link"
    page_blog_excluded = f"{BASE_URL}/blog/initial_link"

    # Re-point the crawl at a start URL that itself matches the include pattern.
    crawler_with_patterns_fixture.start_url = page_docs_allowed

    with requests_mock.Mocker() as m:
        m.get(
            page_docs_allowed,
            text=f"""<html><head><title>Docs Allowed</title></head>
            <body>
            <a href="{page_docs_v1_excluded}">To Excluded v1</a>
            <a href="{page_docs_v2_allowed}">To Allowed v2</a>
            <a href="{page_blog_excluded}">To Blog</a>
            </body></html>""",
        )
        # The excluded pages are mocked as well, but the include/exclude
        # patterns should keep them out of the crawl results.
        m.get(page_docs_v1_excluded, text="V1 Excluded Content")
        m.get(
            page_docs_v2_allowed,
            text="<html><head><title>Docs V2 Allowed</title></head><body>V2 Content</body></html>",
        )
        m.get(page_blog_excluded, text="Blog Content")

        pages = crawler_with_patterns_fixture.crawl()

        assert len(pages) == 2

        crawled_urls = [p.url for p in pages]
        assert page_docs_allowed in crawled_urls
        assert page_docs_v2_allowed in crawled_urls
        assert page_docs_v1_excluded not in crawled_urls
        assert page_blog_excluded not in crawled_urls

        page_v2 = next(p for p in pages if p.url == page_docs_v2_allowed)
        assert page_v2.title == "Docs V2 Allowed"


def test_crawl_progress_callback(crawler_fixture):
    progress_log = []

    def callback(processed_count, total_urls, current_url):
        progress_log.append((processed_count, total_urls, current_url))

    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title></head>
            <body>
            <a href="{SUB_PAGE_URL}">Subpage</a>
            <a href="{BASE_URL}/another">Another</a>
            </body></html>""",
        )
        m.get(SUB_PAGE_URL, text="<html><body>Sub</body></html>")
        m.get(f"{BASE_URL}/another", text="<html><body>Another</body></html>")

        crawler_fixture.crawl(progress_callback=callback)

    # Three URLs are fetched; the expected total of 7 invocations pins down the
    # callback cadence of WebCrawler.crawl (an implementation detail, e.g.
    # updates before and after each URL plus a final report).
    assert len(progress_log) == 7

    # The first callback fires for the start URL, before any page has been
    # processed.
    assert progress_log[0][0] == 0
    assert progress_log[0][2] == BASE_URL