File size: 3,960 Bytes
a85c9b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import hashlib
from unittest.mock import Mock, patch

import pytest
from requests import Response

from embedchain.loaders.docs_site_loader import DocsSiteLoader


@pytest.fixture
def mock_requests_get():
    with patch("requests.get") as mock_get:
        yield mock_get


@pytest.fixture
def docs_site_loader():
    return DocsSiteLoader()


def test_get_child_links_recursive(mock_requests_get, docs_site_loader):
    mock_response = Mock()
    mock_response.status_code = 200
    mock_response.text = """
        <html>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
        </html>
    """
    mock_requests_get.return_value = mock_response

    docs_site_loader._get_child_links_recursive("https://example.com")

    assert len(docs_site_loader.visited_links) == 2
    assert "https://example.com/page1" in docs_site_loader.visited_links
    assert "https://example.com/page2" in docs_site_loader.visited_links


def test_get_child_links_recursive_status_not_200(mock_requests_get, docs_site_loader):
    mock_response = Mock()
    mock_response.status_code = 404
    mock_requests_get.return_value = mock_response

    docs_site_loader._get_child_links_recursive("https://example.com")

    assert len(docs_site_loader.visited_links) == 0


def test_get_all_urls(mock_requests_get, docs_site_loader):
    mock_response = Mock()
    mock_response.status_code = 200
    mock_response.text = """
        <html>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
            <a href="https://example.com/external">External</a>
        </html>
    """
    mock_requests_get.return_value = mock_response

    all_urls = docs_site_loader._get_all_urls("https://example.com")

    assert len(all_urls) == 3
    assert "https://example.com/page1" in all_urls
    assert "https://example.com/page2" in all_urls
    assert "https://example.com/external" in all_urls


def test_load_data_from_url(mock_requests_get, docs_site_loader):
    mock_response = Mock()
    mock_response.status_code = 200
    mock_response.content = """
        <html>
            <nav>
                <h1>Navigation</h1>
            </nav>
            <article class="bd-article">
                <p>Article Content</p>
            </article>
        </html>
    """.encode()
    mock_requests_get.return_value = mock_response

    data = docs_site_loader._load_data_from_url("https://example.com/page1")

    assert len(data) == 1
    assert data[0]["content"] == "Article Content"
    assert data[0]["meta_data"]["url"] == "https://example.com/page1"


def test_load_data_from_url_status_not_200(mock_requests_get, docs_site_loader):
    mock_response = Mock()
    mock_response.status_code = 404
    mock_requests_get.return_value = mock_response

    data = docs_site_loader._load_data_from_url("https://example.com/page1")

    assert data == []
    assert len(data) == 0


def test_load_data(mock_requests_get, docs_site_loader):
    mock_response = Response()
    mock_response.status_code = 200
    mock_response._content = """
        <html>
            <a href="/page1">Page 1</a>
            <a href="/page2">Page 2</a>
        """.encode()
    mock_requests_get.return_value = mock_response

    url = "https://example.com"
    data = docs_site_loader.load_data(url)
    expected_doc_id = hashlib.sha256((" ".join(docs_site_loader.visited_links) + url).encode()).hexdigest()

    assert len(data["data"]) == 2
    assert data["doc_id"] == expected_doc_id


def test_if_response_status_not_200(mock_requests_get, docs_site_loader):
    mock_response = Response()
    mock_response.status_code = 404
    mock_requests_get.return_value = mock_response

    url = "https://example.com"
    data = docs_site_loader.load_data(url)
    expected_doc_id = hashlib.sha256((" ".join(docs_site_loader.visited_links) + url).encode()).hexdigest()

    assert len(data["data"]) == 0
    assert data["doc_id"] == expected_doc_id