File size: 3,335 Bytes
a85c9b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import hashlib
from unittest.mock import Mock, patch

import pytest

from embedchain.loaders.web_page import WebPageLoader


@pytest.fixture
def web_page_loader():
    """Build the ``WebPageLoader`` instance handed to tests that request this fixture."""
    loader = WebPageLoader()
    return loader


def test_load_data(web_page_loader):
    """Verify ``load_data`` output for a stubbed HTTP response.

    The loader's session ``get`` is patched to return a fake 200 response
    carrying a small HTML page. The result must expose a ``doc_id`` equal to
    the SHA-256 hex digest of the cleaned content concatenated with the URL,
    and a single ``data`` entry holding that content and the URL metadata.
    """
    page_url = "https://example.com/page"
    page_html = """
        <html>
            <head>
                <title>Test Page</title>
            </head>
            <body>
                <div id="content">
                    <p>This is some test content.</p>
                </div>
            </body>
        </html>
    """
    fake_response = Mock(status_code=200, content=page_html)

    # Only the network call is stubbed; everything else runs for real.
    session_get = "embedchain.loaders.web_page.WebPageLoader._session.get"
    with patch(session_get, return_value=fake_response):
        result = web_page_loader.load_data(page_url)

    # Recompute the cleaned text with the loader's own helper so the
    # expectations follow whatever cleaning rules it applies.
    content = web_page_loader._get_clean_content(fake_response.content, page_url)
    digest_input = (content + page_url).encode()

    assert result["doc_id"] == hashlib.sha256(digest_input).hexdigest()
    assert result["data"] == [
        {
            "content": content,
            "meta_data": {
                "url": page_url,
            },
        }
    ]


def test_get_clean_content_excludes_unnecessary_info(web_page_loader):
    """Check that ``_get_clean_content`` output contains none of the excluded names.

    A sample page is passed through ``_get_clean_content`` and the returned
    text is checked, by substring, against a list of tag names, element ids
    and class names. The cleaned text must also be non-empty.
    """
    mock_html = """
        <html>
        <head>
            <title>Sample HTML</title>
            <style>
                /* Stylesheet to be excluded */
                .elementor-location-header {
                    background-color: #f0f0f0;
                }
            </style>
        </head>
        <body>
            <header id="header">Header Content</header>
            <nav class="nav">Nav Content</nav>
            <aside>Aside Content</aside>
            <form>Form Content</form>
            <main>Main Content</main>
            <footer class="footer">Footer Content</footer>
            <script>Some Script</script>
            <noscript>NoScript Content</noscript>
            <svg>SVG Content</svg>
            <canvas>Canvas Content</canvas>
            
            <div id="sidebar">Sidebar Content</div>
            <div id="main-navigation">Main Navigation Content</div>
            <div id="menu-main-menu">Menu Main Menu Content</div>
            
            <div class="header-sidebar-wrapper">Header Sidebar Wrapper Content</div>
            <div class="blog-sidebar-wrapper">Blog Sidebar Wrapper Content</div>
            <div class="related-posts">Related Posts Content</div>
        </body>
        </html>
    """

    tags_to_exclude = [
        "nav",
        "aside",
        "form",
        "header",
        "noscript",
        "svg",
        "canvas",
        "footer",
        "script",
        "style",
    ]
    ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"]
    classes_to_exclude = [
        "elementor-location-header",
        "navbar-header",
        "nav",
        "header-sidebar-wrapper",
        "blog-sidebar-wrapper",
        "related-posts",
    ]

    content = web_page_loader._get_clean_content(mock_html, "https://example.com/page")

    # NOTE(review): these are case-sensitive substring checks against
    # lowercase names, while the visible text in the fixture is capitalised
    # ("Nav Content", "Sidebar Content", ...). They therefore mainly catch
    # markup/CSS leaking into the output; confirm whether stricter checks on
    # the element text are wanted.
    for tag in tags_to_exclude:
        assert tag not in content

    # ``element_id`` rather than ``id`` so the builtin is not shadowed.
    for element_id in ids_to_exclude:
        assert element_id not in content

    for class_name in classes_to_exclude:
        assert class_name not in content

    # Stripping the unwanted parts must still leave some text behind.
    assert len(content) > 0