| import pickle |
| import time |
| import logging |
|
|
| import pytest |
| from cssselect import SelectorError, SelectorSyntaxError |
|
|
| from scrapling import Selector |
# Set the "scrapling" logger to DEBUG level for everything in this module.
logging.getLogger("scrapling").setLevel(logging.DEBUG)
|
|
|
|
@pytest.fixture
def html_content():
    """Provide the sample HTML document used by the tests in this module.

    The markup holds a navigation header, a ``#products`` section with three
    ``article.product`` entries, a ``#reviews`` section with two
    ``div.review`` entries, a footer, and a JSON ``script#page-data`` block.
    """
    document = """
<html>
<head>
<title>Complex Web Page</title>
<style>
.hidden { display: none; }
</style>
</head>
<body>
<header>
<nav>
<ul>
<li><a href="#home">Home</a></li>
<li><a href="#about">About</a></li>
<li><a href="#contact">Contact</a></li>
</ul>
</nav>
</header>
<main>
<section id="products" schema='{"jsonable": "data"}'>
<h2>Products</h2>
<div class="product-list">
<article class="product" data-id="1">
<h3>Product 1</h3>
<p class="description">This is product 1</p>
<span class="price">$10.99</span>
<div class="hidden stock">In stock: 5</div>
</article>
<article class="product" data-id="2">
<h3>Product 2</h3>
<p class="description">This is product 2</p>
<span class="price">$20.99</span>
<div class="hidden stock">In stock: 3</div>
</article>
<article class="product" data-id="3">
<h3>Product 3</h3>
<p class="description">This is product 3</p>
<span class="price">$15.99</span>
<div class="hidden stock">Out of stock</div>
</article>
</div>
</section>
<section id="reviews">
<h2>Customer Reviews</h2>
<div class="review-list">
<div class="review" data-rating="5">
<p class="review-text">Great product!</p>
<span class="reviewer">John Doe</span>
</div>
<div class="review" data-rating="4">
<p class="review-text">Good value for money.</p>
<span class="reviewer">Jane Smith</span>
</div>
</div>
</section>
</main>
<footer>
<p>© 2024 Our Company</p>
</footer>
<script id="page-data" type="application/json">
{"lastUpdated": "2024-09-22T10:30:00Z", "totalProducts": 3}
</script>
</body>
</html>
"""
    return document
|
|
|
|
@pytest.fixture
def page(html_content):
    """Build a ``Selector`` over the sample document with ``adaptive`` off."""
    parsed_document = Selector(html_content, adaptive=False)
    return parsed_document
|
|
|
|
| |
class TestCSSSelectors:
    """CSS selector queries against the sample page."""

    def test_basic_product_selection(self, page):
        """A descendant CSS chain reaches all three product articles."""
        selector = "main #products .product-list article.product"
        matched = page.css(selector)
        assert len(matched) == 3

    def test_in_stock_product_selection(self, page):
        """Products containing the text "Out of stock" are excluded."""
        selector = (
            'main #products .product-list article.product:not(:contains("Out of stock"))'
        )
        available = page.css(selector)
        assert len(available) == 2
|
|
|
|
| |
class TestXPathSelectors:
    """XPath queries against the sample page."""

    def test_high_rating_reviews(self, page):
        """Both reviews in the sample page have a rating of at least 4."""
        query = '//section[@id="reviews"]//div[contains(@class, "review") and @data-rating >= 4]'
        top_reviews = page.xpath(query)
        assert len(top_reviews) == 2

    def test_high_priced_products(self, page):
        """Two products in the sample page are priced above 15."""
        # The predicate strips the "$" prefix and any commas before the
        # numeric comparison.
        query = (
            '//article[contains(@class, "product")]'
            '[number(translate(substring-after(.//span[@class="price"], "$"), ",", "")) > 15]'
        )
        expensive = page.xpath(query)
        assert len(expensive) == 2
|
|
|
|
| |
class TestTextMatching:
    """Lookups by regular expression and by literal text."""

    def test_regex_multiple_matches(self, page):
        """With ``first_match=False`` both "In stock: N" elements come back."""
        matches = page.find_by_regex(r"In stock: \d+", first_match=False)
        assert len(matches) == 2

    def test_regex_first_match(self, page):
        """With ``first_match=True`` the first stock element is returned."""
        first_hit = page.find_by_regex(
            r"In stock: \d+",
            first_match=True,
            case_sensitive=True,
        )
        assert first_hit.text == "In stock: 5"

    def test_partial_text_match(self, page):
        """A partial text search finds both elements containing "In stock:"."""
        matches = page.find_by_text(r"In stock:", partial=True, first_match=False)
        assert len(matches) == 2

    def test_exact_text_match(self, page):
        """An exact text search finds the single "Out of stock" element."""
        matches = page.find_by_text("Out of stock", partial=False, first_match=False)
        assert len(matches) == 1
|
|
|
|
| |
class TestSimilarElements:
    """Behaviour of ``find_similar`` on products and reviews."""

    def test_finding_similar_products(self, page):
        """The first product has exactly two similar elements."""
        reference = page.css(".product").first
        lookalikes = reference.find_similar()
        assert len(lookalikes) == 2

    def test_finding_similar_reviews(self, page):
        """One element similar to the first review is rated 4 or higher."""
        reference = page.find("div", class_="review")
        lookalikes = reference.find_similar()
        # A missing ``data-rating`` attribute counts as a rating of 0.
        well_rated = [
            candidate
            for candidate in lookalikes
            if int(candidate.attrib.get("data-rating", 0)) >= 4
        ]
        assert len(well_rated) == 1
|
|
|
|
| |
class TestErrorHandling:
    """Exceptions expected for invalid construction arguments and selectors."""

    def test_invalid_selector_initialization(self):
        """Constructing a ``Selector`` from missing or integer content fails."""
        # No content at all.
        with pytest.raises(ValueError):
            Selector(adaptive=False)

        # Content given as an integer.
        with pytest.raises(TypeError):
            Selector(content=1, adaptive=False)

    def test_invalid_storage(self, page, html_content):
        """Passing ``object`` as ``storage`` with ``adaptive=True`` fails."""
        with pytest.raises(ValueError):
            Selector(html_content, storage=object, adaptive=True)

    def test_bad_selectors(self, page):
        """A malformed expression is rejected by both ``css`` and ``xpath``."""
        selector_errors = (SelectorError, SelectorSyntaxError)
        for run_query in (page.css, page.xpath):
            with pytest.raises(selector_errors):
                run_query("4 ayo")
|
|
|
|
| |
class TestPicklingAndRepresentation:
    """Pickling refusal and string representations of selected elements."""

    def test_unpickleable_objects(self, page):
        """``pickle.dumps`` on a selected element raises ``TypeError``."""
        product_list = page.css(".product-list")[0]
        with pytest.raises(TypeError):
            pickle.dumps(product_list)

    def test_string_representations(self, page):
        """``__str__``/``__repr__`` of an element and its attributes give ``str``."""
        product_list = page.css(".product-list")[0]
        renderers = (
            product_list.__str__,
            product_list.__repr__,
            product_list.attrib.__str__,
            product_list.attrib.__repr__,
        )
        # The dunder methods are called directly so their own return types
        # are what gets checked.
        for render in renderers:
            assert isinstance(render(), str)
|
|
|
|
| |
class TestElementNavigation:
    """Navigation between an element and its parent, siblings and children."""

    def test_basic_navigation_properties(self, page):
        """Test basic navigation properties of elements"""
        product_list = page.css(".product-list")[0]
        assert product_list.path is not None
        assert product_list.html_content != ""
        assert product_list.prettify() != ""

    def test_parent_and_sibling_navigation(self, page):
        """Test parent and sibling navigation"""
        product_list = page.css(".product-list")[0]
        parent = product_list.parent
        assert parent["id"] == "products"

        # In the sample page ``#products`` sits next to one other section
        # (``#reviews``) inside ``<main>``.
        parent_siblings = parent.siblings
        assert len(parent_siblings) == 1

    def test_child_navigation(self, page):
        """Test child navigation"""
        product_list = page.css(".product-list")[0]
        children = product_list.children
        assert len(children) == 3

    def test_next_and_previous_navigation(self, page):
        """Test next and previous element navigation"""
        child = page.css(".product-list")[0].find({"data-id": "1"})
        next_element = child.next
        assert next_element.attrib["data-id"] == "2"

        prev_element = next_element.previous
        assert prev_element.tag == child.tag
        # Every product shares the same tag, so the tag check alone cannot
        # tell the products apart; also confirm that stepping back lands on
        # the element the walk started from.
        assert prev_element.attrib["data-id"] == "1"

    def test_ancestor_finding(self, page):
        """Test finding ancestors of elements"""
        all_prices = page.css(".price")
        products_with_prices = [
            price.find_ancestor(lambda p: p.has_class("product"))
            for price in all_prices
        ]
        assert len(products_with_prices) == 3
        # The length only counts the price elements; additionally require
        # that each lookup really produced an ancestor with the "product"
        # class, so a failed lookup cannot slip through.
        assert all(
            ancestor is not None and ancestor.has_class("product")
            for ancestor in products_with_prices
        )
|
|
|
|
| |
class TestJSONAndAttributes:
    """JSON decoding of text content and attribute access helpers."""

    def test_json_conversion(self, page):
        """The ``#page-data`` script text decodes to the expected JSON."""
        raw_json = page.css("#page-data::text")[0].get()
        assert isinstance(raw_json.sort(), str)

        decoded = raw_json.json()
        assert decoded["totalProducts"] == 3
        assert "lastUpdated" in decoded

    def test_attribute_operations(self, page):
        """Attributes can be read, tested, searched by value and JSON-decoded."""
        # Product attributes
        products = page.css(".product")
        assert [item.attrib["data-id"] for item in products] == ["1", "2", "3"]
        assert "data-id" in products[0]

        # Review ratings
        ratings = [int(item.attrib["data-rating"]) for item in page.css(".review")]
        assert sum(ratings) / len(ratings) == 4.5

        # Searching attribute values, first exactly and then partially
        for partial in (False, True):
            found = list(products[0].attrib.search_values("1", partial=partial))
            assert list(found[0].keys()) == ["data-id"]

        # JSON stored inside an attribute
        schema = page.css("#products").first.attrib["schema"].json()
        assert schema == {"jsonable": "data"}
        assert isinstance(page.css("#products")[0].attrib.json_string, bytes)
|
|
|
|
| |
def test_large_html_parsing_performance():
    """Test parsing and selecting performance on large HTML.

    Builds a document of 5000 nested ``div.item`` elements, then times the
    construction of a ``Selector`` from it plus one ``.item`` CSS query. The
    two steps together must finish in under half a second.
    """
    large_html = (
        "<html><body>"
        + '<div class="item">' * 5000
        + "</div>" * 5000
        + "</body></html>"
    )

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the wall clock and can jump
    # if the system time is adjusted while the test runs.
    start_time = time.perf_counter()
    parsed = Selector(large_html, adaptive=False)
    parsed.css(".item")  # the result is not inspected, only the time taken
    elapsed = time.perf_counter() - start_time

    assert elapsed < 0.5
|
|
|
|
| |
def test_selectors_generation(page):
    """Check that every element in the page yields string selectors."""
    selector_attributes = (
        "generate_css_selector",
        "generate_full_css_selector",
        "generate_xpath_selector",
        "generate_full_xpath_selector",
    )

    def _check_subtree(node: Selector):
        # Check this node first, then descend into each of its children.
        for attribute in selector_attributes:
            assert isinstance(getattr(node, attribute), str)
        for child in node.children:
            _check_subtree(child)

    _check_subtree(page)
|
|
|
|
| |
def test_getting_all_text(page):
    """The text collected from the whole page is not empty."""
    collected = page.get_all_text()
    assert collected != ""
|
|
|
|
def test_regex_on_text(page):
    """Regex helpers work on an element and on its text."""
    price = page.css('[data-id="1"] .price')[0]

    # First run of digits and dots in the price element.
    assert price.re_first(r"[\.\d]+") == "10.99"

    # The price text contains two separate digit groups.
    digit_groups = price.text.re(r"(\d+)", replace_entities=False)
    assert len(digit_groups) == 2
|
|