import pytest

from autogpt.commands.web_requests import scrape_links

"""
Code Analysis

Objective:
The objective of the 'scrape_links' function is to scrape hyperlinks from a
given URL and return them in a formatted way.

Inputs:
- url: a string representing the URL to be scraped.

Flow:
1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
2. Check whether the response contains an HTTP error. If it does, return an error string prefixed with "Error:".
3. Parse the HTML content of the response using the BeautifulSoup library.
4. Remove any script and style tags from the parsed HTML.
5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function.
6. Format the extracted hyperlinks using the 'format_hyperlinks' function.
7. Return the formatted hyperlinks.
(An illustrative sketch of this flow follows this docstring.)

Outputs:
- A list of formatted hyperlinks, or an error string if the request fails.

Additional aspects:
- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP
  requests and parse HTML content, respectively.
- The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
- The 'format_hyperlinks' function is called to format the extracted hyperlinks.
- The function checks for HTTP errors and returns an error string if any are found.
"""
class TestScrapeLinks:
    def test_valid_url_with_hyperlinks(self):
        # Live-network smoke test: scraping a real page should return a
        # non-empty list of hyperlink strings.
        url = "https://www.google.com"
        result = scrape_links(url)
        assert len(result) > 0
        assert isinstance(result, list)
        assert isinstance(result[0], str)

    def test_valid_url(self, mocker):
        # `mocker` is the pytest-mock fixture; patching requests.Session.get
        # avoids any real network call.
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = (
            "<html><body><a href='https://www.google.com'>Google</a></body></html>"
        )
        mocker.patch("requests.Session.get", return_value=mock_response)

        result = scrape_links("https://www.example.com")

        assert result == ["Google (https://www.google.com)"]

    def test_invalid_url(self, mocker):
        # A 4xx response should surface as an error string rather than a list.
        mock_response = mocker.Mock()
        mock_response.status_code = 404
        mocker.patch("requests.Session.get", return_value=mock_response)

        result = scrape_links("https://www.invalidurl.com")

        assert "Error:" in result

    def test_no_hyperlinks(self, mocker):
        # A page without anchor tags should yield an empty list.
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body><p>No hyperlinks here</p></body></html>"
        mocker.patch("requests.Session.get", return_value=mock_response)

        result = scrape_links("https://www.example.com")

        assert result == []

    def test_scrape_links_with_few_hyperlinks(self, mocker):
        # Multiple anchors should be returned in document order, each formatted
        # as "text (url)".
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = """
        <html>
        <body>
            <div id="google-link"><a href="https://www.google.com">Google</a></div>
            <div id="github"><a href="https://github.com">GitHub</a></div>
            <div id="CodiumAI"><a href="https://www.codium.ai">CodiumAI</a></div>
        </body>
        </html>
        """
        mocker.patch("requests.Session.get", return_value=mock_response)

        result = scrape_links("https://www.example.com")

        assert isinstance(result, list)
        assert len(result) == 3
        assert result[0] == "Google (https://www.google.com)"
        assert result[1] == "GitHub (https://github.com)"
        assert result[2] == "CodiumAI (https://www.codium.ai)"