|
|
|
|
|
import requests |
|
|
|
from autogpt.commands.web_requests import scrape_text |
|
|
|
""" |
|
Code Analysis

Objective:
The objective of the "scrape_text" function is to scrape the text content from
a given URL and return it as a string, after removing any unwanted HTML tags and scripts.

Inputs:
- url: a string representing the URL of the webpage to be scraped.

Flow:
1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
2. Check if the response contains an HTTP error. If it does, return an error message.
3. Use BeautifulSoup to parse the HTML content of the response and remove all script and style tags.
4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup.
5. Split the text into lines and then into chunks, removing any extra whitespace.
6. Join the chunks into a single string with newline characters between them.
7. Return the cleaned text.

Outputs:
- A string representing the cleaned text content of the webpage.

Additional aspects:
- The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively.
- The function removes script and style tags from the HTML to avoid including unwanted content in the text output.
- The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
"""
|
|
|
|
|
class TestScrapeText:
    """Unit tests for ``scrape_text``.

    Every test patches ``requests.Session.get`` through the pytest-mock
    ``mocker`` fixture, so the response (or exception) seen by the code under
    test is fully controlled by the test itself.
    """

    def test_scrape_text_with_valid_url(self, mocker):
        """A 200 response wrapping one sentence in HTML yields exactly that sentence."""
        expected_text = "This is some sample text"
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        # The inline style attribute and the nested tags must not leak into
        # the scraped output; only the sentence itself is expected back.
        mock_response.text = (
            "<html><body><div><p style='color: blue;'>"
            f"{expected_text}</p></div></body></html>"
        )
        mocker.patch("requests.Session.get", return_value=mock_response)

        url = "http://www.example.com"
        assert scrape_text(url) == expected_text

    def test_invalid_url(self, mocker):
        """A ``RequestException`` raised by the GET call is reported as an error string."""
        mocker.patch(
            "requests.Session.get", side_effect=requests.exceptions.RequestException
        )

        url = "http://www.invalidurl.com"
        error_message = scrape_text(url)
        # Only the "Error:" marker is checked; the rest of the message is not
        # pinned down by this test.
        assert "Error:" in error_message

    def test_no_text(self, mocker):
        """A 200 response with an empty ``<body>`` yields the empty string."""
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body></body></html>"
        mocker.patch("requests.Session.get", return_value=mock_response)

        url = "http://www.example.com"
        assert scrape_text(url) == ""

    def test_http_error(self, mocker):
        """A 404 status code is reported with the exact message ``Error: HTTP 404 error``."""
        mocker.patch("requests.Session.get", return_value=mocker.Mock(status_code=404))

        result = scrape_text("https://www.example.com")

        assert result == "Error: HTTP 404 error"

    def test_scrape_text_with_html_tags(self, mocker):
        """Inline markup such as ``<b>`` is dropped while the surrounding text is kept."""
        html = "<html><body><p>This is <b>bold</b> text.</p></body></html>"
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = html
        mocker.patch("requests.Session.get", return_value=mock_response)

        result = scrape_text("https://www.example.com")

        assert result == "This is bold text."
|
|