File size: 2,745 Bytes
d4974c7
 
 
 
 
 
 
2d8250d
d4974c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97d7e99
d4974c7
 
 
97d7e99
d4974c7
 
 
97d7e99
d4974c7
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import requests
from bs4 import BeautifulSoup


def _collapse_whitespace(value):
    """Return *value* with every run of whitespace collapsed to one space."""
    return " ".join(value.split())


def _page_title(soup):
    """Return the raw ``<title>`` text of *soup*, or "No Title Found"."""
    if not soup.title:
        return "No Title Found"
    # ``Tag.string`` is None when <title> is empty or has several children;
    # fall back to the concatenated text so callers always receive a str.
    return soup.title.string or soup.title.get_text() or "No Title Found"


def download_text_and_title(url, *, timeout=30, max_title_hops=5):
    """Download a web page and return its title and paragraph text.

    The URL is stripped of surrounding whitespace, its query string and any
    non-ASCII characters before the request is made. If the fetched page's
    title itself looks like a URL (starts with ``http:/`` or ``https:/``),
    that title is fetched instead, repeating until a page with a regular
    title is reached.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the call forever.
        max_title_hops: Maximum number of times a URL-looking title is
            followed. Already visited URLs are never fetched again.

    Returns:
        ``(title, text, url)`` on success, where ``title`` has its whitespace
        collapsed, ``text`` is the newline-joined ``<p>`` paragraphs that
        contain more than five words, and ``url`` is the address finally
        fetched. ``(None, None, None)`` if the final response status is not
        200 or any exception occurs; a message is printed in both cases.
    """
    try:
        # Remove surrounding whitespace and the query string from the URL
        url = url.strip().split("?")[0]
        # Remove emojis and other non-ASCII characters
        url = url.encode("ascii", "ignore").decode("ascii")

        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/50.0.2661.102 Safari/537.36"
        }

        response = requests.get(
            url, headers=headers, allow_redirects=True, timeout=timeout
        )
        soup = BeautifulSoup(response.text, "html.parser")
        title = _page_title(soup)

        # Some pages carry the real destination as their title. Follow it,
        # but bound the number of hops and skip URLs already fetched so a
        # self-referencing title cannot loop forever.
        visited = {url}
        hops = 0
        while (
            title.startswith(("http:/", "https:/"))
            and title not in visited
            and hops < max_title_hops
        ):
            url = str(title)
            visited.add(url)
            hops += 1
            response = requests.get(
                url, headers=headers, allow_redirects=True, timeout=timeout
            )
            soup = BeautifulSoup(response.text, "html.parser")
            title = _page_title(soup)

        # Check if the request was successful
        if response.status_code != 200:
            print("Failed to retrieve the web page. Status code:", response.status_code)
            print("URL:", url)
            return None, None, None

        # Keep only paragraphs with more than five words; shorter ones are
        # dropped by the original filter as well.
        paragraphs = (
            _collapse_whitespace(p.get_text()) for p in soup.find_all("p")
        )
        text = "\n".join(p for p in paragraphs if len(p.split()) > 5)

        return _collapse_whitespace(title), text, url
    except Exception as e:
        # Deliberate best-effort boundary: any failure (network, parsing,
        # bad input) is reported and signalled with a triple of None.
        print("An error occurred:", str(e))
        print("URL:", url)
        return None, None, None


# Example usage: download one article and print its title and extracted text.
if __name__ == "__main__":
    # Replace with the URL you want to scrape
    url = "https://www.huffingtonpost.es/sociedad/esta-palabra-mas-prescindible-espanol-cambia-entiende.html"
    # download_text_and_title returns three values (title, text, url);
    # unpacking only two raised "ValueError: too many values to unpack".
    title, text, _final_url = download_text_and_title(url)

    if title and text:
        print("Title:", title)
        print("Text:", text)
    else:
        print("Unable to retrieve text and title.")