Spaces:

Iker
/

ClickbaitFighter

Running on Zero

App Files Files Community

ClickbaitFighter / download_url.py

Iker

Minor improvements

2d8250d verified 10 months ago

raw

history blame

2.75 kB

	import requests
	from bs4 import BeautifulSoup


	def _collapse_whitespace(value):
	    """Return *value* with every run of whitespace squeezed to one space."""
	    # str.split() with no argument splits on newlines, tabs and carriage
	    # returns alike and ignores leading/trailing whitespace.
	    return " ".join(value.split())


	def _extract_title(soup):
	    """Return the raw <title> text of *soup*, or "No Title Found"."""
	    tag = soup.title
	    if tag is None:
	        return "No Title Found"
	    raw = tag.string
	    if raw is None:
	        # .string is None when <title> is empty or contains nested markup;
	        # get_text() still yields whatever text is inside the tag.
	        raw = tag.get_text()
	    return raw or "No Title Found"


	def download_text_and_title(url, *, timeout=20):
	    """Download a web page and extract its title and paragraph text.

	    The URL is stripped, its query string is removed and non-ASCII
	    characters are dropped before the request is made. If the fetched
	    page's title is itself an http(s) URL, that URL is fetched instead
	    (a bounded number of times).

	    Args:
	        url: Address of the page to download.
	        timeout: Seconds to wait for each HTTP request before giving up.

	    Returns:
	        A ``(title, text, url)`` tuple, where ``text`` is the page's
	        ``<p>`` paragraphs of more than five words joined by newlines and
	        ``url`` is the address that was finally fetched. On any failure
	        (empty URL, non-200 status, network or parsing error) the tuple
	        ``(None, None, None)`` is returned and a message is printed.
	    """
	    try:
	        # Remove the query string from the URL
	        url = url.strip().split("?")[0]
	        # Remove emojis and other special characters
	        url = url.encode("ascii", "ignore").decode("ascii")

	        if not url:
	            # Nothing left to request; fail fast instead of letting the
	            # HTTP library reject an empty address.
	            print("An error occurred:", "empty URL")
	            print("URL:", url)
	            return None, None, None

	        headers = {
	            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
	            "AppleWebKit/537.36 (KHTML, like Gecko) "
	            "Chrome/50.0.2661.102 Safari/537.36"
	        }
	        # Upper bound on "title is a URL" hops, so two pages whose titles
	        # point at each other cannot loop forever.
	        max_title_hops = 5

	        response = requests.get(
	            url, headers=headers, allow_redirects=True, timeout=timeout
	        )
	        soup = BeautifulSoup(response.text, "html.parser")
	        title = _extract_title(soup)

	        # A title that is itself a URL is treated as a redirect target.
	        hops = 0
	        while title.startswith(("http:/", "https:/")) and hops < max_title_hops:
	            url = title
	            response = requests.get(
	                url, headers=headers, allow_redirects=True, timeout=timeout
	            )
	            soup = BeautifulSoup(response.text, "html.parser")
	            title = _extract_title(soup)
	            hops += 1

	        # Check if the request was successful
	        if response.status_code != 200:
	            print("Failed to retrieve the web page. Status code:", response.status_code)
	            print("URL:", url)
	            return None, None, None

	        # Keep only paragraphs with more than five words.
	        paragraphs = (
	            _collapse_whitespace(p.get_text()) for p in soup.find_all("p")
	        )
	        text = "\n".join(p for p in paragraphs if len(p.split()) > 5)

	        return _collapse_whitespace(title), text, url
	    except Exception as e:
	        # Deliberate best-effort boundary: callers receive a None triple
	        # rather than an exception.
	        print("An error occurred:", str(e))
	        print("URL:", url)
	        return None, None, None


	# Example usage: download one article and print what was extracted.
	if __name__ == "__main__":
	    # Replace with the URL you want to scrape
	    url = "https://www.huffingtonpost.es/sociedad/esta-palabra-mas-prescindible-espanol-cambia-entiende.html"
	    # download_text_and_title returns three values (title, text, url);
	    # unpacking into only two names raises ValueError.
	    title, text, _final_url = download_text_and_title(url)

	    if title and text:
	        print("Title:", title)
	        print("Text:", text)
	    else:
	        print("Unable to retrieve text and title.")