Spaces:

hlopez
/

Twitter-Positivity-Analyzer

Runtime error

Twitter-Positivity-Analyzer / tweet_scraper.py

Hector Lopez

Upload application logic

c5b702e almost 2 years ago

No virus

1.17 kB

	"""
	Twitter scraper.

	This module provides the functionality to retrieve
	a tweet's text given a tweet's URL.
	"""
	import re

	import requests


	def retrieve_tweet_text(tweet_url: str) -> str:
	    """
	    Retrieve a tweet's text.

	    The text is extracted from the HTML snippet returned by the
	    ``publish.twitter.com/oembed`` endpoint: links and HTML tags are
	    removed, and everything from the picture link or the ``&mdash``
	    attribution onwards is dropped.

	    Args:
	        tweet_url (str): Tweet's URL.

	    Returns:
	        str: Tweet's parsed text, or an empty string when the returned
	            HTML contains no text at all.

	    Raises:
	        requests.RequestException: If the request fails or times out.
	        KeyError: If the JSON response has no ``"html"`` field.
	    """
	    # Let requests build the query string so that the tweet URL is
	    # percent-encoded; a raw "?" or "&" inside tweet_url would otherwise
	    # be interpreted as part of the oEmbed query itself.
	    response = requests.get(
	        "https://publish.twitter.com/oembed",
	        params={"dnt": "true", "omit_script": "true", "url": tweet_url},
	        timeout=10,  # never hang forever on a stalled connection
	    )

	    # Get the raw html containing the tweet text
	    raw_html = response.json()["html"]

	    # Remove links (anchor tags together with their content) from the text
	    without_links = re.sub(r"<a[^>]*>.+?</a>", "", raw_html)

	    # Remove the remaining HTML tags and keep the first non-empty line
	    stripped = re.sub(r"<.*?>", "", without_links)
	    lines = [line.strip() for line in stripped.splitlines() if line]
	    if not lines:
	        return ""
	    text = lines[0]

	    # The tweet ends at the picture link (if any) or at the "&mdash"
	    # that introduces the author attribution, whichever comes first.
	    # Searching for the full "pic.twitter.com" marker avoids cutting
	    # ordinary words such as "topic" or "epic".
	    cut_points = [
	        position
	        for position in (text.find("pic.twitter.com"), text.find("&mdash"))
	        if position != -1
	    ]
	    if cut_points:
	        text = text[: min(cut_points)]

	    return text