Spaces:
Runtime error
Runtime error
File size: 1,171 Bytes
c5b702e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
"""
Twitter scraper.
This module provides the functionality to retrieve
a tweet's text given a tweet's URL.
"""
import re
import requests
def retrieve_tweet_text(tweet_url: str, timeout: float = 10.0) -> str:
    """
    Retrieve a tweet's text.

    The tweet is looked up through the oEmbed endpoint at
    ``publish.twitter.com`` and the text is extracted from the ``html``
    field of the JSON response.

    Args:
        tweet_url (str): Tweet's URL (plain, not percent-encoded).
        timeout (float): Seconds to wait for the endpoint before giving
            up. Defaults to 10.

    Returns:
        str: Tweet's parsed text, without links, HTML tags or the
            trailing author attribution. Empty string if the returned
            HTML contains no text.

    Raises:
        requests.RequestException: If the request fails or times out.
        KeyError: If the JSON response has no ``html`` field.
    """
    # Local import: only this function needs it.
    from html import unescape

    # Let requests build the query string so the tweet URL is
    # percent-encoded; concatenating it raw breaks the request as soon
    # as the URL itself carries "&", "?" or "#".
    response = requests.get(
        "https://publish.twitter.com/oembed",
        params={"dnt": "true", "omit_script": "true", "url": tweet_url},
        timeout=timeout,
    )
    # Get the raw html containing the tweet text
    raw_html = response.json()["html"]

    # Remove links (anchor elements together with their content).
    # ``\b`` keeps the pattern from matching other tags starting with "a".
    without_links = re.sub(r"<a\b[^>]*>.*?</a>", "", raw_html)
    # Remove the remaining HTML tags from the text
    without_tags = re.sub(r"<.*?>", "", without_links)

    # Keep the first non-blank line; fall back to "" instead of raising
    # IndexError when nothing is left.
    text = next(
        (
            line.strip()
            for line in without_tags.splitlines()
            if line.strip()
        ),
        "",
    )

    # The "&mdash" entity introduces the author attribution, so the
    # tweet ends there. This cut is always applied, before entities are
    # decoded (decoding would turn the marker into a plain dash).
    text = text.partition("&mdash")[0]
    # If a picture link survived as plain text, drop it and everything
    # after it. Match the full host name: a bare "pic" would also cut
    # ordinary words such as "topic" or "picture".
    text = text.partition("pic.twitter.com")[0]

    # Decode entities such as "&amp;" and trim leftover whitespace.
    return unescape(text).strip()
|