File size: 1,171 Bytes
c5b702e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
Twitter scraper.

This module provides the functionality to retrieve
a tweet's text given a tweet's URL.
"""
import re

import requests


def retrieve_tweet_text(tweet_url: str, timeout: float = 10.0) -> str:
    """
    Retrieve a tweet's text.

    The tweet is fetched through Twitter's public oEmbed endpoint, which
    returns an HTML snippet. Links and HTML tags are stripped from that
    snippet and everything from the picture link / author attribution
    onwards is dropped.

    Args:
        tweet_url (str): Tweet's URL.
        timeout (float): Seconds to wait for the oEmbed endpoint before
            giving up. Defaults to 10 seconds.

    Returns:
        str: Tweet's parsed text, or an empty string when the returned
            HTML contains no text at all.

    Raises:
        requests.RequestException: If the request fails or times out.
        KeyError: If the response JSON has no "html" field.
    """
    # Let requests build the query string so that the tweet URL is
    # percent-encoded; otherwise any "?" or "&" inside tweet_url would be
    # interpreted as additional oEmbed parameters.
    response = requests.get(
        "https://publish.twitter.com/oembed",
        params={"dnt": "true", "omit_script": "true", "url": tweet_url},
        timeout=timeout,
    )

    # Get the raw html containing the tweet text
    raw_html = response.json()["html"]

    # Remove links (anchor tags together with their content) from the text
    without_links = re.sub(r"<a[^>]*>(.+?)</a>", "", raw_html)

    # Remove the remaining HTML tags and keep the first non-blank line
    without_tags = re.sub(r"<.*?>", "", without_links)
    stripped_lines = (line.strip() for line in without_tags.splitlines())
    text = next((line for line in stripped_lines if line), "")

    # The tweet body ends at a picture link or at the "&mdash" that
    # introduces the author attribution, whichever comes first. Match the
    # full "pic.twitter.com" host rather than the bare substring "pic" so
    # that ordinary words such as "topic" or "picture" do not truncate
    # the tweet.
    end_markers = ("pic.twitter.com", "&mdash")
    cut_points = [
        position
        for position in (text.find(marker) for marker in end_markers)
        if position != -1
    ]
    if cut_points:
        text = text[: min(cut_points)]

    return text