File size: 3,003 Bytes
8158335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
import nltk
from typing import List
from transformers import pipeline
from tqdm import tqdm


def tweet_cleaner(tweet: str) -> str:
    """
    Cleans a tweet by stripping URLs and normalizing whitespace.

    Every occurrence of ``http:`` or ``https:`` together with the run of
    non-whitespace characters that follows it is removed. The match is
    case-sensitive. @ mentions, hashtags and all other words are left
    untouched.

    Args:
        tweet (str): A single tweet as a string.

    Returns:
        str: The tweet without URLs, with every run of whitespace (spaces,
            tabs, newlines) collapsed to a single space and no leading or
            trailing whitespace.
    """
    # Swap each URL for a space instead of deleting it together with a
    # neighbouring space: text glued to the front of a URL (e.g. "see:https://…")
    # must not be merged with the word that follows the URL.
    tweet = re.sub(r"https?:\S+", " ", tweet)

    # split() with no argument splits on any whitespace run and drops empty
    # edges, so the join both collapses and trims whitespace.
    return " ".join(tweet.split())


def is_boring_tweet(tweet):
    """
    Tell whether a tweet has fewer than three "interesting" words.

    A whitespace-separated word is uninteresting when its lowercased form
    contains "http", "@" or "#" anywhere inside it.

    Args:
        tweet (str): The tweet text.

    Returns:
        bool: True if fewer than three words are interesting, else False.
    """
    markers = ["http", "@", "#"]
    interesting = 0
    for token in tweet.split():
        lowered = token.lower()
        # Count the word only when none of the markers occurs in it.
        if all(marker not in lowered for marker in markers):
            interesting += 1
    return interesting < 3


def fix_text(text):
    """
    Decode the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in a string.

    Args:
        text (str): Text that may contain the three escaped entities.

    Returns:
        str: The text with ``&lt;`` -> ``<``, ``&gt;`` -> ``>`` and
            ``&amp;`` -> ``&``. Any other entity is left unchanged.
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    # "&amp;" must be decoded last: decoding it first would turn an escaped
    # literal such as "&amp;lt;" into "&lt;" and then wrongly into "<".
    text = text.replace("&amp;", "&")
    return text


def get_tweets_sentiment(tweets: List[str]) -> List[float]:
    """
    Cleans a list of tweets and returns one classifier score per tweet.

    Parameters:
    tweets (List[str]): A list of tweet texts. Each item may also be a dict,
        in which case its "content" entry is used as the tweet text.

    Returns:
    List[float]: The "score" field of the classifier result for each tweet, in
        input order. An empty input yields an empty list.

    NOTE(review): per the transformers text-classification pipeline docs,
    "score" is the confidence of the predicted label. The label itself is
    discarded here, so a confident negative and a confident positive tweet get
    similar values - confirm this is what callers expect.
    """

    # Nothing to score: return before indexing into the list or loading the
    # model, so an empty input no longer raises.
    if len(tweets) == 0:
        return []

    # Load the sentiment analysis pipeline
    classifier = pipeline(
        "sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"
    )

    # Clean tweets; the dict-vs-string decision is made per item so mixed
    # lists and dict subclasses are handled too.
    tweet_texts = [
        tweet_cleaner(t["content"] if isinstance(t, dict) else t)
        for t in tqdm(tweets)
    ]

    # Get tweet sentiment score
    tweet_sentiments = classifier(tweet_texts)

    # Extract the sentiment score from each result and return as a list
    return [t["score"] for t in tweet_sentiments]