File size: 6,919 Bytes
eceff29
dc67c78
03f299a
eceff29
 
 
dc67c78
c706f5e
dc67c78
eceff29
 
 
 
 
 
dc67c78
eceff29
 
dc67c78
eceff29
 
 
 
 
 
 
dc67c78
 
eceff29
 
4406f1d
c706f5e
eceff29
dc67c78
eceff29
dc67c78
 
 
 
eceff29
 
 
 
dc67c78
eceff29
 
 
 
 
 
 
dc67c78
eceff29
4406f1d
dc67c78
eceff29
dc67c78
eceff29
 
dc67c78
4406f1d
eceff29
dc67c78
 
 
eceff29
dc67c78
eceff29
 
dc67c78
 
 
eceff29
 
 
dc67c78
eceff29
dc67c78
 
eceff29
dc67c78
 
eceff29
dc67c78
eceff29
dc67c78
 
 
 
 
4406f1d
 
dc67c78
 
4406f1d
dc67c78
eceff29
dc67c78
 
eceff29
4406f1d
 
eceff29
4406f1d
 
 
eceff29
 
 
 
dc67c78
 
 
eceff29
dc67c78
eceff29
dc67c78
eceff29
dc67c78
 
eceff29
dc67c78
eceff29
dc67c78
eceff29
dc67c78
eceff29
dc67c78
eceff29
dc67c78
eceff29
dc67c78
eceff29
dc67c78
eceff29
dc67c78
eceff29
 
dc67c78
eceff29
dc67c78
eceff29
dc67c78
 
 
eceff29
 
 
 
 
 
 
dc67c78
 
 
 
eceff29
 
 
 
 
dc67c78
4406f1d
eceff29
4406f1d
 
 
eceff29
 
03f299a
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import os

import openai
import regex as re
from twitterscraper import TwitterScraper as tf

# Never commit API credentials to source control: the key is read from the
# OPENAI_API_KEY environment variable instead (None when it is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")


class TextClassifier:
    """Namespace of helpers that classify tweets by topic and sentiment.

    Every method is a static method, so it can be called either on the class
    (``TextClassifier.classify_topics(d)``) or on an instance.

    Tweets are exchanged as a dict keyed by the tweet text; each value is a
    per-tweet dict. The classifiers add the ``'topic'`` and ``'sentiment'``
    entries, and ``print_stats`` additionally reads ``'nlikes'``,
    ``'nreplies'`` and ``'nretweets'``.
    """

    @staticmethod
    def _complete(prompt_string, max_tokens):
        """Send one completion request and return the text of the first choice.

        NOTE(review): ``openai.Completion`` and ``text-davinci-002`` look like
        the legacy (pre-1.0) OpenAI SDK interface -- confirm the pinned
        ``openai`` version still provides them.
        """
        response = openai.Completion.create(
            model="text-davinci-002",
            prompt=prompt_string,
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0]['text']

    @staticmethod
    def classify_topics(tweet_dict):
        """Ask the model for a general topic and two sub-topics per tweet.

        Args:
            tweet_dict: dict keyed by tweet text; values are per-tweet dicts.

        Returns:
            The same ``tweet_dict``, with a ``'topic'`` list stored in every
            per-tweet dict (see ``cleanup_topic_results``).
        """
        prediction_dict = {}

        for tweet in tweet_dict:
            prompt_string = "Classify this tweet with a general topic and two sub-topics:\n\""
            prompt_string += tweet
            prompt_string += "\".\nGeneral topic: \nSub topic 1: \nSub topic 2:\n. The classifications should not be " \
                             "more than 5 words. Numerate each topic in the output. END "
            prediction_dict[tweet] = TextClassifier._complete(prompt_string, 892)

        return TextClassifier.cleanup_topic_results(prediction_dict, tweet_dict)

    @staticmethod
    def classify_sentiments(tweet_dict):
        """Ask the model for one sentiment label per tweet.

        Args:
            tweet_dict: dict keyed by tweet text; values are per-tweet dicts.

        Returns:
            The same ``tweet_dict``, with the model's answer stored under
            ``'sentiment'`` in every per-tweet dict.
        """
        for tweet in tweet_dict:
            prompt_string = "Classify one sentiment for this tweet:\n \""
            prompt_string += tweet
            prompt_string += "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement," \
                             "\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire," \
                             "\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT= "

            # Strip surrounding whitespace/newlines so that e.g. " Support"
            # and "Support" are counted as the same sentiment in print_stats.
            tweet_dict[tweet]['sentiment'] = TextClassifier._complete(prompt_string, 256).strip()

        return tweet_dict

    @staticmethod
    def cleanup_topic_results(prediction_dict, tweet_dict):
        """Turn raw numbered completions into clean lists of topic strings.

        Digits are removed, the text is split on ``"."`` (the separator left
        behind by the ``1.``, ``2.``, ... numbering), whitespace is normalised
        and empty fragments are discarded.

        Args:
            prediction_dict: dict mapping tweet text to the raw completion.
            tweet_dict: dict keyed by tweet text; values are per-tweet dicts.

        Returns:
            The same ``tweet_dict``, with the topic list stored under
            ``'topic'`` for every tweet present in ``prediction_dict``.
        """
        for tweet, raw_text in prediction_dict.items():
            without_digits = re.sub(r'\d', '', raw_text)
            topics = []
            for fragment in without_digits.split("."):
                # Collapses newlines / repeated spaces and trims both ends.
                topic = " ".join(fragment.split())
                if topic:
                    topics.append(topic)
            tweet_dict[tweet]['topic'] = topics

        return tweet_dict

    @staticmethod
    def print_results(results_dict):
        """Print every tweet followed by the data stored for it."""
        print('\033[1m' + "RESULTS" + '\033[0m', "\n")
        for tweet, predictions in results_dict.items():
            print("\"" + tweet + "\"" + "\n" + str(predictions), "\n" + "---------------------------------")

    @staticmethod
    def print_stats(result_dict, user=""):
        """Print per-topic engagement averages and sentiment frequencies.

        Args:
            result_dict: dict keyed by tweet text whose values contain
                ``'nlikes'``, ``'nreplies'``, ``'nretweets'``, ``'topic'``
                (a list of topics) and ``'sentiment'``.
            user: optional name printed in the ``USER:`` header line.
        """
        topic_freq = {}
        like_totals = {}
        retweet_totals = {}
        reply_totals = {}
        sentiment_freq = {}

        for tweet_stats in result_dict.values():
            nlikes = tweet_stats['nlikes']
            nreplies = tweet_stats['nreplies']
            nretweets = tweet_stats['nretweets']
            topic_list = tweet_stats['topic']
            sentiment = tweet_stats['sentiment']

            sentiment_freq[sentiment] = sentiment_freq.get(sentiment, 0) + 1

            for topic in topic_list:
                topic_freq[topic] = topic_freq.get(topic, 0) + 1
                like_totals[topic] = like_totals.get(topic, 0) + nlikes
                retweet_totals[topic] = retweet_totals.get(topic, 0) + nretweets
                reply_totals[topic] = reply_totals.get(topic, 0) + nreplies

        # Print the summary and the names of the columns.
        print('\033[1m' + "USER: " + '\033[0m', user)
        print('\033[1m' + "NBR OF TWEETS SCRAPED: " + '\033[0m', len(result_dict))
        print('\033[1m' + "NBR OF DIFFERENT TOPICS: " + '\033[0m', len(topic_freq), "\n")
        print("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format('\033[1m' + 'TOPIC', 'TOPIC FREQUENCY',
                                                                 'AVERAGE NBR OF LIKES', 'AVERAGE NBR OF RETWEETS',
                                                                 'AVERAGE NBR OF REPLIES', 'REACH AVERAGE' + '\033[0m'))

        # Print one row per topic. Every total is divided by the topic
        # frequency (always >= 1 here), including replies, so that all three
        # columns and the reach average are true per-tweet means.
        for topic, frequency in topic_freq.items():
            avg_likes = like_totals[topic] / frequency
            avg_retweets = retweet_totals[topic] / frequency
            avg_replies = reply_totals[topic] / frequency
            reach_avg = (avg_likes + avg_retweets + avg_replies) / 3
            print(
                "{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format(topic, frequency, "{:.2f}".format(avg_likes),
                                                                   "{:.2f}".format(avg_retweets),
                                                                   "{:.2f}".format(avg_replies),
                                                                   "{:.2f}".format(reach_avg)))

        print("\n")
        print('\033[1m' + "NBR OF DIFFERENT SENTIMENTS: " + '\033[0m', len(sentiment_freq), "\n")
        print("{:<60} {:<20}".format('\033[1m' + 'SENTIMENT', 'SENTIMENT FREQUENCY' + '\033[0m'))
        for sentiment, frequency in sentiment_freq.items():
            print("{:<60} {:<20}".format(sentiment, frequency))

if __name__ == '__main__':
    # Manual smoke run: scrape up to 40 tweets for one account and show a
    # preview plus the dimensions of what came back.
    # NOTE(review): the result is used like a DataFrame (.head(), .shape) --
    # confirm against the scraper's return type.
    scraper = tf.TwitterScraper(num_tweets=40)
    scraped_tweets = scraper.scrape_by_user("jimmieakesson")
    print(scraped_tweets.head())
    print(scraped_tweets.shape)