EmanuelRiquelme committed on
Commit
3efdb8d
1 Parent(s): 6920130

Upload 5 files

Files changed (5)
  1. app.py +42 -0
  2. extract_tweets.py +51 -0
  3. inference.py +124 -0
  4. sampling.py +37 -0
  5. sen_model.py +13 -0
app.py ADDED
@@ -0,0 +1,42 @@
+ import streamlit as st
+ from inference import Keyword_oracle
+ from datetime import date
+ from extract_tweets import extract_tweets
+ import gc
+ from pathlib import Path
+
+
+ header = st.container()
+ get_tweet = st.container()
+ features = st.container()
+ modelTraining = st.container()
+
+
+ with get_tweet:
+     Path('sheets/').mkdir(exist_ok=True)
+     st.header("Place the topic you want to research on Twitter :bird:")
+     input_keyword = st.text_input('Write the keyword:')
+     if input_keyword:
+         current_date = date.today()
+         data_since = st.date_input('from which date:', current_date)
+         data_until = st.date_input('until which date:', current_date)
+         max_kw = st.slider('maximum words per keyword', 1, 3, 1)
+         st.text('This process may take a few seconds')
+         st.text(f'plot of the keywords associated with the topic {input_keyword}:')
+         extract_tweets(input_keyword, data_since, data_until)
+         oracle = Keyword_oracle(input_keyword,
+                                 keyphrase_ngram_range=(1, max_kw),
+                                 diversity=0.3, top_n=3)
+         st.pyplot(oracle.plot())
+         st.text("Table of the most popular keywords")
+         table = oracle.return_table()
+         st.dataframe(table)
+         st.download_button(
+             label="Download data as CSV",
+             data=table.to_csv().encode('utf-8'),
+             file_name=f'{input_keyword}.csv',
+             mime='text/csv',
+         )
+         # release the models' memory once the results are rendered
+         del oracle
+         gc.collect()
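The app reads its Twitter credentials through st.secrets. A minimal sketch of the expected .streamlit/secrets.toml, with placeholder values and the key names used in extract_tweets.py:

    # .streamlit/secrets.toml -- placeholder values, not real credentials
    api_key = "YOUR_API_KEY"
    api_key_secret = "YOUR_API_KEY_SECRET"
    access_token = "YOUR_ACCESS_TOKEN"
    access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"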
extract_tweets.py ADDED
@@ -0,0 +1,51 @@
+ import tweepy
+ import pandas as pd
+ import streamlit as st
+
+ # Twitter API credentials come from Streamlit's secrets store
+ api_key = st.secrets['api_key']
+ api_key_secret = st.secrets['api_key_secret']
+ access_token = st.secrets['access_token']
+ access_token_secret = st.secrets['access_token_secret']
+
+ auth = tweepy.OAuthHandler(api_key, api_key_secret)
+ auth.set_access_token(access_token, access_token_secret)
+
+ api = tweepy.API(auth)
+
+ def preprocess(tweets):
+     # mask mentions and links so they don't dominate keyword extraction
+     processed_tweets = []
+     for tweet in tweets.split():
+         tweet = '@user' if tweet.startswith('@') and len(tweet) > 1 else tweet
+         tweet = 'http' if tweet.startswith('http') else tweet
+         processed_tweets.append(tweet)
+     return " ".join(processed_tweets)
+
+
+ def extract_tweets(words, date_since, date_until, num_tweets=300):
+     tweets = tweepy.Cursor(
+         api.search_tweets,
+         words, lang="en",
+         # note: since_id expects a tweet ID; only 'until' filters by date here
+         since_id=date_since,
+         until=date_until,
+         tweet_mode='extended').items(num_tweets)
+     tweet_cont, tweet_rt, tweet_heart = [], [], []
+     for tweet in tweets:
+         tweet_cont.append(preprocess(tweet.full_text))
+         tweet_rt.append(tweet.retweet_count)
+         try:
+             # retweets expose the original tweet's favourite count
+             tweet_heart.append(tweet.retweeted_status.favorite_count)
+         except AttributeError:
+             # not a retweet: fall back to the tweet's own favourite count
+             tweet_heart.append(tweet.favorite_count)
+     data = {
+         'Tweet': tweet_cont,
+         'Retweet': tweet_rt,
+         'Favs': tweet_heart
+     }
+     df = pd.DataFrame(data)
+     with pd.ExcelWriter(f'sheets/{words}.xlsx') as writer:
+         df.to_excel(writer)
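A minimal usage sketch of this module outside the Streamlit app, assuming valid credentials are configured; 'bitcoin' is just a hypothetical keyword:

    from datetime import date, timedelta
    import pandas as pd
    from extract_tweets import extract_tweets

    keyword = 'bitcoin'  # hypothetical example topic
    extract_tweets(keyword, date.today() - timedelta(days=7), date.today())
    df = pd.read_excel(f'sheets/{keyword}.xlsx')  # columns: Tweet, Retweet, Favs
    print(df.head())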
inference.py ADDED
@@ -0,0 +1,124 @@
+ from keybert import KeyBERT
+ from sen_model import Sentiment
+ from sampling import sampling_inference
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import mplcyberpunk
+ from adjustText import adjust_text
+
+
+ class Keyword_oracle():
+     def __init__(self, file_name,
+                  weight_rt_fav=[1, 4],
+                  noise_threshold=75,
+                  words_exp=["user", "http", "rt", "fav", 'https'],
+                  **kwargs
+                  ):
+         self.key_bert = KeyBERT()
+         self.file_name = file_name
+         self.keybert_args = kwargs
+         self.weight_rt_fav = weight_rt_fav
+         self.raw_tweets = sampling_inference(file_name).sampled_df()
+         # wider n-gram extraction needs a stricter noise cut-off
+         self.noise_threshold = noise_threshold if kwargs['top_n'] == 1 else 90 if kwargs['top_n'] == 2 else 95
+         self.tweets = self.raw_tweets['Tweet']
+         self.retweet = self.raw_tweets['Retweet']
+         self.favs = self.raw_tweets['Favs']
+         self.sentiment_eval = self.__sentiment_eval__()
+         self.words_exp = words_exp
+         self.mined_tweets = self.__tweets_mined__()
+         self.denoised_df = self.__denoised_df__()
+         self.percentiles = self.__find_threshold__()
+         self.categorical = self.__categorical__()
+
+     def __sentiment_eval__(self):
+         return Sentiment(self.tweets)
+
+     def __tweets_mined__(self):
+         raw_keywords = self.key_bert.extract_keywords(self.tweets,
+                                                       keyphrase_ngram_range=self.keybert_args['keyphrase_ngram_range'],
+                                                       diversity=self.keybert_args['diversity'],
+                                                       top_n=self.keybert_args['top_n']
+                                                       )
+         key_words, engagement, acum_sents = [], [], []
+         for keys, retweet, fav, sent in zip(raw_keywords, self.retweet, self.favs, self.sentiment_eval):
+             for key in keys:
+                 # skip keywords that contain masked tokens such as 'user' or 'http'
+                 if not set(key[0].split()).intersection(set(self.words_exp)):
+                     key_words.append(key[0])
+                     engagement.append(1 + retweet / self.weight_rt_fav[0] + fav / self.weight_rt_fav[1])
+                     acum_sents.append(sent + retweet / self.weight_rt_fav[0] * sent + fav / self.weight_rt_fav[1] * sent)
+         key_word_data = {
+             "Key": key_words,
+             'engagement': engagement,
+             'emotions overall': acum_sents
+         }
+         # aggregate the scores of repeated keywords
+         return pd.DataFrame(key_word_data).groupby(['Key'], as_index=False).sum()
+
+     def __denoised_df__(self):
+         # keep only keywords whose engagement is above the noise percentile
+         df = self.mined_tweets
+         tweets = df['engagement']
+         percentile = np.percentile(tweets, self.noise_threshold)
+         return df[tweets > percentile].reset_index(drop=True)
+
+     def __find_threshold__(self):
+         df = self.mined_tweets
+         tweets = df['emotions overall']
+         top_threshold = self.noise_threshold
+         bottom_threshold = 100 - top_threshold
+         # widen the percentile window until it captures non-neutral scores,
+         # giving up once the top threshold reaches the 95th percentile
+         while np.percentile(tweets, top_threshold) <= 0 and np.percentile(tweets, bottom_threshold) >= 0:
+             if top_threshold >= 95:
+                 top_threshold, bottom_threshold = 0, 0
+                 break
+             top_threshold += 5
+             bottom_threshold -= 5
+         bottom_threshold, top_threshold = np.percentile(tweets, bottom_threshold), np.percentile(tweets, top_threshold)
+         return bottom_threshold, top_threshold
+
+     def __categorical__(self):
+         df = self.denoised_df
+         tweets = df['emotions overall'].to_numpy()
+         # scores are -1/0/1; index -1 wraps around to 'negative'
+         categorical = ['neutral', 'positive', 'negative']
+         bottom_threshold, top_threshold = self.percentiles
+         pos = (tweets >= top_threshold) if top_threshold > 0 else np.zeros(tweets.shape[0])
+         neg = (tweets <= bottom_threshold) * -1 if bottom_threshold < 0 else np.zeros(tweets.shape[0])
+         numerical = pos + neg
+         return [categorical[index] for index in numerical.astype(int)]
+
+     def return_table(self):
+         self.denoised_df['Categorical'] = self.__categorical__()
+         return self.denoised_df.sort_values(by=['emotions overall'], ascending=False).reset_index(drop=True)
+
+     def plot(self):
+         df = self.denoised_df
+         plt.style.use("cyberpunk")
+         keys = df['Key']
+         x, y = df['engagement'], df['emotions overall']
+         fig, ax = plt.subplots()
+         ax.scatter(x, y)
+         texts = [plt.text(x_value, y_value, key_value) for x_value, y_value, key_value in zip(x, y, keys)]
+         adjust_text(texts)
+         bottom_threshold, top_threshold = self.percentiles
+         if bottom_threshold < 0:
+             plt.axhline(bottom_threshold, c="red", marker='.', linestyle=':')
+         if top_threshold > 0:
+             plt.axhline(top_threshold, c="magenta", marker='.', linestyle=':')
+         plt.title(f"Denoised sentiment analysis of {self.file_name}")
+         plt.xlabel("Engagement")
+         plt.ylabel("Emotions Overall")
+         return fig
+
+
+ if __name__ == "__main__":
+     file_name = 'Graham Potter'
+     oracle = Keyword_oracle(file_name,
+                             keyphrase_ngram_range=(1, 2),
+                             diversity=0.3, top_n=3)
+     oracle.plot()
+     print(oracle.return_table())
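Under the default weight_rt_fav = [1, 4], a keyword from a tweet with sentiment s, r retweets and f favourites contributes 1 + r/1 + f/4 to 'engagement' and s times that amount to 'emotions overall'. A quick check of the scoring used in __tweets_mined__:

    # worked check of the default scoring (weight_rt_fav = [1, 4])
    w_rt, w_fav = 1, 4
    sent, rt, fav = -1, 10, 20           # a negative tweet: 10 retweets, 20 favs
    engagement = 1 + rt / w_rt + fav / w_fav
    emotions = sent + rt / w_rt * sent + fav / w_fav * sent
    print(engagement)                    # 16.0
    print(emotions)                      # -16.0, i.e. sent * engagement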
sampling.py ADDED
@@ -0,0 +1,37 @@
+ import random
+ import pandas as pd
+ import numpy as np
+ np.set_printoptions(suppress=True)
+
+
+ class sampling_inference():
+     def __init__(self, file_name, weight=[1, 4]):
+         self.raw_tweets = pd.read_excel(f"sheets/{file_name}.xlsx")
+         self.weight = weight
+         self.engagement = self.__engagement__()
+         self.perc = self.__eval_perc__()
+         self.perc_thres = np.percentile(self.engagement, self.perc)
+
+     def __engagement__(self):
+         # weighted mix of retweets and favourites per tweet
+         raw_retweets = self.raw_tweets['Retweet'].to_numpy()
+         raw_favs = self.raw_tweets['Favs'].to_numpy()
+         return raw_retweets / self.weight[0] + raw_favs / self.weight[1]
+
+     def __eval_perc__(self, perc=75):
+         # many tweets have zero engagement; raise the percentile until the threshold is non-zero, capped at 95
+         engagement = self.engagement
+         while np.percentile(engagement, perc) == 0 and perc < 95:
+             perc += 5
+         return perc
+
+     def sampled_df(self):
+         # balance the sample: every high-engagement row plus an equal-sized
+         # random draw from the low-engagement rows
+         above_perc = np.where(self.engagement >= self.perc_thres)[0]
+         below_perc = np.where(self.engagement < self.perc_thres)[0].tolist()
+         below_perc = np.array(random.sample(below_perc, above_perc.shape[0]))
+         sampled_rows = np.concatenate((above_perc, below_perc))
+         sampled_df = self.raw_tweets.loc[sampled_rows].reset_index(drop=True)
+         del sampled_df['Unnamed: 0']  # drop the index column written by to_excel
+         return sampled_df
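A usage sketch, assuming extract_tweets has already written sheets/<keyword>.xlsx; the resulting frame mixes every high-engagement tweet with an equal number of randomly drawn low-engagement ones:

    from sampling import sampling_inference

    sampler = sampling_inference('bitcoin')  # hypothetical keyword from the sketch above
    df = sampler.sampled_df()
    print(len(df), 'tweets kept for keyword extraction')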
sen_model.py ADDED
@@ -0,0 +1,13 @@
+ from transformers import pipeline
+ import numpy as np
+
+ specific_model = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")
+
+ def Sentiment(tweets):
+     # map the model's NEG/NEU/POS labels to scores of -1/0/1
+     output_model = specific_model(tweets.tolist())
+     labels = ["NEG", "NEU", "POS"]
+     idx = []
+     for output in output_model:
+         idx.append(labels.index(output["label"]) - 1)
+     return np.array(idx)
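A usage sketch of the sentiment scorer; it accepts a pandas Series of strings and returns one score per tweet:

    import pandas as pd
    from sen_model import Sentiment

    tweets = pd.Series(["I love this!", "This is terrible.", "It exists."])
    print(Sentiment(tweets))  # e.g. [ 1 -1  0] -- NEG=-1, NEU=0, POS=1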