EmanuelRiquelme's picture
Upload 5 files
3efdb8d
raw
history blame
1.4 kB
import random
import pandas as pd
import numpy as np
import scipy.stats
# Module-wide setting: print NumPy floats in fixed-point notation instead of
# scientific notation.
np.set_printoptions(suppress=True)
class sampling_inference():
    """Draw a sample of tweets balanced around an engagement percentile.

    Each row's engagement is ``Retweet / weight[0] + Favs / weight[1]``.
    ``sampled_df`` keeps every row at or above the chosen percentile of
    engagement and adds a random draw of (at most) as many rows from below it.
    """

    def __init__(self, file_name, weight=None):
        """Load ``sheets/<file_name>.xlsx`` and precompute the threshold.

        Args:
            file_name: Workbook name without the ``.xlsx`` extension; it is
                read from the ``sheets/`` directory.
            weight: Two divisors ``[retweet_divisor, fav_divisor]`` applied
                to the ``Retweet`` and ``Favs`` columns. Defaults to
                ``[1, 4]``.
        """
        self.raw_tweets = pd.read_excel(f"sheets/{file_name}.xlsx")
        # A fresh list per instance: a shared mutable default would let one
        # instance's changes to ``weight`` leak into every later instance.
        self.weight = [1, 4] if weight is None else weight
        self.engagement = self.__engagement__()
        self.perc = self.__eval_perc__()
        self.perc_thres = np.percentile(self.engagement, self.perc)

    def __engagement__(self):
        """Return the per-row engagement score as a NumPy array.

        The score is ``Retweet / weight[0] + Favs / weight[1]``, computed
        from the ``Retweet`` and ``Favs`` columns of ``self.raw_tweets``.
        """
        retweets = self.raw_tweets['Retweet'].to_numpy()
        favs = self.raw_tweets['Favs'].to_numpy()
        return retweets / self.weight[0] + favs / self.weight[1]

    def __eval_perc__(self, perc=75):
        """Return the percentile to use as the engagement cut-off.

        Starting at ``perc``, the percentile is raised in steps of 5 while
        the engagement value at that percentile is exactly 0; the search
        stops as soon as ``perc`` is 95 or more.
        """
        while np.percentile(self.engagement, perc) == 0 and perc < 95:
            perc += 5
        return perc

    def sampled_df(self):
        """Return the high-engagement rows plus a random draw of the rest.

        All rows with engagement ``>= self.perc_thres`` are kept. From the
        rows below the threshold, as many rows as were kept are drawn without
        replacement using the ``random`` module; if fewer are available, all
        of them are taken. The ``'Unnamed: 0'`` column is dropped when
        present and the index is reset.

        Returns:
            pandas.DataFrame: kept rows first, followed by the drawn rows.
        """
        above_perc = np.where(self.engagement >= self.perc_thres)[0]
        below_perc = np.where(self.engagement < self.perc_thres)[0].tolist()
        # random.sample raises ValueError when asked for more items than the
        # population holds (ties at the threshold, or a threshold of 0), so
        # cap the draw at what is actually available.
        n_draw = min(len(below_perc), above_perc.shape[0])
        # Pin the dtype: an empty draw would otherwise become float64 and
        # make the concatenated array unusable as integer positions.
        drawn = np.array(random.sample(below_perc, n_draw),
                         dtype=above_perc.dtype)
        sampled_rows = np.concatenate((above_perc, drawn))
        # np.where yields positions, so index positionally rather than by
        # label.
        sampled = self.raw_tweets.iloc[sampled_rows].reset_index(drop=True)
        # The column only exists when the sheet was saved with its index;
        # tolerate sheets that do not have it.
        return sampled.drop(columns=['Unnamed: 0'], errors='ignore')