import random import pandas as pd import numpy as np import scipy.stats np.set_printoptions(suppress=True) class sampling_inference(): def __init__(self,file_name,weight = [1,4]): self.raw_tweets= pd.read_excel(f"sheets/{file_name}.xlsx") self.weight = weight self.engagement = self.__engagement__() self.perc = self.__eval_perc__() self.perc_thres = np.percentile(self.engagement,self.perc) def __engagement__(self): tweets = self.raw_tweets['Tweet'] raw_retweets = self.raw_tweets['Retweet'].to_numpy() raw_favs = self.raw_tweets['Favs'].to_numpy() engagement = raw_retweets/self.weight[0]+raw_favs/self.weight[1] return engagement def __eval_perc__(self,perc=75): engagement = self.engagement while np.percentile(engagement,perc) == 0 and perc < 95: perc += 5 return perc def sampled_df(self): engagement = self.engagement above_perc = np.where(self.engagement >= self.perc_thres)[0] bellow_perc = np.where(self.engagement < self.perc_thres)[0].tolist() bellow_perc = np.array(random.sample(bellow_perc,above_perc.shape[0])) sampled_rows = np.concatenate((above_perc,bellow_perc)) sampled_df= self.raw_tweets.loc[sampled_rows].reset_index(drop=True) del sampled_df['Unnamed: 0'] return sampled_df