"""Gradio app: scrape tweets with snscrape and plot when they were posted.

Two entry points feed the same set of 25 plots (five per time unit):

* ``scrape_tweets`` runs a Twitter search, writes the result to a CSV file and
  plots it.
* ``file_viz`` plots a previously downloaded CSV file.
"""
import datetime
import itertools
import re

import gradio as gr
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import snscrape.modules.twitter as sntwitter
import tqdm
from matplotlib.figure import Figure

matplotlib.use("Agg")

# Raw string: the CSS selectors contain backslash escapes (``\[``, ``\:``) that
# must reach the browser unchanged and are not valid Python escape sequences.
css = r"""
footer {display:none !important}
.max-h-\[30rem\] {max-height: 15rem !important;}
.min-h-\[15rem\] {max-height: 5rem !important;}
.hover\:bg-orange-50:hover {
    --tw-bg-opacity: 1 !important;
    background-color: rgb(229,225,255) !important;
}
"""

DATE_FORMAT = '%Y-%m-%d'

# Column order of the CSV file written by ``scrape_tweets``.
CSV_COLUMNS = [
    'Date', 'Time', 'Username', 'Text', 'Language', 'Hashtags', 'ReplyCount',
    'RetweetCount', 'LikeCount', 'QuoteCount', 'Hour', 'Year', 'Month',
    'MonthName', 'MonthDay', 'DayName', 'Week', 'TweetId',
]

# Columns that are plotted, in the order their figures are returned.
PLOT_COLUMNS = ['Year', 'Month', 'Week', 'MonthDay', 'Hour']

# Per-column styling of the annotated count plot:
# (annotation x offset, annotation y offset, annotation font size,
#  x tick label size or None to keep the default; a size also rotates labels 45 degrees).
_COUNTPLOT_STYLE = {
    'Year': (0.05, 5, 12, None),
    'Month': (0.05, 5, 12, None),
    'Week': (0.005, 1, 10, 7),
    'MonthDay': (0.05, 5, 12, 10),
    'Hour': (0.05, 10, 12, None),
}

# Tabs in display order, paired with the column whose plots they show.
_TAB_LAYOUT = [
    ("Visualization by Hour", 'Hour'),
    ("Visualization by Day", 'MonthDay'),
    ("Visualization by Week", 'Week'),
    ("Visualization by Month", 'Month'),
    ("Visualization by Year", 'Year'),
]


def _resolve_dates(since, until):
    """Fill in blank search dates.

    A blank ``until`` becomes today; a blank ``since`` becomes seven days
    before ``until``. Non-blank values are returned unchanged.

    Raises:
        ValueError: if ``since`` is blank and ``until`` is not ``yyyy-mm-dd``.
    """
    if until == '':
        until = datetime.date.today().strftime(DATE_FORMAT)
    if since == '':
        week_before = datetime.datetime.strptime(until, DATE_FORMAT) - datetime.timedelta(days=7)
        since = week_before.strftime(DATE_FORMAT)
    return since, until


def _safe_filename(stem, max_length=200):
    """Return ``stem`` with characters that are unsafe in a file name replaced by ``_``.

    The stem is built from user input, so path separators and other special
    characters must not reach the file system. The result is also truncated so
    that a long query cannot exceed common file-name length limits.
    """
    return re.sub(r"[^\w\-. ]+", "_", stem)[:max_length]


def search(text, username, since, until, retweet, replies):
    """Build the Twitter search query and the name of the CSV file to write.

    Args:
        text: Free text to match; may be empty.
        username: Restrict the search to tweets from this account; may be empty.
        since: Start date as ``yyyy-mm-dd``; blank means seven days before ``until``.
        until: End date as ``yyyy-mm-dd``; blank means today.
        retweet: When true, retweets are excluded.
        replies: When true, replies are excluded.

    Returns:
        A ``(query, filename)`` tuple.
    """
    since, until = _resolve_dates(since, until)

    q = text
    if username != '':
        q += f" from:{username}"
    q += f" until:{until}"
    q += f" since:{since}"
    if retweet:
        q += " exclude:retweets"
    if replies:
        q += " exclude:replies"

    if username != '' and text != '':
        stem = f"{since}_{until}_{username}_{text}"
    elif username != "":
        stem = f"{since}_{until}_{username}"
    else:
        stem = f"{since}_{until}_{text}"
    filename = f"{_safe_filename(stem)}.csv"
    print(filename)
    return q, filename


def _new_axes():
    """Create a standalone figure with a single axes.

    The figure is deliberately not created through pyplot: pyplot keeps every
    figure alive in a global registry and tracks a process-wide "current axes",
    which leaks memory and is unsafe when several requests plot concurrently.
    """
    fig = Figure()
    return fig, fig.subplots()


def _column_figures(series, xticks=None):
    """Build the five figures for one time-unit column.

    Args:
        series: The column to plot.
        xticks: Optional tick positions applied to every figure except the
            count plot (whose x axis is categorical).

    Returns:
        ``[count plot, line plot, histogram with KDE, KDE, smoothed KDE]``.
    """
    dx, dy, annotation_size, tick_label_size = _COUNTPLOT_STYLE[series.name]

    count_fig, count_ax = _new_axes()
    sns.countplot(x=series, ax=count_ax)
    for bar in count_ax.patches:
        height = bar.get_height()
        if not np.isfinite(height):
            continue  # nothing to label on an empty bar
        count_ax.annotate(str(int(height)), (bar.get_x() + dx, height + dy), fontsize=annotation_size)
    if tick_label_size is not None:
        count_ax.tick_params(axis='x', labelsize=tick_label_size, labelrotation=45)

    counts = series.value_counts()
    line_fig, line_ax = _new_axes()
    sns.lineplot(x=counts.index, y=counts.values, ax=line_ax)
    line_ax.set_xlabel(series.name)
    line_ax.set_ylabel('Count')

    hist_fig, hist_ax = _new_axes()
    sns.histplot(x=series, stat='count', binwidth=1, kde=True, discrete=True, ax=hist_ax)

    kde_fig, kde_ax = _new_axes()
    sns.kdeplot(x=series, fill=True, ax=kde_ax)

    smooth_fig, smooth_ax = _new_axes()
    sns.kdeplot(x=series, fill=True, bw_adjust=3, ax=smooth_ax)

    if xticks is not None:
        for ax in (line_ax, hist_ax, kde_ax, smooth_ax):
            ax.set_xticks(xticks)

    return [count_fig, line_fig, hist_fig, kde_fig, smooth_fig]


def _build_figures(tweets_df, first_year, last_year):
    """Build all 25 figures, five per column in ``PLOT_COLUMNS`` order.

    Args:
        tweets_df: Data frame containing every column in ``PLOT_COLUMNS``.
        first_year: First tick of the year axes.
        last_year: Last tick (inclusive) of the year axes.
    """
    xticks_by_column = {
        'Year': np.arange(first_year, last_year + 1, 1),
        'Month': np.arange(1, 13, 1),
    }
    figures = []
    for column in PLOT_COLUMNS:
        figures.extend(_column_figures(tweets_df[column], xticks_by_column.get(column)))
    return figures


def file_viz(dataset):
    """Plot a previously downloaded CSV file.

    Args:
        dataset: Uploaded file object; only its ``name`` (a path) is used.

    Returns:
        ``[path, fig1, ..., fig25]`` matching the output components of the UI.

    Raises:
        gr.Error: if no file was uploaded, the file has no rows, or a plotted
            column is missing.
    """
    if dataset is None:
        raise gr.Error("Please upload a CSV file first.")

    tweets_df1 = pd.read_csv(dataset.name)
    print(dataset.name)

    missing = [column for column in PLOT_COLUMNS if column not in tweets_df1.columns]
    if missing:
        raise gr.Error(f"The uploaded file is missing required columns: {', '.join(missing)}")
    if tweets_df1.empty:
        raise gr.Error("The uploaded file contains no tweets.")

    figures = _build_figures(tweets_df1, int(tweets_df1['Year'].min()), int(tweets_df1['Year'].max()))

    # Kept from the original behaviour: the file is written back in place
    # before being offered for download.
    tweets_df1.to_csv(dataset.name, index=False)
    return [dataset.name, *figures]


def scrape_tweets(text, username, since, until, retweets, replies, count, progress=gr.Progress()):
    """Scrape tweets, save them as CSV and plot their temporal distribution.

    Args:
        text: Free text to match; may be empty.
        username: Restrict the search to tweets from this account; may be empty.
        since: Start date as ``yyyy-mm-dd``; blank means seven days before ``until``.
        until: End date as ``yyyy-mm-dd``; blank means today.
        retweets: When true, retweets are excluded.
        replies: When true, replies are excluded.
        count: Maximum number of tweets to retrieve; ``-1`` retrieves all.
        progress: Gradio progress tracker wrapped around the scraping loop.

    Returns:
        ``[csv filename, fig1, ..., fig25]`` matching the output components of the UI.

    Raises:
        gr.Error: if the query returns no tweets.
    """
    print(text, username, since, until, retweets, replies, count)

    # Resolve blank dates once, up front: the same values are needed for the
    # query and for the year axis of the plots.
    since, until = _resolve_dates(since, until)
    q, filename = search(text, username, since, until, retweets, replies)

    items = sntwitter.TwitterSearchScraper(q).get_items()
    if count != -1:
        items = itertools.islice(items, int(count))

    rows = [
        [
            tweet.date, tweet.id, tweet.rawContent, tweet.user.username, tweet.lang,
            tweet.hashtags, tweet.replyCount, tweet.retweetCount, tweet.likeCount,
            tweet.quoteCount,
        ]
        for tweet in progress.tqdm(items)
    ]
    if not rows:
        raise gr.Error("No tweets matched the query. Try a wider date range or different terms.")

    tweets_df1 = pd.DataFrame(
        rows,
        columns=['DateTime', 'TweetId', 'Text', 'Username', 'Language', 'Hashtags',
                 'ReplyCount', 'RetweetCount', 'LikeCount', 'QuoteCount'],
    )
    posted = tweets_df1['DateTime'].dt
    tweets_df1['Hour'] = posted.hour
    tweets_df1['Year'] = posted.year
    tweets_df1['Month'] = posted.month
    tweets_df1['MonthName'] = posted.month_name()
    tweets_df1['MonthDay'] = posted.day
    tweets_df1['DayName'] = posted.day_name()
    tweets_df1['Week'] = posted.isocalendar().week
    tweets_df1['Date'] = posted.date
    tweets_df1['Time'] = posted.time
    tweets_df1 = tweets_df1[CSV_COLUMNS]
    tweets_df1.to_csv(filename, index=False)

    figures = _build_figures(tweets_df1, int(since[0:4]), int(until[0:4]))
    return [filename, *figures]


def _plot_grid():
    """Lay out five plot components as rows of 2, 2 and 1; return them in order."""
    with gr.Row():
        first = gr.Plot()
        second = gr.Plot()
    with gr.Row():
        third = gr.Plot()
        fourth = gr.Plot()
    with gr.Row():
        fifth = gr.Plot()
    return [first, second, third, fourth, fifth]


with gr.Blocks(title="Twitter Temporal Insights | Data Science Dojo", css=css) as demo:
    with gr.Tab("Input"):
        with gr.Row():
            text = gr.Textbox(label="Query text to be matched (Optional)", max_lines=1)
            username = gr.Textbox(label="Twitter Username", max_lines=1, value='DataScienceDojo')
        with gr.Row():
            since = gr.Textbox(label="Start Date", placeholder='yyyy-mm-dd', max_lines=1, value='2021-01-01')
            until = gr.Textbox(label="End Date", max_lines=1, placeholder='yyyy-mm-dd', value='2022-12-31')
        with gr.Row():
            retweets = gr.Checkbox(label="Exclude Retweets?", value=True)
            replies = gr.Checkbox(label="Exclude Replies", value=True)
        with gr.Row():
            count = gr.Slider(
                label="Count (-1 to retrieve all tweets. Increase the count for better visualizations. 5000 Tweets are retrieved in approximately 100s!)",
                value=-1, minimum=-1, maximum=100000, step=100,
            )
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            gr.Markdown("\n\nOR Upload File\n\n")
        with gr.Row():
            filein = gr.File(label="Upload previously downloaded csv file. Example given below!")
        with gr.Row():
            submit_file = gr.Button("Submit File")
        with gr.Row():
            out0 = gr.File(label="Download CSV of extracted tweets")

    # Tabs are shown from the finest to the coarsest time unit, while the
    # callbacks return figures in PLOT_COLUMNS order; collect the components
    # per column so the output list can be assembled in the callbacks' order.
    plots_by_column = {}
    for tab_title, column in _TAB_LAYOUT:
        with gr.Tab(tab_title):
            plots_by_column[column] = _plot_grid()

    outputs = [out0]
    for column in PLOT_COLUMNS:
        outputs.extend(plots_by_column[column])

    gr.Examples(
        examples=[["DSD_Tweets.csv"]],
        fn=file_viz,
        inputs=filein,
        outputs=outputs,
    )

    submit_file.click(fn=file_viz, inputs=filein, outputs=outputs)
    submit_btn.click(
        fn=scrape_tweets,
        inputs=[text, username, since, until, retweets, replies, count],
        outputs=outputs,
    )

demo.queue(concurrency_count=5).launch()