datasciencedojo's picture
Update app.py
84f95c3
import pandas as pd
import numpy as np
import datetime
import tqdm
import gradio as gr
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import snscrape.modules.twitter as sntwitter
# Select the "Agg" backend so figures are rendered off-screen; the figures
# built below are only handed to Gradio plot components, never shown in a window.
matplotlib.use("Agg")
# Custom CSS handed to gr.Blocks: hides the page footer, caps the height of
# two utility-class containers and recolours the hover highlight.
# A raw string is used so that the backslashes escaping `[`, `]` and `:` in
# the CSS selectors are kept verbatim instead of being parsed as (invalid)
# Python escape sequences, which triggers a warning on import.
css = r"""
footer {display:none !important}
.max-h-\[30rem\] {max-height: 15rem !important;}
.min-h-\[15rem\] {max-height: 5rem !important;}
.hover\:bg-orange-50:hover {
--tw-bg-opacity: 1 !important;
background-color: rgb(229,225,255) !important;
}
"""
with gr.Blocks(title="Twitter Temporal Insights | Data Science Dojo", css = css) as demo:
def search(text, username, since, until, retweet, replies):
    """Build the Twitter search query and the CSV file name for one request.

    Args:
        text: Free-text part of the query; may be the empty string.
        username: Account to restrict the search to (added as ``from:``);
            may be the empty string.
        since: Start date as ``YYYY-MM-DD``. An empty string means seven
            days before ``until``.
        until: End date as ``YYYY-MM-DD``. An empty string means today.
        retweet: When truthy, ``exclude:retweets`` is appended to the query.
        replies: When truthy, ``exclude:replies`` is appended to the query.

    Returns:
        A ``(query, filename)`` tuple. ``filename`` is
        ``<since>_<until>[_<username>][_<text>].csv`` with characters that
        are unsafe in file names replaced by ``_``.

    Raises:
        ValueError: If ``since`` is empty and ``until`` is not a valid
            ``YYYY-MM-DD`` date.
    """
    date_format = '%Y-%m-%d'
    if until == '':
        until = datetime.date.today().strftime(date_format)
    if since == '':
        week_earlier = (datetime.datetime.strptime(until, date_format)
                        - datetime.timedelta(days=7))
        since = week_earlier.strftime(date_format)

    # Collect the query terms and join them once, so an empty `text` does not
    # leave a stray leading space in the query.
    terms = [text] if text != '' else []
    if username != '':
        terms.append(f"from:{username}")
    terms.append(f"until:{until}")
    terms.append(f"since:{since}")
    if retweet:
        terms.append("exclude:retweets")
    if replies:
        terms.append("exclude:replies")
    q = " ".join(terms)

    # The username is included only when given; the text is always included
    # when there is no username (even if empty), matching the naming scheme
    # "<since>_<until>_<text>.csv".
    name_parts = [since, until]
    if username != '':
        name_parts.append(username)
    if text != '' or username == '':
        name_parts.append(text)

    # All parts come straight from user input: neutralise path separators and
    # other characters that are illegal in file names so the name can neither
    # escape the working directory nor make the later CSV write fail.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    # Kept local (not a module global) so concurrent requests cannot
    # overwrite each other's file name.
    filename = "_".join(name_parts).translate(unsafe_chars) + ".csv"
    print(filename)
    return q, filename
def _new_axes():
    """Return a ``(figure, axes)`` pair created without going through pyplot.

    ``plt.subplots()`` registers each figure with pyplot's global figure
    manager, which keeps it alive until explicitly closed and makes the
    "current axes" shared state between requests handled concurrently.
    Building the Figure directly avoids both problems.
    """
    from matplotlib.figure import Figure

    figure = Figure()
    return figure, figure.subplots()


def _column_figures(values, label, *, xticks=None, count_offset=(0.05, 5),
                    count_fontsize=12, tick_fontsize=None):
    """Build the five distribution figures for one numeric time column.

    Args:
        values: Series holding the column to plot (e.g. the ``Hour`` column).
        label: X-axis label used on the line plot.
        xticks: Optional tick positions applied to every plot except the
            count plot.
        count_offset: ``(dx, dy)`` offset of the count label relative to the
            top-left corner of each bar in the count plot.
        count_fontsize: Font size of the bar count labels.
        tick_fontsize: When given, the count plot's x tick labels are drawn
            in this size and rotated by 45 degrees.

    Returns:
        ``[count plot, line plot, histogram with KDE, KDE, smoothed KDE]``.
    """
    figures = []

    # 1. Bar chart of occurrences, each bar annotated with its count.
    fig, ax = _new_axes()
    sns.countplot(x=values, ax=ax)
    dx, dy = count_offset
    for bar in ax.patches:
        ax.annotate(int(bar.get_height()),
                    (bar.get_x() + dx, bar.get_height() + dy),
                    fontsize=count_fontsize)
    if tick_fontsize is not None:
        ax.tick_params(axis='x', labelsize=tick_fontsize, labelrotation=45)
    figures.append(fig)

    # 2. Line plot of the same counts.
    counts = values.value_counts()
    fig, ax = _new_axes()
    sns.lineplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_xlabel(label)
    ax.set_ylabel('Count')
    if xticks is not None:
        ax.set_xticks(xticks)
    figures.append(fig)

    # 3. Histogram with one bin per integer value and a KDE overlay.
    fig, ax = _new_axes()
    sns.histplot(x=values, stat='count', binwidth=1, kde=True, discrete=True, ax=ax)
    if xticks is not None:
        ax.set_xticks(xticks)
    figures.append(fig)

    # 4./5. Density estimate with the default and a 3x wider bandwidth.
    for bw_adjust in (1, 3):
        fig, ax = _new_axes()
        sns.kdeplot(x=values, fill=True, bw_adjust=bw_adjust, ax=ax)
        if xticks is not None:
            ax.set_xticks(xticks)
        figures.append(fig)

    return figures


def _temporal_figures(df, first_year, last_year):
    """Build all 25 figures: five each for Year, Month, Week, MonthDay, Hour.

    Args:
        df: Tweet frame containing the ``Year``, ``Month``, ``Week``,
            ``MonthDay`` and ``Hour`` columns.
        first_year: First year shown as an x tick on the Year plots.
        last_year: Last year (inclusive) shown as an x tick on the Year plots.

    Returns:
        A flat list of 25 figures in the order Year, Month, Week, MonthDay,
        Hour, which is the order the plot outputs are wired in.
    """
    year_ticks = np.arange(first_year, last_year + 1, 1)
    month_ticks = np.arange(1, 13, 1)

    figures = []
    figures += _column_figures(df['Year'], 'Year', xticks=year_ticks)
    figures += _column_figures(df['Month'], 'Month', xticks=month_ticks)
    # Weeks produce many narrow bars, hence the smaller labels and offsets.
    figures += _column_figures(df['Week'], 'Week', count_offset=(0.005, 1),
                               count_fontsize=10, tick_fontsize=7)
    figures += _column_figures(df['MonthDay'], 'MonthDay', tick_fontsize=10)
    figures += _column_figures(df['Hour'], 'Hour', count_offset=(0.05, 10))
    return figures


def file_viz(dataset):
    """Visualise a previously downloaded tweets CSV.

    Args:
        dataset: Uploaded file object; its ``name`` attribute is the path of
            a CSV with ``Year``, ``Month``, ``Week``, ``MonthDay`` and
            ``Hour`` columns.

    Returns:
        ``[csv_path, fig1, ..., fig25]`` in the order expected by the
        download component and the 25 plot components.

    Raises:
        gr.Error: If no file was uploaded or the CSV contains no rows.
    """
    if dataset is None:
        raise gr.Error("Please upload a CSV file first.")

    tweets_df1 = pd.read_csv(dataset.name)
    print(dataset.name)
    if tweets_df1.empty:
        raise gr.Error("The uploaded CSV file contains no tweets.")

    # The year axis spans exactly the years present in the file.
    first_year = int(tweets_df1['Year'].min())
    last_year = int(tweets_df1['Year'].max())
    figures = _temporal_figures(tweets_df1, first_year, last_year)

    # The frame is written back to the same path that is offered for download.
    tweets_df1.to_csv(dataset.name, index=False)
    return [dataset.name, *figures]


def scrape_tweets(text, username, since, until, retweets, replies, count, progress=gr.Progress()):
    """Scrape tweets matching the inputs, save them as CSV and plot them.

    Args:
        text: Free-text part of the query; may be empty.
        username: Account to restrict the search to; may be empty.
        since: Start date ``YYYY-MM-DD``; may be empty.
        until: End date ``YYYY-MM-DD``; may be empty.
        retweets: When truthy, retweets are excluded.
        replies: When truthy, replies are excluded.
        count: Maximum number of tweets to collect; ``-1`` means no limit.
        progress: Gradio progress tracker wrapped around the scraping loop.

    Returns:
        ``[csv_filename, fig1, ..., fig25]`` in the order expected by the
        download component and the 25 plot components.

    Raises:
        gr.Error: If the query returns no tweets.
    """
    print(text, username, since, until, retweets, replies, count)
    q, filename = search(text, username, since, until, retweets, replies)

    limit = None if count == -1 else count
    rows = []
    for i, tweet in progress.tqdm(enumerate(sntwitter.TwitterSearchScraper(q).get_items())):
        if limit is not None and i >= limit:
            break
        rows.append([tweet.date, tweet.id, tweet.rawContent, tweet.user.username,
                     tweet.lang, tweet.hashtags, tweet.replyCount,
                     tweet.retweetCount, tweet.likeCount, tweet.quoteCount])

    # Without rows the DateTime column is not datetime-typed and the `.dt`
    # accessor below would fail with an obscure error.
    if not rows:
        raise gr.Error("No tweets were found for this query.")

    tweets_df1 = pd.DataFrame(rows, columns=['DateTime', 'TweetId', 'Text', 'Username', 'Language',
                                             'Hashtags', 'ReplyCount', 'RetweetCount', 'LikeCount',
                                             'QuoteCount'])

    # Break the timestamp into the calendar fields that are plotted/exported.
    stamps = tweets_df1['DateTime'].dt
    tweets_df1['Hour'] = stamps.hour
    tweets_df1['Year'] = stamps.year
    tweets_df1['Month'] = stamps.month
    tweets_df1['MonthName'] = stamps.month_name()
    tweets_df1['MonthDay'] = stamps.day
    tweets_df1['DayName'] = stamps.day_name()
    tweets_df1['Week'] = stamps.isocalendar().week
    tweets_df1['Date'] = stamps.date
    tweets_df1['Time'] = stamps.time

    # Fix the exported column order; DateTime is dropped by not listing it.
    tweets_df1 = tweets_df1.reindex(columns=['Date', 'Time', 'Username', 'Text', 'Language', 'Hashtags',
                                             'ReplyCount', 'RetweetCount', 'LikeCount', 'QuoteCount',
                                             'Hour', 'Year', 'Month', 'MonthName', 'MonthDay',
                                             'DayName', 'Week', 'TweetId'])
    tweets_df1.to_csv(f"{filename}", index=False)

    # The year axis follows the requested date range. The date boxes may be
    # left empty (search() then applies its own defaults), in which case the
    # range is taken from the scraped data instead of parsing an empty string.
    first_year = int(since[0:4]) if since else int(tweets_df1['Year'].min())
    last_year = int(until[0:4]) if until else int(tweets_df1['Year'].max())

    return [filename, *_temporal_figures(tweets_df1, first_year, last_year)]
#gr.Markdown("Start typing below and then click **Run** to see the output.")
# NOTE(review): this layout and the event wiring below appear to belong inside
# the `with gr.Blocks(...) as demo:` context opened earlier in the file; the
# indentation here is relative to that context - confirm when re-indenting.

# --- Input tab: query controls, CSV upload and CSV download -----------------
with gr.Tab("Input"):
    with gr.Row():
        text = gr.Textbox(label="Query text to be matched (Optional)", max_lines=1)
        username = gr.Textbox(label="Twitter Username", max_lines=1, value='DataScienceDojo')
    with gr.Row():
        since = gr.Textbox(label="Start Date", placeholder='yyyy-mm-dd', max_lines=1, value='2021-01-01')
        until = gr.Textbox(label="End Date", max_lines=1, placeholder='yyyy-mm-dd', value='2022-12-31')
    with gr.Row():
        retweets = gr.Checkbox(label="Exclude Retweets?", value=True)
        replies = gr.Checkbox(label="Exclude Replies", value=True)
    with gr.Row():
        count = gr.Slider(label="Count (-1 to retrieve all tweets. Increase the count for better visualizations. 5000 Tweets are retrieved in approximately 100s!)", value=-1, minimum=-1, maximum=100000, step=100)
    with gr.Row():
        submit_btn = gr.Button("Submit")
    with gr.Row():
        # The heading is closed with </h1>; an unclosed <h1> leaks heading
        # styling into whatever is rendered after it.
        gr.Markdown("""<h1 style= "text-align:center; z-index: 14; font-family: var(--font_default); font-size: 18px; font-weight: 500; color: rgb(9, 23, 71); opacity: 1;">OR Upload File</h1>""")
    with gr.Row():
        filein = gr.File(label="Upload previously downloaded csv file. Example given below!")
    with gr.Row():
        submit_file = gr.Button("Submit File")
    with gr.Row():
        out0 = gr.File(label="Download CSV of extracted tweets")

# --- One tab per time granularity, five plots each (2 + 2 + 1 rows) ---------
with gr.Tab("Visualization by Hour"):
    with gr.Row():
        out22 = gr.Plot()
        out23 = gr.Plot()
    with gr.Row():
        out24 = gr.Plot()
        out25 = gr.Plot()
    with gr.Row():
        out26 = gr.Plot()
with gr.Tab("Visualization by Day"):
    with gr.Row():
        out17 = gr.Plot()
        out18 = gr.Plot()
    with gr.Row():
        out19 = gr.Plot()
        out20 = gr.Plot()
    with gr.Row():
        out21 = gr.Plot()
with gr.Tab("Visualization by Week"):
    with gr.Row():
        out12 = gr.Plot()
        out13 = gr.Plot()
    with gr.Row():
        out14 = gr.Plot()
        out15 = gr.Plot()
    with gr.Row():
        out16 = gr.Plot()
with gr.Tab("Visualization by Month"):
    with gr.Row():
        out7 = gr.Plot()
        out8 = gr.Plot()
    with gr.Row():
        out9 = gr.Plot()
        out10 = gr.Plot()
    with gr.Row():
        out11 = gr.Plot()
with gr.Tab("Visualization by Year"):
    with gr.Row():
        out2 = gr.Plot()
        out3 = gr.Plot()
    with gr.Row():
        out4 = gr.Plot()
        out5 = gr.Plot()
    with gr.Row():
        out6 = gr.Plot()

# Output components in the order both handlers return their values: the CSV
# file first, then the Year, Month, Week, MonthDay and Hour figures (five
# each). Defined once so the three users below cannot drift apart.
viz_outputs = [out0,
               out2, out3, out4, out5, out6,
               out7, out8, out9, out10, out11,
               out12, out13, out14, out15, out16,
               out17, out18, out19, out20, out21,
               out22, out23, out24, out25, out26]

gr.Examples(
    examples=[["DSD_Tweets.csv"]],
    fn=file_viz,
    inputs=filein,
    outputs=viz_outputs,
)
submit_file.click(fn=file_viz, inputs=filein, outputs=viz_outputs)
submit_btn.click(fn=scrape_tweets, inputs=[text, username, since, until, retweets, replies, count], outputs=viz_outputs)
# Turn on Gradio's request queue (concurrency_count=5) and start serving the app.
demo.queue(concurrency_count=5).launch()