|
import pandas as pd |
|
import numpy as np |
|
import datetime |
|
import tqdm |
|
import gradio as gr |
|
import matplotlib |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import snscrape.modules.twitter as sntwitter |
|
|
|
# Select the non-interactive Agg backend: figures are only rendered to images
# for the web UI, so no GUI toolkit or display is needed.
matplotlib.use("Agg")
|
|
|
# Custom stylesheet passed to gr.Blocks(css=...).
#
# This is a raw string on purpose: the selectors contain backslash-escaped
# characters (\[, \] and \:) which are not valid Python escape sequences. In a
# normal string literal they trigger an invalid-escape warning; as a raw string
# the resulting text is exactly the same and no warning is emitted.
#
# NOTE(review): the `.min-h-[15rem]` rule sets `max-height`, not `min-height` --
# confirm that this is intentional.
css = r"""
footer {display:none !important}
.max-h-\[30rem\] {max-height: 15rem !important;}
.min-h-\[15rem\] {max-height: 5rem !important;}
.hover\:bg-orange-50:hover {
    --tw-bg-opacity: 1 !important;
    background-color: rgb(229,225,255) !important;
}
"""
|
|
|
with gr.Blocks(title="Twitter Temporal Insights | Data Science Dojo", css = css) as demo: |
|
# Characters that must never reach the file system as part of a file name:
# path separators plus the characters Windows reserves.
_UNSAFE_FILENAME_CHARS = frozenset('\\/:*?"<>|')


def _safe_filename(name):
    """Return *name* with path separators, reserved and control characters replaced by ``_``."""
    return "".join(
        "_" if ch in _UNSAFE_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in name
    )


def search(text, username, since, until, retweet, replies):
    """Build the Twitter search query and the CSV file name for a request.

    Args:
        text: Free-text query; may be empty.
        username: Account to restrict the search to (``from:``); may be empty.
        since: Start date as ``yyyy-mm-dd``. When empty, defaults to seven
            days before ``until``.
        until: End date as ``yyyy-mm-dd``. When empty, defaults to today.
        retweet: When truthy, ``exclude:retweets`` is appended to the query.
        replies: When truthy, ``exclude:replies`` is appended to the query.

    Returns:
        A ``(query, filename)`` tuple. ``filename`` has the form
        ``{since}_{until}[_{username}][_{text}].csv`` with unsafe characters
        replaced by ``_``.

    Raises:
        ValueError: If ``since`` is empty and ``until`` is not a valid
            ``yyyy-mm-dd`` date.
    """
    query = text
    if username:
        query += f" from:{username}"

    if not until:
        until = datetime.date.today().isoformat()
    query += f" until:{until}"

    if not since:
        end_date = datetime.datetime.strptime(until, "%Y-%m-%d")
        since = (end_date - datetime.timedelta(days=7)).strftime("%Y-%m-%d")
    query += f" since:{since}"

    if retweet:
        query += " exclude:retweets"
    if replies:
        query += " exclude:replies"

    if username and text:
        stem = f"{since}_{until}_{username}_{text}"
    elif username:
        stem = f"{since}_{until}_{username}"
    else:
        stem = f"{since}_{until}_{text}"

    # Every component comes straight from user input; sanitising keeps a value
    # such as "../../x" from steering the CSV outside the working directory.
    # The name is returned rather than stored in a module global, so
    # concurrent requests cannot overwrite each other's value.
    filename = _safe_filename(f"{stem}.csv")
    print(filename)
    return query, filename
|
|
|
# Time-unit columns the charts are built from, in the order their figures are
# returned (five figures per column).
_TIME_COLUMNS = ("Year", "Month", "Week", "MonthDay", "Hour")

# Count-plot styling per column:
#   (bar-label offset (dx, dy) in data units, bar-label font size,
#    x tick-label font size, or None to leave the tick labels untouched).
# Columns with a tick-label size also get their tick labels rotated by 45 degrees.
_COUNT_PLOT_STYLE = {
    "Year": ((0.05, 5), 12, None),
    "Month": ((0.05, 5), 12, None),
    "Week": ((0.005, 1), 10, 7),
    "MonthDay": ((0.05, 5), 12, 10),
    "Hour": ((0.05, 10), 12, None),
}

# Column order of the CSV written by scrape_tweets.
_CSV_COLUMNS = [
    "Date", "Time", "Username", "Text", "Language", "Hashtags",
    "ReplyCount", "RetweetCount", "LikeCount", "QuoteCount",
    "Hour", "Year", "Month", "MonthName", "MonthDay", "DayName", "Week",
    "TweetId",
]


def _new_axes():
    """Return a fresh ``(figure, axes)`` pair that pyplot does not track.

    Figures created through ``plt.subplots()`` stay in pyplot's global registry
    until they are closed explicitly. Each request builds 25 figures, so a
    long-running server would accumulate them; constructing the ``Figure``
    directly avoids that.
    """
    fig = matplotlib.figure.Figure()
    return fig, fig.subplots()


def _charts_for_column(values, style, xticks=None):
    """Build the five charts for one time-unit column.

    Args:
        values: The column (a named pandas Series) to visualise.
        style: ``(label_offset, label_fontsize, tick_labelsize)`` entry from
            ``_COUNT_PLOT_STYLE``.
        xticks: Optional tick positions applied to every chart except the
            count plot.

    Returns:
        ``[count plot, line plot, histogram with KDE, KDE, smoothed KDE]``.
    """
    (dx, dy), label_size, tick_label_size = style
    charts = []
    numeric_axes = []

    # The axes is passed explicitly everywhere instead of relying on pyplot's
    # implicit "current axes", which is shared by all concurrent handlers.
    fig, ax = _new_axes()
    sns.countplot(x=values, ax=ax)
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(
            str(int(height)),
            (bar.get_x() + dx, height + dy),
            fontsize=label_size,
        )
    if tick_label_size is not None:
        ax.tick_params(axis="x", labelsize=tick_label_size, labelrotation=45)
    charts.append(fig)

    frequencies = values.value_counts()
    fig, ax = _new_axes()
    sns.lineplot(x=frequencies.index, y=frequencies.values, ax=ax)
    ax.set_xlabel(values.name)
    ax.set_ylabel("Count")
    charts.append(fig)
    numeric_axes.append(ax)

    fig, ax = _new_axes()
    sns.histplot(
        x=values, stat="count", binwidth=1, kde=True, discrete=True, ax=ax
    )
    charts.append(fig)
    numeric_axes.append(ax)

    # Plain KDE (bw_adjust=1 is seaborn's default) and a heavily smoothed one.
    for bandwidth_factor in (1, 3):
        fig, ax = _new_axes()
        sns.kdeplot(x=values, fill=True, bw_adjust=bandwidth_factor, ax=ax)
        charts.append(fig)
        numeric_axes.append(ax)

    if xticks is not None:
        for ax in numeric_axes:
            ax.set_xticks(xticks)
    return charts


def _build_charts(tweets_df):
    """Return the 25 figures (five per column of ``_TIME_COLUMNS``) for *tweets_df*."""
    years = tweets_df["Year"]
    xticks_by_column = {
        # One tick per calendar year covered by the data.
        "Year": np.arange(int(years.min()), int(years.max()) + 1),
        "Month": np.arange(1, 13),
    }
    charts = []
    for column in _TIME_COLUMNS:
        charts.extend(
            _charts_for_column(
                tweets_df[column],
                _COUNT_PLOT_STYLE[column],
                xticks_by_column.get(column),
            )
        )
    return charts


def file_viz(dataset):
    """Visualise a previously downloaded tweets CSV.

    Args:
        dataset: Uploaded file object from the ``gr.File`` input; only its
            ``.name`` path is used.

    Returns:
        A list of 26 items: the CSV path followed by the 25 figures, ordered
        Year, Month, Week, MonthDay, Hour (five figures each).

    Raises:
        gr.Error: If no file was uploaded, the CSV lacks one of the
            ``_TIME_COLUMNS`` columns, or it has no rows.
    """
    if dataset is None:
        raise gr.Error("Please upload a CSV file first.")

    tweets_df1 = pd.read_csv(dataset.name)
    print(dataset.name)

    missing = [col for col in _TIME_COLUMNS if col not in tweets_df1.columns]
    if missing:
        raise gr.Error(
            "The CSV is missing required column(s): " + ", ".join(missing)
        )
    if tweets_df1.empty:
        raise gr.Error("The CSV contains no rows to visualize.")

    charts = _build_charts(tweets_df1)

    # Kept from the original flow: the frame is written back to the same path
    # that is then offered for download.
    tweets_df1.to_csv(dataset.name, index=False)
    return [dataset.name, *charts]


def scrape_tweets(text, username, since, until, retweets, replies, count, progress=gr.Progress()):
    """Scrape tweets matching the request, save them to CSV and chart them.

    Args:
        text, username, since, until, retweets, replies: Passed unchanged to
            ``search`` to build the query and the CSV file name.
        count: Maximum number of tweets to collect; ``-1`` means no limit.
        progress: Gradio progress tracker; ``progress.tqdm`` wraps the tweet
            iterator. It must stay a default argument for Gradio to inject it.

    Returns:
        A list of 26 items: the CSV file name followed by the 25 figures,
        ordered Year, Month, Week, MonthDay, Hour (five figures each).

    Raises:
        gr.Error: If the search yields no tweets.
    """
    print(text, username, since, until, retweets, replies, count)
    q, filename = search(text, username, since, until, retweets, replies)

    rows = []
    tweets = sntwitter.TwitterSearchScraper(q).get_items()
    for i, tweet in progress.tqdm(enumerate(tweets)):
        if count != -1 and i >= count:
            break
        rows.append([
            tweet.date, tweet.id, tweet.rawContent, tweet.user.username,
            tweet.lang, tweet.hashtags, tweet.replyCount, tweet.retweetCount,
            tweet.likeCount, tweet.quoteCount,
        ])

    if not rows:
        # Without this guard the .dt accessor below fails on an empty frame.
        raise gr.Error(
            "No tweets matched the search. Try different terms or a wider date range."
        )

    tweets_df1 = pd.DataFrame(
        rows,
        columns=[
            "DateTime", "TweetId", "Text", "Username", "Language", "Hashtags",
            "ReplyCount", "RetweetCount", "LikeCount", "QuoteCount",
        ],
    )

    # Split the timestamp into the calendar fields the charts and CSV use.
    stamps = tweets_df1.pop("DateTime")
    tweets_df1["Hour"] = stamps.dt.hour
    tweets_df1["Year"] = stamps.dt.year
    tweets_df1["Month"] = stamps.dt.month
    tweets_df1["MonthName"] = stamps.dt.month_name()
    tweets_df1["MonthDay"] = stamps.dt.day
    tweets_df1["DayName"] = stamps.dt.day_name()
    tweets_df1["Week"] = stamps.dt.isocalendar().week
    tweets_df1["Date"] = stamps.dt.date
    tweets_df1["Time"] = stamps.dt.time

    tweets_df1 = tweets_df1.reindex(columns=_CSV_COLUMNS)
    tweets_df1.to_csv(filename, index=False)

    # The year axis is derived from the scraped data (as file_viz does) rather
    # than from the raw `since`/`until` inputs, which may be empty strings.
    return [filename, *_build_charts(tweets_df1)]
|
|
|
# UI layout, event wiring and launch.
# NOTE(review): the original indentation was lost; the nesting below (rows
# inside tabs, Examples below the tabs) is inferred -- confirm against the app.


def _plot_tab(title):
    """Create one visualization tab with five plots laid out as 2 + 2 + 1.

    Returns the five ``gr.Plot`` components in creation order.
    """
    with gr.Tab(title):
        with gr.Row():
            first = gr.Plot()
            second = gr.Plot()
        with gr.Row():
            third = gr.Plot()
            fourth = gr.Plot()
        with gr.Row():
            fifth = gr.Plot()
    return [first, second, third, fourth, fifth]


with gr.Tab("Input"):
    with gr.Row():
        text = gr.Textbox(label="Query text to be matched (Optional)", max_lines=1)
        username = gr.Textbox(label="Twitter Username", max_lines=1, value="DataScienceDojo")
    with gr.Row():
        since = gr.Textbox(label="Start Date", placeholder="yyyy-mm-dd", max_lines=1, value="2021-01-01")
        until = gr.Textbox(label="End Date", max_lines=1, placeholder="yyyy-mm-dd", value="2022-12-31")
    with gr.Row():
        retweets = gr.Checkbox(label="Exclude Retweets?", value=True)
        replies = gr.Checkbox(label="Exclude Replies", value=True)
    with gr.Row():
        count = gr.Slider(
            label="Count (-1 to retrieve all tweets. Increase the count for better visualizations. 5000 Tweets are retrieved in approximately 100s!)",
            value=-1,
            minimum=-1,
            maximum=100000,
            step=100,
        )
    with gr.Row():
        submit_btn = gr.Button("Submit")
    with gr.Row():
        # The heading is closed with </h1>; the original repeated the opening tag.
        gr.Markdown("""<h1 style= "text-align:center; z-index: 14; font-family: var(--font_default); font-size: 18px; font-weight: 500; color: rgb(9, 23, 71); opacity: 1;">OR Upload File</h1>""")
    with gr.Row():
        filein = gr.File(label="Upload previously downloaded csv file. Example given below!")
    with gr.Row():
        submit_file = gr.Button("Submit File")
    with gr.Row():
        out0 = gr.File(label="Download CSV of extracted tweets")

# Tabs are created in display order (Hour first) ...
hour_plots = _plot_tab("Visualization by Hour")
day_plots = _plot_tab("Visualization by Day")
week_plots = _plot_tab("Visualization by Week")
month_plots = _plot_tab("Visualization by Month")
year_plots = _plot_tab("Visualization by Year")

# ... while the handlers return the CSV first and then the figures grouped as
# Year, Month, Week, MonthDay, Hour, so the output list runs the other way.
all_outputs = [
    out0,
    *year_plots,
    *month_plots,
    *week_plots,
    *day_plots,
    *hour_plots,
]

gr.Examples(
    examples=[["DSD_Tweets.csv"]],
    fn=file_viz,
    inputs=filein,
    outputs=all_outputs,
)
submit_file.click(fn=file_viz, inputs=filein, outputs=all_outputs)
submit_btn.click(
    fn=scrape_tweets,
    inputs=[text, username, since, until, retweets, replies, count],
    outputs=all_outputs,
)

# Queue requests so that up to five handlers can run at the same time.
demo.queue(concurrency_count=5).launch()
|
|