Spaces:

datasciencedojo
/

Twitter-Scraper-with-Time-Series-Visualization

Running

App Files Files Community

Twitter-Scraper-with-Time-Series-Visualization / app.py

datasciencedojo

Update app.py

84f95c3 about 2 years ago

raw

history blame contribute delete

19 kB

	import pandas as pd
	import numpy as np
	import datetime
	import tqdm
	import gradio as gr
	import matplotlib
	import matplotlib.pyplot as plt
	import seaborn as sns
	import snscrape.modules.twitter as sntwitter

	# Agg is Matplotlib's non-interactive raster backend: figures are rendered
	# to image buffers and no GUI window is opened.
	matplotlib.use("Agg")

	# Custom stylesheet handed to gr.Blocks(css=...).
	# Raw string: the backslashes are CSS escapes for the characters "[", "]"
	# and ":" inside class names. In a normal string literal they are invalid
	# Python escape sequences and trigger a SyntaxWarning on recent interpreters.
	css = r"""
	footer {display:none !important}
	.max-h-\[30rem\] {max-height: 15rem !important;}
	.min-h-\[15rem\] {max-height: 5rem !important;}
	.hover\:bg-orange-50:hover {
	--tw-bg-opacity: 1 !important;
	background-color: rgb(229,225,255) !important;
	}
	"""

	with gr.Blocks(title="Twitter Temporal Insights \| Data Science Dojo", css = css) as demo:
	def search(text,username,since,until,retweet,replies):
	    """Build the Twitter search query and the CSV file name for one request.

	    Args:
	        text: Free-text part of the query; may be empty.
	        username: Account added to the query as ``from:<username>``; may be empty.
	        since: Start date as ``yyyy-mm-dd``. Empty means seven days before ``until``.
	        until: End date as ``yyyy-mm-dd``. Empty means today.
	        retweet: When true, ``exclude:retweets`` is appended to the query.
	        replies: When true, ``exclude:replies`` is appended to the query.

	    Returns:
	        A ``(query, filename)`` tuple. ``filename`` has the form
	        ``<since>_<until>[_<username>][_<text>].csv`` with characters that
	        are unsafe in file names replaced by ``_``.

	    Raises:
	        ValueError: If ``since`` is empty and ``until`` is not ``yyyy-mm-dd``.
	    """
	    q = text
	    if username != '':
	        q += f" from:{username}"
	    if until == '':
	        until = datetime.date.today().isoformat()
	    q += f" until:{until}"
	    if since == '':
	        end = datetime.datetime.strptime(until, '%Y-%m-%d')
	        since = (end - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
	    q += f" since:{since}"
	    if retweet:
	        q += " exclude:retweets"
	    if replies:
	        q += " exclude:replies"

	    # The text is always part of the name when no username is given, even if
	    # it is empty, which keeps the historical "<since>_<until>_.csv" form.
	    parts = [since, until]
	    if username != '':
	        parts.append(username)
	    if text != '' or username == '':
	        parts.append(text)

	    # All parts come straight from user-editable text boxes and the name is
	    # later used as a path to write to, so neutralise path separators and
	    # other characters that are not valid in file names.
	    unsafe = '/\\:*?"<>|'
	    stem = "".join(
	        "_" if (ch in unsafe or ord(ch) < 32) else ch
	        for ch in "_".join(parts)
	    )
	    # Kept local (no module-level global): requests are served concurrently
	    # and the name is handed back to the caller through the return value.
	    filename = f"{stem}.csv"
	    print(filename)
	    return q,filename

	# Columns the visualisations read, in the order their figures are returned.
	_TIME_COLUMNS = ('Year', 'Month', 'Week', 'MonthDay', 'Hour')

	def _time_series_figures(tweets_df1):
	    """Build the 25 figures shown in the visualisation tabs.

	    For each of Year, Month, Week, MonthDay and Hour (in that order) five
	    figures are produced: a count bar chart with value labels, a line chart
	    of the counts, a histogram with a KDE overlay, a KDE plot, and a KDE plot
	    with ``bw_adjust=3``.

	    Args:
	        tweets_df1: Non-empty DataFrame containing every column in
	            ``_TIME_COLUMNS``.

	    Returns:
	        List of 25 Matplotlib figures.
	    """
	    # Function-scope import keeps this block self-contained.
	    from matplotlib.figure import Figure

	    def new_axes():
	        # Figures are created directly instead of through pyplot, and every
	        # seaborn call below receives an explicit ``ax``. pyplot keeps a
	        # process-wide "current figure" and a registry of open figures, which
	        # is unsafe when several requests are handled concurrently and keeps
	        # every figure alive unless it is explicitly closed.
	        fig = Figure()
	        return fig, fig.subplots()

	    years = tweets_df1['Year']
	    year_ticks = np.arange(int(years.min()), int(years.max()) + 1, 1)
	    month_ticks = np.arange(1, 13, 1)

	    # column, fixed x ticks (None = automatic), bar-label offset (dx, dy),
	    # bar-label font size, x tick-label font size (None = default, unrotated)
	    units = (
	        ('Year', year_ticks, (0.05, 5), 12, None),
	        ('Month', month_ticks, (0.05, 5), 12, None),
	        ('Week', None, (0.005, 1), 10, 7),
	        ('MonthDay', None, (0.05, 5), 12, 10),
	        ('Hour', None, (0.05, 10), 12, None),
	    )

	    figures = []
	    for column, ticks, (dx, dy), label_size, tick_size in units:
	        values = tweets_df1[column]

	        # 1. Bar chart of counts, each bar labelled with its height.
	        fig, ax = new_axes()
	        sns.countplot(x=values, ax=ax)
	        for bar in ax.patches:
	            height = bar.get_height()
	            ax.annotate(str(int(height)), (bar.get_x() + dx, height + dy), fontsize=label_size)
	        if tick_size is not None:
	            ax.tick_params(axis='x', labelsize=tick_size, labelrotation=45)
	        figures.append(fig)

	        # 2. Line chart of the same counts.
	        counts = values.value_counts()
	        fig, ax = new_axes()
	        sns.lineplot(x=counts.index, y=counts.values, ax=ax)
	        ax.set_xlabel(column)
	        ax.set_ylabel('Count')
	        if ticks is not None:
	            ax.set_xticks(ticks)
	        figures.append(fig)

	        # 3. Histogram (one bin per value) with a KDE overlay.
	        fig, ax = new_axes()
	        sns.histplot(x=values, stat='count', binwidth=1, kde=True, discrete=True, ax=ax)
	        if ticks is not None:
	            ax.set_xticks(ticks)
	        figures.append(fig)

	        # 4. and 5. KDE with the default bandwidth and with a smoother one.
	        for bw_adjust in (1, 3):
	            fig, ax = new_axes()
	            sns.kdeplot(x=values, fill=True, bw_adjust=bw_adjust, ax=ax)
	            if ticks is not None:
	                ax.set_xticks(ticks)
	            figures.append(fig)

	    return figures

	def file_viz(dataset):
	    """Visualise a previously downloaded tweets CSV.

	    Args:
	        dataset: Uploaded file object; its ``name`` attribute is the path of
	            the CSV on disk. ``None`` when nothing was uploaded.

	    Returns:
	        ``[path, fig1, ..., fig25]``: the CSV path followed by the figures
	        from ``_time_series_figures``.

	    Raises:
	        gr.Error: If no file was uploaded, a required column is missing, or
	            the file has no rows.
	    """
	    if dataset is None:
	        raise gr.Error("Please upload a CSV file first.")
	    print(dataset.name)
	    tweets_df1 = pd.read_csv(dataset.name)

	    missing = [column for column in _TIME_COLUMNS if column not in tweets_df1.columns]
	    if missing:
	        raise gr.Error(f"The uploaded file is missing required column(s): {', '.join(missing)}")
	    if tweets_df1.empty:
	        raise gr.Error("The uploaded file contains no rows to visualize.")

	    figures = _time_series_figures(tweets_df1)
	    # Kept from the original behaviour: the parsed frame is written back to
	    # the same path that is returned for download.
	    tweets_df1.to_csv(dataset.name,index=False)

	    return [dataset.name, *figures]

	def scrape_tweets(text,username,since,until,retweets,replies,count,progress=gr.Progress()):
	    """Scrape tweets matching the form inputs, save them as CSV and plot them.

	    Args:
	        text: Free-text part of the query; may be empty.
	        username: Account to restrict the search to; may be empty.
	        since: Start date (``yyyy-mm-dd``) or empty; resolved by ``search``.
	        until: End date (``yyyy-mm-dd``) or empty; resolved by ``search``.
	        retweets: Exclude retweets when true.
	        replies: Exclude replies when true.
	        count: Maximum number of tweets to collect; ``-1`` collects all results.
	        progress: Gradio progress tracker used to report scraping progress.

	    Returns:
	        ``[filename, fig1, ..., fig25]``: the written CSV file name followed
	        by the figures from ``_time_series_figures``.

	    Raises:
	        gr.Error: If the query returned no tweets.
	    """
	    print(text,username,since,until,retweets,replies,count)
	    q,filename = search(text,username,since,until,retweets,replies)

	    # Collect one row per tweet; stop early once ``count`` rows were gathered.
	    rows = []
	    items = sntwitter.TwitterSearchScraper(q).get_items()
	    for i,tweet in progress.tqdm(enumerate(items)):
	        if count != -1 and i >= count:
	            break
	        rows.append([tweet.date, tweet.id, tweet.rawContent, tweet.user.username,tweet.lang,tweet.hashtags,tweet.replyCount,tweet.retweetCount,tweet.likeCount,tweet.quoteCount])

	    # An empty frame has no datetime column, so the ``.dt`` accessors below
	    # would fail with an unhelpful AttributeError.
	    if not rows:
	        raise gr.Error("No tweets matched the query; nothing to save or visualize.")

	    tweets_df1 = pd.DataFrame(rows, columns=['DateTime', 'TweetId', 'Text', 'Username','Language',
	        'Hashtags','ReplyCount','RetweetCount','LikeCount','QuoteCount'])

	    # Derive the time-unit columns used by the visualisations.
	    stamps = tweets_df1['DateTime']
	    tweets_df1['Hour'] = stamps.dt.hour
	    tweets_df1['Year'] = stamps.dt.year
	    tweets_df1['Month'] = stamps.dt.month
	    tweets_df1['MonthName'] = stamps.dt.month_name()
	    tweets_df1['MonthDay'] = stamps.dt.day
	    tweets_df1['DayName'] = stamps.dt.day_name()
	    tweets_df1['Week'] = stamps.dt.isocalendar().week
	    tweets_df1['Date'] = stamps.dt.date
	    tweets_df1['Time'] = stamps.dt.time

	    # Final column order of the CSV; 'DateTime' is left out because it is
	    # replaced by the separate 'Date' and 'Time' columns.
	    tweets_df1 = tweets_df1.reindex(columns=['Date','Time','Username','Text','Language','Hashtags','ReplyCount','RetweetCount','LikeCount','QuoteCount','Hour','Year','Month','MonthName','MonthDay','DayName','Week','TweetId'])

	    tweets_df1.to_csv(f"{filename}",index=False)

	    # Year ticks are derived from the scraped data inside the helper, so a
	    # blank Start/End Date no longer breaks the plots.
	    figures = _time_series_figures(tweets_df1)
	    return [filename, *figures]
	#gr.Markdown("Start typing below and then click Run to see the output.")
	def _plot_tab(title):
	    """Create a tab with five plots in rows of 2, 2 and 1; return them in creation order."""
	    with gr.Tab(title):
	        with gr.Row():
	            first = gr.Plot()
	            second = gr.Plot()
	        with gr.Row():
	            third = gr.Plot()
	            fourth = gr.Plot()
	        with gr.Row():
	            fifth = gr.Plot()
	    return [first, second, third, fourth, fifth]

	# Input tab: scrape form, CSV upload and the download slot for the result.
	with gr.Tab("Input"):
	    with gr.Row():
	        text = gr.Textbox(label="Query text to be matched (Optional)",max_lines=1)
	        username = gr.Textbox(label="Twitter Username",max_lines=1,value = 'DataScienceDojo')
	    with gr.Row():
	        since = gr.Textbox(label="Start Date",placeholder='yyyy-mm-dd',max_lines=1,value = '2021-01-01')
	        until = gr.Textbox(label="End Date",max_lines=1,placeholder='yyyy-mm-dd',value = '2022-12-31')
	    with gr.Row():
	        retweets = gr.Checkbox(label="Exclude Retweets?",value = True)
	        replies = gr.Checkbox(label="Exclude Replies",value = True)
	    with gr.Row():
	        count = gr.Slider(label="Count (-1 to retrieve all tweets. Increase the count for better visualizations. 5000 Tweets are retrieved in approximately 100s!)",value=-1, minimum=-1,maximum = 100000, step=100)
	    with gr.Row():
	        submit_btn = gr.Button("Submit")
	    with gr.Row():
	        # The heading is closed with </h1>; it was previously left unclosed.
	        gr.Markdown("""<h1 style= "text-align:center; z-index: 14; font-family: var(--font_default); font-size: 18px; font-weight: 500; color: rgb(9, 23, 71); opacity: 1;">OR Upload File</h1>""")
	    with gr.Row():
	        filein = gr.File(label = "Upload previously downloaded csv file. Example given below!")
	    with gr.Row():
	        submit_file = gr.Button("Submit File")
	    with gr.Row():
	        out0 = gr.File(label = "Download CSV of extracted tweets")

	# Tabs are displayed from the finest to the coarsest time unit.
	hour_plots = _plot_tab("Visualization by Hour")
	day_plots = _plot_tab("Visualization by Day")
	week_plots = _plot_tab("Visualization by Week")
	month_plots = _plot_tab("Visualization by Month")
	year_plots = _plot_tab("Visualization by Year")

	# file_viz and scrape_tweets both return the CSV first and then the figures
	# in Year, Month, Week, MonthDay, Hour order, i.e. the reverse of the tab
	# order above.
	all_outputs = [out0, *year_plots, *month_plots, *week_plots, *day_plots, *hour_plots]

	gr.Examples(
	    examples=[["DSD_Tweets.csv"]],
	    fn = file_viz,
	    inputs = filein,
	    outputs=all_outputs
	)
	submit_file.click(fn=file_viz,inputs = filein, outputs=all_outputs)
	submit_btn.click(fn=scrape_tweets, inputs=[text,username,since,until,retweets,replies,count], outputs=all_outputs)


	# Enable Gradio's request queue (concurrency_count=5) and start the app.
	# NOTE(review): concurrency_count presumably sets how many queued requests
	# are processed in parallel — confirm against the installed Gradio version.
	demo.queue(concurrency_count=5).launch()