Spaces:

sentdiario
/

Sentdiario

Sleeping

App Files Files Community

Sentdiario / index.py

thak123

Update index.py

3c0d911 verified 8 months ago

raw

history blame

13.9 kB

	import dash
	from dash import Dash, html, dcc, callback, Output, Input
	from dash import dash_table
	import plotly.express as px

	from app import app
	import pandas as pd

	import datetime
	import requests
	from io import StringIO
	from datetime import date

	import dash_bootstrap_components as dbc
	import plotly.express as px

	# WSGI entry point taken from the Dash app object imported above.
	server = app.server

	# Turn the Google Drive "share" link into a direct-download link by
	# extracting the file id (second-to-last path segment).
	url='https://drive.google.com/file/d/1NaXOYHQFF5UO5rQr4rn8Lr3bkYMSOq4_/view?usp=sharing'
	url='https://drive.google.com/uc?id=' + url.split('/')[-2]

	# Load the headline dataset.
	df = pd.read_csv(url)

	# Keep only headlines with more than 4 whitespace-separated tokens.
	# `.copy()` makes the result an independent frame, so the column
	# assignments below write to `df` itself rather than to a slice view.
	df = df[df['Headline'].str.split().str.len().gt(4)].copy()

	df['date'] = pd.to_datetime(df['date'])

	unique_domains = df['domain_folder_name'].unique()
	print(unique_domains)

	unique_topics = df['Topic'].unique()
	print(unique_topics)

	# Duplicate the outlet column under the display name used as the chart legend.
	df["Veículos de notícias"] = df["domain_folder_name"]

	# Map the numeric class codes (as read from the CSV) to text labels.
	# The result is assigned back instead of calling `replace(inplace=True)`
	# on the column selection, which does not update `df` when pandas
	# Copy-on-Write is active.
	df['FinBERT_label'] = df['FinBERT_label'].astype(str).replace({
	    '3.0': 'positive',
	    '2.0': 'neutral',
	    '1.0': 'negative',
	})



	# Number of headlines per (date, topic, outlet, sentiment label).
	counts = df.groupby(['date', 'Topic', 'domain_folder_name', 'FinBERT_label']).size().reset_index(name='count')
	counts['count'] = counts['count'].astype('float64')

	# 30-row rolling mean of the counts, computed separately for every
	# (topic, outlet, label) series. Rolling over the whole table would let the
	# window slide across rows that belong to unrelated series.
	counts['rolling_mean_counts'] = (
	    counts.groupby(['Topic', 'domain_folder_name', 'FinBERT_label'])['count']
	    .transform(lambda series: series.rolling(window=30, min_periods=2).mean())
	)

	# Per-sentiment views of the aggregated counts.
	df_pos = counts[counts['FinBERT_label'] == 'positive']
	df_neu = counts[counts['FinBERT_label'] == 'neutral']
	df_neg = counts[counts['FinBERT_label'] == 'negative']



	# First and last calendar day present in the dataset; they bound the date picker.
	_first_day = df['date'].min().date()
	_last_day = df['date'].max().date()


	def _label_row(text):
	    """Return a row holding a single bold form label."""
	    return dbc.Row([dbc.Label(text, className="fw-bold")])


	def _dropdown_row(component_id, choices, initial):
	    """Return a row with a half-width dropdown offering ``choices``."""
	    selector = dcc.Dropdown(
	        id=component_id,
	        options=[{"label": choice, "value": choice} for choice in choices],
	        value=initial,
	        style={"width": "50%"},
	    )
	    return dbc.Row([dbc.Col(selector)])


	def _graph_row(component_id):
	    """Return a row with one graph placeholder filled in by the callback."""
	    return dbc.Row([dbc.Col(dcc.Graph(id=component_id))])


	# Paginated table listing the headlines behind the charts.
	_headlines_table = dash_table.DataTable(
	    id='headlines-table',
	    columns=[
	        {"name": "Headline", "id": "Headline"},
	        {"name": "URL", "id": "url"},
	        {"name": "Date", "id": "date"},
	        {"name": "Sentiment Label", "id": "FinBERT_label"},
	    ],
	    style_table={'overflowX': 'auto'},
	    style_cell={
	        'textAlign': 'left',
	        'whiteSpace': 'normal',
	        'height': 'auto',
	        'minWidth': '180px', 'width': '180px', 'maxWidth': '180px',
	    },
	    page_current=0,
	    page_size=10,
	)

	# Page layout, top to bottom: title, date range, topic selector and its
	# three charts, outlet selector and its two charts, headlines table.
	app.layout = dbc.Container([
	    dbc.Row([
	        dbc.Col([html.H1('Evolução temporal de sentimento em títulos de notícias')],
	                className="text-center mt-3 mb-1"),
	    ]),

	    _label_row("Selecione um período (mm/dd/aaaa):"),
	    dbc.Row([
	        dcc.DatePickerRange(
	            id='date-range',
	            min_date_allowed=_first_day,
	            max_date_allowed=_last_day,
	            initial_visible_month=_first_day,
	            start_date=_first_day,
	            end_date=_last_day,
	        ),
	    ]),

	    _label_row("Escolha um tópico:"),
	    _dropdown_row("topic-selector", unique_topics, "Imigrantes"),
	    _graph_row('line-graph-1'),
	    _graph_row("bar-graph-1"),
	    _graph_row('pie-graph-1'),

	    _label_row("Escolha um site de notícias:"),
	    _dropdown_row("domain-selector", unique_domains, "expresso-pt"),
	    _graph_row('line-graph-2'),
	    _graph_row('pie-graph-2'),

	    dbc.Row([dbc.Col(_headlines_table)]),
	])

	# # Create a function to generate pie charts
	# def generate_pie_chart(category):
	# labels = data[category]['labels']
	# values = data[category]['values']
	# trace = go.Pie(labels=labels, values=values)
	# layout = go.Layout(title=f'Pie Chart - {category}')
	# return dcc.Graph(
	# figure={
	# 'data': [trace],
	# 'layout': layout
	# }
	# )

	# callback decorator
	# Display colour for each translated sentiment label.
	_SENTIMENT_COLORS = {'positivo': '#039a4d', 'neutro': '#3c03f4', 'negativo': '#ca3919'}
	# Portuguese display names for the sentiment labels stored in the data.
	_LABEL_TRANSLATION = {'positive': 'positivo', 'neutral': 'neutro', 'negative': 'negativo'}
	# Sort order of the sentiment labels in the headlines table.
	_SENTIMENT_ORDER = ['positive', 'neutral', 'negative']


	def _sentiment_line(articles):
	    """Build the line chart of 'normalised results' over time, one line per outlet."""
	    # Rows are ordered by date so each line is drawn left to right.
	    fig = px.line(articles.sort_values('date', kind='stable'), x="date", y="normalised results",
	                  color='Veículos de notícias', title="O gráfico mostra a evolução temporal de sentimento dos títulos de notícias. Numa escala de -1 (negativo) a 1 (positivo), sendo 0 (neutro).")
	    fig.update_layout(
	        xaxis_title='Data',
	        yaxis_title='Classificação de Sentimento')
	    fig.update_xaxes(tickformat="%b %d<br>%Y")
	    return fig


	def _monthly_volume_bar(articles):
	    """Build the bar chart of article counts per calendar month and outlet."""
	    outlet_col = 'Veículos de notícias'

	    # Bucket every article into the first day of its month so the counts
	    # line up with the month-start grid built below.
	    months = articles['date'].dt.to_period('M').dt.to_timestamp()
	    per_month = articles.groupby([months, outlet_col]).size()

	    # Every (month, outlet) pair in the filtered span; pairs without
	    # articles are filled with zero.
	    every_month = pd.date_range(start=months.min(), end=months.max(), freq='MS')
	    full_index = pd.MultiIndex.from_product([every_month, articles[outlet_col].unique()],
	                                            names=['date', outlet_col])
	    grouped_df = per_month.reindex(full_index, fill_value=0).reset_index(name='occurrences')

	    fig = px.bar(grouped_df, x='date', y='occurrences', color='Veículos de notícias',
	                 labels={'date': 'Período', 'occurrences': 'Número de notícias', 'Veículos de notícias': 'Portal'},
	                 title='Número de notícias por período de tempo')
	    fig.update_xaxes(tickformat="%b %d<br>%Y")
	    return fig


	def _daily_label_line(selected_topic, selected_domain, start_date, end_date):
	    """Build the line chart of daily article counts per sentiment label for one topic and outlet."""
	    keys = ['FinBERT_label', 'Topic', 'domain_folder_name', 'date']

	    observed = counts[(counts['Topic'] == selected_topic) &
	                      (counts['domain_folder_name'] == selected_domain) &
	                      (counts['date'] >= start_date) &
	                      (counts['date'] <= end_date)]

	    # One row per (label, day) in the selected period, so days without
	    # articles appear as zero instead of being skipped.
	    days = pd.date_range(start=start_date, end=end_date)
	    all_combinations = pd.MultiIndex.from_product(
	        [_SENTIMENT_ORDER, [selected_topic], [selected_domain], days], names=keys)
	    grid = pd.DataFrame(index=all_combinations).reset_index()

	    merged_df = pd.merge(grid, observed, on=keys, how='left')
	    # Assign the filled columns back; an in-place fillna on a column
	    # selection does not reliably update the parent frame.
	    for column in ('count', 'rolling_mean_counts'):
	        merged_df[column] = merged_df[column].fillna(0)

	    fig = px.line(merged_df, x="date", y="count", color="FinBERT_label",
	                  line_group="FinBERT_label", title="Sentiment Over Time",
	                  labels={"count": "Number of News Articles", "date": "Date"})
	    fig.update_layout(xaxis_title='Date', yaxis_title='Number of News Articles',
	                      xaxis=dict(tickformat="%b %d<br>%Y"))
	    return fig


	def _sentiment_pie(labels, title):
	    """Build a pie chart of the percentage share of each translated sentiment label."""
	    tally = labels.value_counts()
	    share = (tally / tally.sum()) * 100
	    # Colours are looked up per label so a label keeps its colour no matter
	    # how frequent it is.
	    return px.pie(
	        values=share,
	        names=share.index,
	        title=title,
	        color_discrete_sequence=[_SENTIMENT_COLORS[label] for label in share.index]
	    )


	@app.callback(
	    Output('line-graph-1', 'figure'),
	    Output('bar-graph-1', 'figure'),
	    Output('pie-graph-1', 'figure'),
	    Output('line-graph-2', 'figure'),
	    Output('pie-graph-2', 'figure'),
	    Output('headlines-table', 'data'),
	    Input("topic-selector", "value"),
	    Input("domain-selector", "value"),
	    Input('date-range', 'start_date'),
	    Input('date-range', 'end_date')
	)
	def update_output(selected_topic, selected_domain, start_date, end_date):
	    """Refresh every chart and the headlines table from the current selections.

	    Args:
	        selected_topic: value of the topic dropdown.
	        selected_domain: value of the news-outlet dropdown.
	        start_date: start of the date picker range; ``None`` when cleared.
	        end_date: end of the date picker range; ``None`` when cleared.

	    Returns:
	        A 6-tuple matching the declared outputs: topic line chart, monthly
	        volume bar chart, overall sentiment pie, per-outlet daily line chart,
	        per-outlet sentiment pie and the table rows (list of dicts). When no
	        article matches the topic and period, five empty figures and an
	        empty list are returned.
	    """
	    #log
	    print("topic",selected_topic,"domain",selected_domain,"start", start_date,"date", end_date)

	    # A cleared date picker sends None; fall back to the dataset bounds
	    # instead of comparing datetimes with None.
	    if start_date is None:
	        start_date = df['date'].min().normalize()
	    if end_date is None:
	        end_date = df['date'].max().normalize()

	    # Articles of the selected topic inside the period. Copied so the
	    # derived frames below never write into a slice of the global frame.
	    in_scope = ((df["Topic"] == selected_topic) & (df['date'] >= start_date) & (df['date'] <= end_date))
	    articles = df.loc[in_scope].copy()
	    print(articles.shape)

	    if articles.empty:
	        # The table's `data` prop expects a list of rows, not a figure dict.
	        return tuple({'data': []} for _ in range(5)) + ([],)

	    line_fig_1 = _sentiment_line(articles)
	    bar_fig_1 = _monthly_volume_bar(articles)
	    line_fig_2 = _daily_label_line(selected_topic, selected_domain, start_date, end_date)

	    # Sentiment shares, overall and for the selected outlet only.
	    translated = articles['FinBERT_label'].map(_LABEL_TRANSLATION)
	    pie_chart_1 = _sentiment_pie(translated, 'Distribuição Geral')
	    pie_chart_2 = _sentiment_pie(
	        translated[articles['Veículos de notícias'] == selected_domain],
	        f'Distribuição para {selected_domain}')

	    # Table rows ordered positive -> neutral -> negative, then by date.
	    ordered_labels = pd.Categorical(articles['FinBERT_label'],
	                                    categories=_SENTIMENT_ORDER,
	                                    ordered=True)
	    data_table_1 = articles.assign(FinBERT_label=ordered_labels).sort_values(by=['FinBERT_label', 'date'])

	    return line_fig_1, bar_fig_1, pie_chart_1, line_fig_2, pie_chart_2, data_table_1.to_dict('records')

	# return line_fig_1



	# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')



	# app.layout = html.Div([
	# html.H1(children='Title of Dash App', style={'textAlign':'center'}),
	# dcc.Dropdown(df.country.unique(), 'Canada', id='dropdown-selection'),
	# dcc.Graph(id='graph-content')
	# ])

	# @callback(
	# Output('graph-content', 'figure'),
	# Input('dropdown-selection', 'value')
	# )
	# def update_graph(value):
	# dff = df[df.country==value]
	# return px.line(dff, x='year', y='pop')

	# # Define callback function for updating the headlines table
	# @app.callback(
	# Output('headlines-table', 'data'),
	# Input("topic-selector", "value"),
	# Input("domain-selector", "value"),
	# Input('date-range', 'start_date'),
	# Input('date-range', 'end_date')
	# )
	# def update_headlines_table(selected_topic, selected_domain, start_date, end_date):
	# # Filtering data...


	if __name__ == '__main__':
	    # Start the development server when the file is executed directly.
	    # Recent Dash releases expose `run` and treat `run_server` as obsolete;
	    # older releases only provide `run_server`, so fall back to it.
	    launch = getattr(app, 'run', None) or app.run_server
	    launch(debug=True)