Spaces:

norygano
/

causev

Running

App Files Files Community

causev / plot.py

norygano

Kolloquium

adb4a34 8 months ago

raw

history blame contribute delete

13.8 kB

	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import os
	import umap
	import streamlit as st

	@st.cache_data
	def load_data(file_path):
	    """Read a tab-separated file into a DataFrame.

	    The function is wrapped in ``st.cache_data``, so the read goes through
	    Streamlit's data cache.

	    Args:
	        file_path: Location of the tab-separated file; passed straight to
	            ``pandas.read_csv``.

	    Returns:
	        The parsed ``pandas.DataFrame``.
	    """
	    # ``delimiter`` is pandas' alias for ``sep``.
	    table = pd.read_csv(file_path, delimiter='\t')
	    return table

	class Plot:
	    """Build the Plotly figures for the app from two tab-separated tables.

	    ``data_file`` is read into ``self.df``; every row is counted as one
	    sentence. The code relies on it providing ``subfolder``, ``text_id`` and
	    ``text_date`` columns plus numeric columns prefixed ``indicator_`` and
	    ``cause_`` (``modality_`` columns are optional). ``metadata_file`` is read
	    into ``self.metadata_df`` and must provide ``subfolder`` and ``cause``.
	    """

	    # Label styling shared by every bar chart: centred inside the bar, white.
	    _BAR_TEXT_STYLE = dict(
	        textposition='inside',
	        insidetextanchor='middle',
	        textfont=dict(color='rgb(255, 255, 255)'),
	    )
	    # Values accepted by get_indicator_chart(chart_type=...).
	    _CHART_TYPES = ('total', 'individual', 'year')

	    def __init__(self, data_file='data/feature_matrix.tsv',
	                 metadata_file='data/indicator_cause_sentence_metadata.tsv'):
	        """Load both tables and precompute the columns and totals the charts share.

	        Args:
	            data_file: Path of the tab-separated feature matrix.
	            metadata_file: Path of the tab-separated cause metadata.
	        """
	        self.data_file = data_file
	        self.metadata_file = metadata_file
	        self.df = load_data(self.data_file)  # Cached data loading
	        self.metadata_df = load_data(self.metadata_file)

	        # Resolve the feature column groups once instead of in every chart.
	        self.indicator_columns = [col for col in self.df.columns if col.startswith('indicator_')]
	        self.cause_columns = [col for col in self.df.columns if col.startswith('cause_')]

	        # Year is the first four characters of text_date, kept as a string.
	        self.df['Year'] = self.df['text_date'].astype(str).str[:4]
	        # A sentence "has an indicator" when its indicator columns sum to > 0.
	        self.df['Has_Indicator'] = self.df[self.indicator_columns].sum(axis=1) > 0

	        # Sentence totals reused as denominators by the indicator charts.
	        self.total_sentences_per_year = (
	            self.df.groupby(['Year', 'subfolder']).size().reset_index(name='Total Sentences')
	        )
	        self.total_sentences_per_subfolder = (
	            self.df.groupby('subfolder').size().reset_index(name='Total Sentences')
	        )

	    @staticmethod
	    def _join_active(frame, prefix):
	        """Return, per row, the comma-joined names of columns with a value > 0.

	        ``prefix`` is removed from each column name. Built with a plain loop
	        rather than ``DataFrame.apply`` so an empty frame yields an empty
	        Series instead of an empty DataFrame.
	        """
	        names = [col.replace(prefix, '') for col in frame.columns]
	        joined = [
	            ', '.join(name for name, value in zip(names, row) if value > 0)
	            for row in frame.to_numpy()
	        ]
	        return pd.Series(joined, index=frame.index, dtype=object)

	    @staticmethod
	    def _melt_active(frame, columns, var_name):
	        """Melt ``columns`` to long form, keeping one positive row per (text_id, var_name)."""
	        return (
	            frame[['text_id', 'subfolder'] + columns]
	            .melt(id_vars=['text_id', 'subfolder'], var_name=var_name, value_name='count')
	            .query("count > 0")
	            .drop_duplicates(['text_id', var_name])
	        )

	    @staticmethod
	    def _hex_to_rgba(hex_color, opacity):
	        """Convert a ``#RRGGBB`` colour string into a CSS ``rgba(...)`` string."""
	        red = int(hex_color[1:3], 16)
	        green = int(hex_color[3:5], 16)
	        blue = int(hex_color[5:], 16)
	        return f'rgba({red}, {green}, {blue}, {opacity})'

	    def get_indicator_chart(self, chart_type='total', individual_threshold=5):
	        """Return a bar chart describing indicator usage.

	        Args:
	            chart_type: ``'total'`` plots, per subfolder, the share of
	                sentences that have an indicator; ``'individual'`` plots
	                grouped per-indicator sentence counts per subfolder;
	                ``'year'`` plots total sentences per year and subfolder,
	                labelled with the indicator share.
	            individual_threshold: Only used by ``'individual'``: minimum
	                number of sentences (over all subfolders) an indicator needs
	                to be shown.

	        Returns:
	            A ``plotly`` figure.

	        Raises:
	            ValueError: If ``chart_type`` is not one of the supported values.
	        """
	        # Fail early with a clear message; otherwise ``fig`` would be unbound below.
	        if chart_type not in self._CHART_TYPES:
	            raise ValueError(
	                f"Unknown chart_type {chart_type!r}; expected one of {self._CHART_TYPES}"
	            )

	        if chart_type == 'total':
	            # Share of sentences with an indicator, per subfolder. The inner
	            # merge keeps only subfolders that have at least one such sentence.
	            indicator_counts = (
	                self.df[self.df['Has_Indicator']]
	                .groupby('subfolder').size().reset_index(name='Indicator Count')
	            )
	            total_counts = indicator_counts.merge(self.total_sentences_per_subfolder, on='subfolder')
	            total_counts['Indicator_Share'] = total_counts['Indicator Count'] / total_counts['Total Sentences']
	            total_counts['Indicator_Share_Text'] = (
	                (total_counts['Indicator_Share'] * 100).round(2).astype(str) + '%'
	            )

	            fig = px.bar(
	                total_counts,
	                x='subfolder',
	                y='Indicator_Share',
	                labels={'Indicator_Share': 'Share of Sentences with Indicators', 'subfolder': ''},
	                color='subfolder',
	                text='Indicator_Share_Text',
	                color_discrete_sequence=px.colors.qualitative.D3
	            )
	            fig.update_traces(texttemplate='%{text}', **self._BAR_TEXT_STYLE)

	        elif chart_type == 'individual':
	            # Long format: one row per (sentence, indicator) with a positive value.
	            df_melted = self.df.melt(
	                id_vars=['subfolder'], value_vars=self.indicator_columns,
	                var_name='Indicator', value_name='Count'
	            )
	            df_melted = df_melted[df_melted['Count'] > 0]

	            # The threshold is applied to the count over all subfolders.
	            rows_per_indicator = df_melted.groupby('Indicator').size()
	            indicators_meeting_threshold = rows_per_indicator[rows_per_indicator >= individual_threshold].index

	            # Copy before assigning so the filtered slice is not modified in place.
	            df_melted = df_melted[df_melted['Indicator'].isin(indicators_meeting_threshold)].copy()
	            df_melted['Indicator'] = (
	                df_melted['Indicator'].str.replace('indicator_', '', regex=False).str.capitalize()
	            )

	            # Re-aggregate per subfolder and indicator for the remaining indicators.
	            df_melted = df_melted.groupby(['subfolder', 'Indicator']).size().reset_index(name='Count')

	            fig = px.bar(
	                df_melted,
	                x='subfolder',
	                y='Count',
	                color='Indicator',
	                barmode='group',
	                labels={'Count': 'Occurrences', 'subfolder': '', 'Indicator': 'Indicator'},
	                color_discrete_sequence=px.colors.qualitative.D3
	            )
	            fig.update_traces(texttemplate='%{y}', **self._BAR_TEXT_STYLE)

	        else:  # chart_type == 'year'
	            indicator_counts_per_year = (
	                self.df[self.df['Has_Indicator']]
	                .groupby(['Year', 'subfolder']).size().reset_index(name='Indicator Count')
	            )
	            df_summary = pd.merge(
	                self.total_sentences_per_year, indicator_counts_per_year,
	                on=['Year', 'subfolder'], how='left'
	            )
	            # The left merge leaves NaN where a year/subfolder has no indicator
	            # sentence; treat that as 0 so the label is not rendered as "nan%".
	            df_summary['Indicator Count'] = df_summary['Indicator Count'].fillna(0)
	            df_summary['Indicator_Share_Text'] = (
	                (df_summary['Indicator Count'] / df_summary['Total Sentences'] * 100).round(2).astype(str) + '%'
	            )

	            fig = px.bar(
	                df_summary,
	                x='Year',
	                y='Total Sentences',
	                color='subfolder',
	                labels={'Total Sentences': 'Total Number of Sentences', 'Year': 'Year'},
	                text='Indicator_Share_Text',
	                color_discrete_sequence=px.colors.qualitative.D3
	            )
	            fig.update_traces(texttemplate='%{text}', **self._BAR_TEXT_STYLE)

	        fig.update_layout(
	            xaxis=dict(showline=True),
	            yaxis=dict(title='Indicator Sentences' if chart_type != 'year' else 'Total Sentences'),
	            bargap=0.05,
	            # With chart_type='total' the bars are coloured by the same
	            # 'subfolder' column used for the x axis, so the legend is hidden.
	            showlegend=(chart_type != 'total')
	        )
	        return fig

	    def get_causes_chart(self, min_value=30):
	        """Return a grouped bar chart of cause counts per subfolder.

	        Args:
	            min_value: Minimum number of metadata rows a cause needs (counted
	                before capitalisation) to be included.

	        Returns:
	            A ``plotly`` figure.
	        """
	        causes = self.metadata_df['cause']
	        # Missing causes never pass the threshold filter below (groupby skips
	        # NaN keys); dropping them here just makes that explicit.
	        df_filtered = self.metadata_df[causes.notna() & (causes != 'N/A')]
	        causes_meeting_threshold = (
	            df_filtered.groupby('cause')['cause'].count()[lambda x: x >= min_value].index
	        )
	        # Copy before assigning so the filtered slice is not modified in place.
	        df_filtered = df_filtered[df_filtered['cause'].isin(causes_meeting_threshold)].copy()
	        df_filtered['cause'] = df_filtered['cause'].str.capitalize()

	        fig = px.bar(
	            df_filtered.groupby(['subfolder', 'cause']).size().reset_index(name='Count'),
	            x='subfolder',
	            y='Count',
	            color='cause',
	            barmode='group',
	            labels={'Count': 'Occurrences', 'subfolder': '', 'cause': 'Cause'},
	            color_discrete_sequence=px.colors.qualitative.D3
	        )
	        fig.update_layout(xaxis=dict(showline=True), yaxis=dict(showticklabels=True, title=''))
	        fig.update_traces(texttemplate='%{y}', **self._BAR_TEXT_STYLE)
	        return fig

	    def scatter(self, include_modality=False):
	        """Return a 2-D UMAP scatter plot of the sentence feature vectors.

	        Points are coloured by ``subfolder`` and use one marker symbol per
	        indicator combination; the hover text shows the causes.

	        Args:
	            include_modality: When true, ``modality_`` columns stay in the
	                UMAP input and a ``Modality`` entry is added to the hover text.

	        Returns:
	            A ``plotly`` figure.
	        """
	        # Use self.df to avoid reloading data.
	        has_indicator = self.df[self.indicator_columns].sum(axis=1) > 0
	        has_cause = self.df[self.cause_columns].sum(axis=1) > 0
	        df_filtered = self.df[has_indicator | has_cause]

	        # Exclude the '!besprechen' indicator, then keep indicators whose
	        # column sum is at least 10 and sentences that use one of them.
	        indicator_columns = [col for col in self.indicator_columns if 'indicator_!besprechen' not in col]
	        indicator_counts = df_filtered[indicator_columns].sum()
	        indicators_to_keep = indicator_counts[indicator_counts >= 10].index.tolist()
	        df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]

	        # Exclude non-feature columns for dimensionality reduction.
	        modality_columns = [col for col in self.df.columns if col.startswith('modality_')]
	        columns_to_drop = ['subfolder', 'text_id', 'sentence_id', 'text_date', 'text_source', 'text_text_type']
	        if not include_modality:
	            columns_to_drop += modality_columns

	        # Only float/int columns are kept as features.
	        features = df_filtered.drop(columns=columns_to_drop, errors='ignore').select_dtypes(include=[float, int])
	        features_clean = features.fillna(0)

	        # Prepare metadata for plotting.
	        metadata = df_filtered[['subfolder']].copy()
	        metadata['indicator'] = self._join_active(df_filtered[indicators_to_keep], 'indicator_')
	        metadata['cause'] = self._join_active(df_filtered[self.cause_columns], 'cause_')
	        if include_modality:
	            # The hover config below asks for a 'Modality' column, so it has to
	            # exist. It is built from the same float/int modality columns that
	            # are allowed into the feature matrix above.
	            modality_values = df_filtered[modality_columns].select_dtypes(include=[float, int])
	            metadata['Modality'] = self._join_active(modality_values, 'modality_')

	        # Perform UMAP dimensionality reduction.
	        reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
	        reduced_features = reducer.fit_transform(features_clean)
	        df_reduced = pd.DataFrame(reduced_features, columns=['UMAP x', 'UMAP y'])
	        # Reset the index so the metadata rows line up with the fresh 0..n-1 index.
	        df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)

	        hover_data = {'cause': True, 'UMAP x': False, 'UMAP y': False}
	        if include_modality:
	            hover_data['Modality'] = True

	        fig = px.scatter(
	            df_reduced,
	            x='UMAP x',
	            y='UMAP y',
	            color='subfolder',
	            symbol='indicator',
	            labels={'subfolder': 'Effect'},
	            hover_data=hover_data,
	            color_discrete_sequence=px.colors.qualitative.D3
	        )

	        fig.update_layout(
	            xaxis=dict(showgrid=True),
	            yaxis=dict(showgrid=True),
	            showlegend=True,
	            legend=dict(title="Effect, Indicator", yanchor="top", xanchor="left", borderwidth=1),
	        )

	        return fig

	    def sankey(self, cause_threshold=10, indicator_threshold=5, link_opacity=0.4):
	        """Return a Sankey diagram linking causes to indicators to subfolders.

	        Only sentences with both a cause and an indicator are used.

	        Args:
	            cause_threshold: Minimum number of distinct ``text_id`` values a
	                cause must occur in to get a node.
	            indicator_threshold: Same minimum for indicators.
	            link_opacity: Alpha value of the link colours.

	        Returns:
	            A ``plotly`` figure sized 800x600.
	        """
	        # Use self.df to avoid reloading data.
	        has_cause = self.df[self.cause_columns].sum(axis=1) > 0
	        has_indicator = self.df[self.indicator_columns].sum(axis=1) > 0
	        df_filtered = self.df[has_cause & has_indicator]

	        # Melt causes and indicators separately.
	        # NOTE(review): rows are de-duplicated on text_id, not sentence_id, so
	        # all counts below are per text; confirm this is the intended unit.
	        cause_data = self._melt_active(df_filtered, self.cause_columns, 'cause')
	        indicator_data = self._melt_active(df_filtered, self.indicator_columns, 'indicator')

	        # Apply threshold filters.
	        valid_causes = cause_data['cause'].value_counts()[lambda x: x >= cause_threshold].index
	        valid_indicators = indicator_data['indicator'].value_counts()[lambda x: x >= indicator_threshold].index
	        cause_data = cause_data[cause_data['cause'].isin(valid_causes)]
	        indicator_data = indicator_data[indicator_data['indicator'].isin(valid_indicators)]

	        # Cause -> indicator: pair every cause and indicator sharing a
	        # (text_id, subfolder), then count the pairs.
	        cause_indicator_links = (
	            cause_data.merge(indicator_data, on=['text_id', 'subfolder'])
	            .groupby(['cause', 'indicator']).size().reset_index(name='count')
	        )

	        # Indicator -> subfolder counts.
	        indicator_subfolder_links = (
	            indicator_data.groupby(['indicator', 'subfolder']).size().reset_index(name='count')
	        )

	        # Node order: causes, then indicators, then every subfolder in self.df.
	        all_labels = list(valid_causes) + list(valid_indicators) + self.df['subfolder'].unique().tolist()

	        # Links are resolved against the prefixed names; the prefixes are only
	        # removed for display.
	        all_labels_cleaned = [label.replace("cause_", "").replace("indicator_", "") for label in all_labels]
	        label_to_index = {label: idx for idx, label in enumerate(all_labels)}

	        # Cycle through Plotly's D3 palette for the node colours.
	        color_palette = px.colors.qualitative.D3
	        node_colors = [color_palette[i % len(color_palette)] for i in range(len(all_labels))]

	        sources, targets, values, link_colors = [], [], [], []

	        def add_links(links, source_column, target_column):
	            """Append one Sankey link per row whose two endpoints are known nodes."""
	            rows = zip(links[source_column], links[target_column], links['count'])
	            for source_label, target_label, count in rows:
	                if source_label in label_to_index and target_label in label_to_index:
	                    source_idx = label_to_index[source_label]
	                    sources.append(source_idx)
	                    targets.append(label_to_index[target_label])
	                    values.append(int(count))
	                    # A link takes its source node's colour, made translucent.
	                    link_colors.append(self._hex_to_rgba(node_colors[source_idx], link_opacity))

	        add_links(cause_indicator_links, 'cause', 'indicator')
	        add_links(indicator_subfolder_links, 'indicator', 'subfolder')

	        fig = go.Figure(data=[go.Sankey(
	            node=dict(
	                pad=15,
	                thickness=20,
	                line=dict(color="black", width=0.5),
	                label=all_labels_cleaned,
	                color=node_colors
	            ),
	            link=dict(
	                source=sources,
	                target=targets,
	                value=values,
	                color=link_colors
	            )
	        )])

	        fig.update_layout(
	            autosize=False,
	            width=800,
	            height=600,
	            font=dict(size=10)
	        )

	        return fig