# Spaces: norygano/causev (Running)
# File size: 13,758 Bytes

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import umap
import streamlit as st

@st.cache_data
def load_data(file_path):
    """Read a tab-separated file into a pandas DataFrame.

    The function is wrapped in ``st.cache_data``, so repeated calls with the
    same ``file_path`` are served by Streamlit's data cache.

    Args:
        file_path: Path (or any source ``pandas.read_csv`` accepts) of the
            TSV file to load.

    Returns:
        The parsed table as a ``pandas.DataFrame``.
    """
    table = pd.read_csv(file_path, sep='\t')
    return table

class Plot:
    """Figure factory for the indicator/cause feature matrix.

    On construction the feature matrix and the sentence metadata are loaded
    through ``load_data``; a few derived columns and totals are computed once
    so the chart methods can reuse them.

    Attributes:
        df: Feature matrix, extended with the ``Year`` and ``Has_Indicator``
            columns.
        metadata_df: Metadata table (read for its ``cause`` and ``subfolder``
            columns).
        indicator_columns: Feature-matrix columns starting with ``indicator_``.
        cause_columns: Feature-matrix columns starting with ``cause_``.
        total_sentences_per_year: Row counts per (``Year``, ``subfolder``).
        total_sentences_per_subfolder: Row counts per ``subfolder``.
    """

    _CHART_TYPES = ('total', 'individual', 'year')
    # Scatter-plot filtering constants (kept at their original values).
    _SCATTER_EXCLUDED_INDICATOR = 'indicator_!besprechen'
    _SCATTER_MIN_INDICATOR_COUNT = 10

    def __init__(self, data_file='data/feature_matrix.tsv', metadata_file='data/indicator_cause_sentence_metadata.tsv'):
        """Load both tables and precompute the shared columns and totals.

        Args:
            data_file: Path of the tab-separated feature matrix.
            metadata_file: Path of the tab-separated sentence metadata.
        """
        self.data_file = data_file
        self.metadata_file = metadata_file
        self.df = load_data(self.data_file)  # Cached data loading
        self.metadata_df = load_data(self.metadata_file)

        # Column groups are resolved once; every chart method reuses them.
        self.indicator_columns = [col for col in self.df.columns if col.startswith('indicator_')]
        self.cause_columns = [col for col in self.df.columns if col.startswith('cause_')]

        # First four characters of the stringified date serve as the year.
        self.df['Year'] = self.df['text_date'].astype(str).str[:4]
        self.df['Has_Indicator'] = self.df[self.indicator_columns].sum(axis=1) > 0

        # Precompute totals for faster use in chart functions
        self.total_sentences_per_year = self.df.groupby(['Year', 'subfolder']).size().reset_index(name='Total Sentences')
        self.total_sentences_per_subfolder = self.df.groupby('subfolder').size().reset_index(name='Total Sentences')

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _strip_prefix(label, prefix):
        """Return *label* without a leading *prefix*; other labels pass through unchanged."""
        return label[len(prefix):] if label.startswith(prefix) else label

    @classmethod
    def _active_labels(cls, frame, columns, prefix):
        """Return, for each row of *frame*, the comma-joined names of *columns* whose value is > 0.

        Column names are shown without their leading *prefix*. Missing values
        compare as "not > 0" and are therefore skipped.
        """
        columns = list(columns)
        names = [cls._strip_prefix(col, prefix) for col in columns]
        active = frame[columns].to_numpy() > 0
        return [', '.join(name for name, on in zip(names, row) if on) for row in active]

    @staticmethod
    def _hex_to_rgba(hex_color, opacity):
        """Convert a ``#RRGGBB`` colour into an ``rgba(r, g, b, opacity)`` string."""
        red, green, blue = (int(hex_color[start:start + 2], 16) for start in (1, 3, 5))
        return f'rgba({red}, {green}, {blue}, {opacity})'

    @staticmethod
    def _format_share(share):
        """Format a Series of fractions as percentage strings rounded to two decimals."""
        return (share * 100).round(2).astype(str) + '%'

    @staticmethod
    def _style_bar_text(fig, template):
        """Apply the common white, centred, inside-bar text style to every trace of *fig*."""
        fig.update_traces(
            texttemplate=template,
            textposition='inside',
            insidetextanchor='middle',
            textfont=dict(color='rgb(255, 255, 255)')
        )

    # ------------------------------------------------------------------
    # Data preparation (pandas only, no plotting)
    # ------------------------------------------------------------------
    def _indicator_share_by_subfolder(self):
        """Return per-subfolder totals, indicator counts and the indicator share.

        Starts from the per-subfolder totals so that subfolders without any
        indicator sentence are kept with a share of 0 instead of vanishing.
        """
        indicator_counts = (
            self.df[self.df['Has_Indicator']]
            .groupby('subfolder').size().reset_index(name='Indicator Count')
        )
        summary = self.total_sentences_per_subfolder.merge(indicator_counts, on='subfolder', how='left')
        summary['Indicator Count'] = summary['Indicator Count'].fillna(0).astype(int)
        summary['Indicator_Share'] = summary['Indicator Count'] / summary['Total Sentences']
        summary['Indicator_Share_Text'] = self._format_share(summary['Indicator_Share'])
        return summary

    def _indicator_share_by_year(self):
        """Return per (Year, subfolder) totals with the indicator share as text.

        Groups without any indicator sentence get a count of 0, so their label
        reads ``0.0%`` rather than ``nan%``.
        """
        indicator_counts = (
            self.df[self.df['Has_Indicator']]
            .groupby(['Year', 'subfolder']).size().reset_index(name='Indicator Count')
        )
        summary = self.total_sentences_per_year.merge(indicator_counts, on=['Year', 'subfolder'], how='left')
        summary['Indicator Count'] = summary['Indicator Count'].fillna(0).astype(int)
        summary['Indicator_Share_Text'] = self._format_share(
            summary['Indicator Count'] / summary['Total Sentences']
        )
        return summary

    def _indicator_occurrences(self, individual_threshold):
        """Count rows per (subfolder, indicator) for indicators meeting the overall threshold."""
        melted = self.df.melt(
            id_vars=['subfolder'], value_vars=self.indicator_columns,
            var_name='Indicator', value_name='Count'
        )
        melted = melted[melted['Count'] > 0]

        # The threshold applies to an indicator's total across all subfolders.
        totals = melted['Indicator'].value_counts()
        frequent = totals[totals >= individual_threshold].index
        melted = melted[melted['Indicator'].isin(frequent)]

        # ``assign`` builds a new frame instead of writing into a filtered slice.
        melted = melted.assign(
            Indicator=melted['Indicator'].map(
                lambda name: self._strip_prefix(name, 'indicator_').capitalize()
            )
        )
        return melted.groupby(['subfolder', 'Indicator']).size().reset_index(name='Count')

    def _cause_occurrences(self, min_value):
        """Count metadata rows per (subfolder, cause) for causes occurring at least *min_value* times."""
        causes = self.metadata_df['cause']
        # With read_csv's default NA parsing the literal 'N/A' arrives as NaN,
        # so missing causes are excluded explicitly as well.
        labelled = self.metadata_df[causes.notna() & (causes != 'N/A')]

        counts = labelled['cause'].value_counts()
        frequent = counts[counts >= min_value].index
        kept = labelled[labelled['cause'].isin(frequent)]
        kept = kept.assign(cause=kept['cause'].str.capitalize())
        return kept.groupby(['subfolder', 'cause']).size().reset_index(name='Count')

    def _scatter_inputs(self, include_modality):
        """Select the rows/features for the UMAP projection and build the hover metadata.

        Returns:
            A ``(features, metadata)`` pair with the same number of rows:
            ``features`` holds the numeric feature columns (NaN replaced by 0),
            ``metadata`` holds ``subfolder``, ``indicator``, ``cause`` and, when
            *include_modality* is true, ``Modality``.
        """
        has_cause = self.df[self.cause_columns].sum(axis=1) > 0
        rows = self.df[self.df['Has_Indicator'] | has_cause]

        # Exclude the special indicator and keep only sufficiently frequent ones.
        candidates = [col for col in self.indicator_columns if self._SCATTER_EXCLUDED_INDICATOR not in col]
        indicator_counts = rows[candidates].sum()
        indicators_to_keep = indicator_counts[indicator_counts >= self._SCATTER_MIN_INDICATOR_COUNT].index.tolist()
        rows = rows[rows[indicators_to_keep].sum(axis=1) > 0]

        # Exclude non-feature columns for dimensionality reduction
        modality_columns = [col for col in self.df.columns if col.startswith('modality_')]
        columns_to_drop = ['subfolder', 'text_id', 'sentence_id', 'text_date', 'text_source', 'text_text_type']
        if not include_modality:
            columns_to_drop += modality_columns
        features = (
            rows.drop(columns=columns_to_drop, errors='ignore')
            .select_dtypes(include=[float, int])
            .fillna(0)
        )

        metadata = rows[['subfolder']].reset_index(drop=True)
        metadata['indicator'] = self._active_labels(rows, indicators_to_keep, 'indicator_')
        metadata['cause'] = self._active_labels(rows, self.cause_columns, 'cause_')
        if include_modality:
            # The hover entry 'Modality' needs a real column in the plotted frame.
            usable = rows[modality_columns].select_dtypes(include=['number', 'bool']).columns.tolist()
            metadata['Modality'] = self._active_labels(rows, usable, 'modality_')
        return features, metadata

    def _sankey_links(self, cause_threshold, indicator_threshold):
        """Compute node labels and link lists for the Sankey diagram.

        Co-occurrence is evaluated per ``text_id``: within one text each cause
        and each indicator is counted once.

        Returns:
            ``(labels, sources, targets, values)`` where ``labels`` are the
            display names (causes, then indicators, then subfolders) and the
            other three are parallel lists of node indices / link weights.
        """
        has_cause = self.df[self.cause_columns].sum(axis=1) > 0
        both = self.df[has_cause & self.df['Has_Indicator']]
        id_vars = ['text_id', 'subfolder']

        def melt_active(columns, var_name):
            melted = both[id_vars + list(columns)].melt(
                id_vars=id_vars, var_name=var_name, value_name='count'
            )
            melted = melted[melted['count'] > 0]
            return melted.drop_duplicates(['text_id', var_name])

        cause_data = melt_active(self.cause_columns, 'cause')
        indicator_data = melt_active(self.indicator_columns, 'indicator')

        # Apply threshold filters
        cause_totals = cause_data['cause'].value_counts()
        indicator_totals = indicator_data['indicator'].value_counts()
        valid_causes = cause_totals[cause_totals >= cause_threshold].index
        valid_indicators = indicator_totals[indicator_totals >= indicator_threshold].index
        cause_data = cause_data[cause_data['cause'].isin(valid_causes)]
        indicator_data = indicator_data[indicator_data['indicator'].isin(valid_indicators)]

        cause_indicator_links = (
            cause_data.merge(indicator_data, on=id_vars)
            .groupby(['cause', 'indicator']).size().reset_index(name='count')
        )
        indicator_subfolder_links = (
            indicator_data.groupby(['indicator', 'subfolder']).size().reset_index(name='count')
        )

        # Node order: causes, indicators, subfolders. Indices are keyed by the
        # raw (prefixed) names; only the display labels lose their prefix.
        subfolders = self.df['subfolder'].unique().tolist()
        raw_labels = list(valid_causes) + list(valid_indicators) + subfolders
        label_to_index = {label: idx for idx, label in enumerate(raw_labels)}
        labels = (
            [self._strip_prefix(name, 'cause_') for name in valid_causes]
            + [self._strip_prefix(name, 'indicator_') for name in valid_indicators]
            + subfolders
        )

        sources = (
            [label_to_index[name] for name in cause_indicator_links['cause']]
            + [label_to_index[name] for name in indicator_subfolder_links['indicator']]
        )
        targets = (
            [label_to_index[name] for name in cause_indicator_links['indicator']]
            + [label_to_index[name] for name in indicator_subfolder_links['subfolder']]
        )
        values = cause_indicator_links['count'].tolist() + indicator_subfolder_links['count'].tolist()
        return labels, sources, targets, values

    # ------------------------------------------------------------------
    # Public chart builders
    # ------------------------------------------------------------------
    def get_indicator_chart(self, chart_type='total', individual_threshold=5):
        """Build a bar chart about indicator sentences.

        Args:
            chart_type: ``'total'`` (share of indicator sentences per
                subfolder), ``'individual'`` (occurrences per indicator and
                subfolder) or ``'year'`` (sentences per year, labelled with
                the indicator share).
            individual_threshold: Minimum overall occurrence count for an
                indicator to appear in the ``'individual'`` chart.

        Returns:
            The Plotly figure.

        Raises:
            ValueError: If *chart_type* is not one of the supported values.
        """
        if chart_type not in self._CHART_TYPES:
            raise ValueError(
                f"Unknown chart_type {chart_type!r}; expected one of {self._CHART_TYPES}"
            )

        palette = px.colors.qualitative.D3

        if chart_type == 'total':
            fig = px.bar(
                self._indicator_share_by_subfolder(),
                x='subfolder',
                y='Indicator_Share',
                labels={'Indicator_Share': 'Share of Sentences with Indicators', 'subfolder': ''},
                color='subfolder',
                text='Indicator_Share_Text',
                color_discrete_sequence=palette
            )
            self._style_bar_text(fig, '%{text}')
        elif chart_type == 'individual':
            fig = px.bar(
                self._indicator_occurrences(individual_threshold),
                x='subfolder',
                y='Count',
                color='Indicator',
                barmode='group',
                labels={'Count': 'Occurrences', 'subfolder': '', 'Indicator': 'Indicator'},
                color_discrete_sequence=palette
            )
            self._style_bar_text(fig, '%{y}')
        else:  # 'year'
            fig = px.bar(
                self._indicator_share_by_year(),
                x='Year',
                y='Total Sentences',
                color='subfolder',
                labels={'Total Sentences': 'Total Number of Sentences', 'Year': 'Year'},
                text='Indicator_Share_Text',
                color_discrete_sequence=palette
            )
            self._style_bar_text(fig, '%{text}')

        fig.update_layout(
            xaxis=dict(showline=True),
            yaxis=dict(title='Indicator Sentences' if chart_type != 'year' else 'Total Sentences'),
            bargap=0.05,
            showlegend=(chart_type != 'total')
        )
        return fig

    def get_causes_chart(self, min_value=30):
        """Build a grouped bar chart of cause occurrences per subfolder.

        Args:
            min_value: Minimum number of metadata rows a cause needs (across
                all subfolders) to be shown.

        Returns:
            The Plotly figure.
        """
        fig = px.bar(
            self._cause_occurrences(min_value),
            x='subfolder',
            y='Count',
            color='cause',
            barmode='group',
            labels={'Count': 'Occurrences', 'subfolder': '', 'cause': 'Cause'},
            color_discrete_sequence=px.colors.qualitative.D3
        )
        fig.update_layout(xaxis=dict(showline=True), yaxis=dict(showticklabels=True, title=''))
        self._style_bar_text(fig, '%{y}')
        return fig

    def scatter(self, include_modality=False):
        """Build a 2-D UMAP scatter plot of the sentences' feature vectors.

        Args:
            include_modality: When true, ``modality_*`` columns are kept as
                features and the active modalities are shown on hover.

        Returns:
            The Plotly figure (colour = subfolder, symbol = active indicators).

        Raises:
            ValueError: If no rows or no numeric features remain after filtering.
        """
        features, metadata = self._scatter_inputs(include_modality)
        if features.empty:
            raise ValueError('No sentences or numeric features left for the scatter plot after filtering')

        # Perform UMAP dimensionality reduction
        reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
        embedding = reducer.fit_transform(features)
        df_reduced = pd.concat(
            [pd.DataFrame(embedding, columns=['UMAP x', 'UMAP y']), metadata],
            axis=1
        )

        hover_data = {'cause': True, 'UMAP x': False, 'UMAP y': False}
        if 'Modality' in df_reduced.columns:
            hover_data['Modality'] = True

        fig = px.scatter(
            df_reduced,
            x='UMAP x',
            y='UMAP y',
            color='subfolder',
            symbol='indicator',
            labels={'subfolder': 'Effect'},
            hover_data=hover_data,
            color_discrete_sequence=px.colors.qualitative.D3
        )
        fig.update_layout(
            xaxis=dict(showgrid=True),
            yaxis=dict(showgrid=True),
            showlegend=True,
            legend=dict(title="Effect, Indicator", yanchor="top", xanchor="left", borderwidth=1),
        )
        return fig

    def sankey(self, cause_threshold=10, indicator_threshold=5, link_opacity=0.4):
        """Build a Sankey diagram cause -> indicator -> subfolder.

        Args:
            cause_threshold: Minimum number of texts a cause must occur in.
            indicator_threshold: Minimum number of texts an indicator must occur in.
            link_opacity: Alpha value used for the link colours.

        Returns:
            The Plotly figure.
        """
        labels, sources, targets, values = self._sankey_links(cause_threshold, indicator_threshold)

        # Nodes cycle through the D3 palette; each link inherits its source's colour.
        palette = px.colors.qualitative.D3
        node_colors = [palette[idx % len(palette)] for idx in range(len(labels))]
        link_colors = [self._hex_to_rgba(node_colors[src], link_opacity) for src in sources]

        fig = go.Figure(data=[go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="black", width=0.5),
                label=labels,
                color=node_colors
            ),
            link=dict(
                source=sources,
                target=targets,
                value=values,
                color=link_colors
            )
        )])
        fig.update_layout(
            autosize=False,
            width=800,
            height=600,
            font=dict(size=10)
        )
        return fig