File size: 13,758 Bytes
45d0933
 
9525dec
45d0933
 
7e996cc
 
 
 
 
 
 
 
 
 
 
 
45d0933
7e996cc
 
 
45d0933
7e996cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adb4a34
7e996cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adb4a34
7e996cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adb4a34
7e996cc
 
 
 
 
 
 
 
45d0933
7e996cc
45d0933
7e996cc
 
 
 
 
45d0933
 
7e996cc
 
45d0933
7e996cc
45d0933
7e996cc
45d0933
 
7e996cc
45d0933
 
 
adb4a34
7e996cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adb4a34
7e996cc
 
 
adb4a34
7e996cc
 
 
 
 
adb4a34
 
7e996cc
 
 
 
 
45d0933
 
7e996cc
 
 
 
 
9525dec
 
7e996cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9525dec
7e996cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import umap
import streamlit as st

@st.cache_data
def load_data(file_path):
    """Read a tab-separated file into a pandas DataFrame.

    The function is wrapped in Streamlit's ``st.cache_data`` decorator, so
    the parsing is delegated to Streamlit's data cache.

    Args:
        file_path: Location of the TSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    table = pd.read_csv(file_path, sep='\t')
    return table

class Plot:
    def __init__(self, data_file='data/feature_matrix.tsv', metadata_file='data/indicator_cause_sentence_metadata.tsv'):
        """Load both input tables and derive the columns the charts share.

        Args:
            data_file: Path of the tab-separated feature matrix.
            metadata_file: Path of the tab-separated sentence metadata.
        """
        self.data_file = data_file
        self.metadata_file = metadata_file

        # Both tables go through the module-level ``load_data`` helper.
        self.df = load_data(data_file)
        self.metadata_df = load_data(metadata_file)

        # Feature groups are recognised purely by their column-name prefix.
        column_names = list(self.df.columns)
        self.indicator_columns = [name for name in column_names if name.startswith('indicator_')]
        self.cause_columns = [name for name in column_names if name.startswith('cause_')]

        # 'Year' is the first four characters of the stringified date.
        self.df['Year'] = self.df['text_date'].astype(str).str[:4]

        # A sentence "has an indicator" when its indicator columns sum above 0.
        indicator_row_sums = self.df[self.indicator_columns].sum(axis=1)
        self.df['Has_Indicator'] = indicator_row_sums > 0

        # Row counts reused by several chart methods.
        rows_per_year = self.df.groupby(['Year', 'subfolder']).size()
        self.total_sentences_per_year = rows_per_year.reset_index(name='Total Sentences')

        rows_per_subfolder = self.df.groupby('subfolder').size()
        self.total_sentences_per_subfolder = rows_per_subfolder.reset_index(name='Total Sentences')

    def get_indicator_chart(self, chart_type='total', individual_threshold=5):
        if chart_type == 'total':
            # Summarize indicator share per subfolder
            indicator_counts = self.df[self.df['Has_Indicator']].groupby('subfolder').size().reset_index(name='Indicator Count')
            total_counts = indicator_counts.merge(self.total_sentences_per_subfolder, on='subfolder')
            total_counts['Indicator_Share'] = total_counts['Indicator Count'] / total_counts['Total Sentences']
            total_counts['Indicator_Share_Text'] = (total_counts['Indicator_Share'] * 100).round(2).astype(str) + '%'

            fig = px.bar(
                total_counts,
                x='subfolder',
                y='Indicator_Share',
                labels={'Indicator_Share': 'Share of Sentences with Indicators', 'subfolder': ''},
                color='subfolder',
                text='Indicator_Share_Text',
                color_discrete_sequence=px.colors.qualitative.D3
            )
            fig.update_traces(
                textposition='inside',
                insidetextanchor='middle',
                texttemplate='%{text}',
                textfont=dict(color='rgb(255, 255, 255)')
            )

        elif chart_type == 'individual':
            # Melt the dataframe to long format
            df_melted = self.df.melt(id_vars=['subfolder'], value_vars=self.indicator_columns, var_name='Indicator', value_name='Count')
            df_melted = df_melted[df_melted['Count'] > 0]

            # Group by Indicator only to calculate total counts across all subfolders
            total_indicator_counts = df_melted.groupby('Indicator').size().reset_index(name='Total Count')
            indicators_meeting_threshold = total_indicator_counts[total_indicator_counts['Total Count'] >= individual_threshold]['Indicator'].unique()

            # Filter df_melted to include only indicators that meet the threshold overall
            df_melted = df_melted[df_melted['Indicator'].isin(indicators_meeting_threshold)]
            df_melted['Indicator'] = df_melted['Indicator'].str.replace('indicator_', '').str.capitalize()

            # Re-aggregate counts by subfolder and indicator for the filtered indicators
            df_melted = df_melted.groupby(['subfolder', 'Indicator']).size().reset_index(name='Count')

            # Create the bar chart
            fig = px.bar(
                df_melted,
                x='subfolder', 
                y='Count',
                color='Indicator',
                barmode='group',
                labels={'Count': 'Occurrences', 'subfolder': '', 'Indicator': 'Indicator'},
                color_discrete_sequence=px.colors.qualitative.D3
            )
            fig.update_traces(
                texttemplate='%{y}',
                textposition='inside',
                insidetextanchor='middle',
                textfont=dict(color='rgb(255, 255, 255)')
            )

        elif chart_type == 'year':
            indicator_counts_per_year = self.df[self.df['Has_Indicator']].groupby(['Year', 'subfolder']).size().reset_index(name='Indicator Count')
            df_summary = pd.merge(self.total_sentences_per_year, indicator_counts_per_year, on=['Year', 'subfolder'], how='left')
            df_summary['Indicator_Share_Text'] = (df_summary['Indicator Count'] / df_summary['Total Sentences'] * 100).round(2).astype(str) + '%'

            fig = px.bar(
                df_summary,
                x='Year',
                y='Total Sentences',
                color='subfolder',
                labels={'Total Sentences': 'Total Number of Sentences', 'Year': 'Year'},
                text='Indicator_Share_Text',
                color_discrete_sequence=px.colors.qualitative.D3
            )
            fig.update_traces(
                textposition='inside',
                texttemplate='%{text}',
                insidetextanchor='middle',
                textfont=dict(color='rgb(255, 255, 255)')
            )

        fig.update_layout(
            xaxis=dict(showline=True),
            yaxis=dict(title='Indicator Sentences' if chart_type != 'year' else 'Total Sentences'),
            bargap=0.05,
            showlegend=(chart_type != 'total')
        )
        return fig

    def get_causes_chart(self, min_value=30):
        """Build a grouped bar chart of cause frequencies per subfolder.

        Args:
            min_value: A cause is shown only when it occurs in at least this
                many metadata rows overall. The threshold is applied before
                the labels are capitalised.

        Returns:
            The configured Plotly figure.
        """
        cause_values = self.metadata_df['cause']

        # Rows without a cause are excluded. The literal 'N/A' is checked as
        # before; missing values are dropped explicitly as well, because
        # pandas' default CSV parsing converts 'N/A' into NaN, in which case
        # the string comparison alone would not filter anything.
        labelled = self.metadata_df[cause_values.notna() & (cause_values != 'N/A')]

        cause_frequency = labelled['cause'].value_counts()
        frequent_causes = cause_frequency[cause_frequency >= min_value].index
        selected = labelled[labelled['cause'].isin(frequent_causes)]

        # ``assign`` returns a new frame, so no value is written into a
        # filtered slice (avoids pandas' SettingWithCopyWarning).
        selected = selected.assign(cause=selected['cause'].str.capitalize())

        counts = selected.groupby(['subfolder', 'cause']).size().reset_index(name='Count')

        fig = px.bar(
            counts,
            x='subfolder',
            y='Count',
            color='cause',
            barmode='group',
            labels={'Count': 'Occurrences', 'subfolder': '', 'cause': 'Cause'},
            color_discrete_sequence=px.colors.qualitative.D3
        )
        fig.update_layout(xaxis=dict(showline=True), yaxis=dict(showticklabels=True, title=''))
        # Print each bar's height inside the bar in white.
        fig.update_traces(
            texttemplate='%{y}',
            textposition='inside',
            insidetextanchor='middle',
            textfont=dict(color='rgb(255, 255, 255)')
        )
        return fig

    def scatter(self, include_modality=False):
        """Project the feature rows to 2-D with UMAP and draw a scatter plot.

        Rows are kept when they have at least one indicator or cause, and
        additionally at least one of the indicators that occur 10 or more
        times in that selection (``indicator_!besprechen`` is never counted).
        Points are coloured by subfolder and use one marker symbol per
        indicator combination.

        Args:
            include_modality: When true, columns starting with ``modality_``
                stay in the UMAP input and the active modalities of each
                point are added to the hover text.

        Returns:
            The configured Plotly figure.

        Raises:
            ValueError: If no row survives the filtering.
        """
        def _active_labels(frame, columns, prefix):
            # One comma-separated string per row listing the columns whose
            # value is above zero, with the column-name prefix removed.
            names = [column.replace(prefix, '') for column in columns]
            flags = (frame[columns] > 0).to_numpy()
            return [
                ', '.join(name for name, hit in zip(names, row_flags) if hit)
                for row_flags in flags
            ]

        has_indicator = self.df[self.indicator_columns].sum(axis=1) > 0
        has_cause = self.df[self.cause_columns].sum(axis=1) > 0
        df_filtered = self.df[has_indicator | has_cause]

        # Exclude specific indicators and filter based on count threshold
        indicator_columns = [col for col in self.indicator_columns if 'indicator_!besprechen' not in col]
        indicator_counts = df_filtered[indicator_columns].sum()
        indicators_to_keep = indicator_counts[indicator_counts >= 10].index.tolist()
        df_filtered = df_filtered[df_filtered[indicators_to_keep].sum(axis=1) > 0]

        if df_filtered.empty:
            # Fail with a clear message instead of an error from deep inside
            # the UMAP fit.
            raise ValueError('No sentences left after filtering; cannot compute the UMAP projection.')

        # Exclude non-feature columns for dimensionality reduction
        columns_to_drop = ['subfolder', 'text_id', 'sentence_id', 'text_date', 'text_source', 'text_text_type']
        if not include_modality:
            columns_to_drop += [col for col in self.df.columns if col.startswith('modality_')]

        features = df_filtered.drop(columns=columns_to_drop, errors='ignore').select_dtypes(include=[float, int])
        features_clean = features.fillna(0)

        # Prepare metadata for plotting
        metadata = df_filtered[['subfolder']].copy()
        metadata['indicator'] = _active_labels(df_filtered, indicators_to_keep, 'indicator_')
        metadata['cause'] = _active_labels(df_filtered, self.cause_columns, 'cause_')

        hover_data = {'cause': True, 'UMAP x': False, 'UMAP y': False}
        if include_modality:
            # The hover entry needs a real column in the plotted frame;
            # previously none existed and px.scatter rejected the figure.
            # NOTE(review): assumes "Modality" should list the numeric
            # modality_ features that are above zero - confirm intent.
            modality_columns = [col for col in features.columns if col.startswith('modality_')]
            metadata['Modality'] = _active_labels(df_filtered, modality_columns, 'modality_')
            hover_data['Modality'] = True

        # Perform UMAP dimensionality reduction
        reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=50, n_jobs=1, metric='cosine')
        reduced_features = reducer.fit_transform(features_clean)
        df_reduced = pd.DataFrame(reduced_features, columns=['UMAP x', 'UMAP y'])
        # The embedding has a fresh 0..n-1 index, so the metadata index is
        # reset to line the rows up positionally.
        df_reduced = pd.concat([df_reduced, metadata.reset_index(drop=True)], axis=1)

        fig = px.scatter(
            df_reduced,
            x='UMAP x',
            y='UMAP y',
            color='subfolder',
            symbol='indicator',
            labels={'subfolder': 'Effect'},
            hover_data=hover_data,
            color_discrete_sequence=px.colors.qualitative.D3
        )

        fig.update_layout(
            xaxis=dict(showgrid=True),
            yaxis=dict(showgrid=True),
            showlegend=True,
            legend=dict(title="Effect, Indicator", yanchor="top", xanchor="left", borderwidth=1),
        )

        return fig

    def sankey(self, cause_threshold=10, indicator_threshold=5, link_opacity=0.4):
        """Draw a cause -> indicator -> subfolder Sankey diagram.

        Only rows that have both a cause and an indicator are considered.
        Causes and indicators are de-duplicated per ``text_id`` before they
        are counted and thresholded.

        Args:
            cause_threshold: Minimum number of de-duplicated rows a cause
                needs in order to be shown.
            indicator_threshold: Same minimum for indicators.
            link_opacity: Alpha value used for the link colours.

        Returns:
            The configured Plotly figure.
        """
        # NOTE(review): de-duplication and the cause/indicator join work on
        # text_id, i.e. per text rather than per sentence - confirm that this
        # is the intended granularity.
        id_columns = ['text_id', 'subfolder']

        cause_present = self.df[self.cause_columns].sum(axis=1) > 0
        indicator_present = self.df[self.indicator_columns].sum(axis=1) > 0
        both_present = self.df[cause_present & indicator_present]

        def _to_long(value_columns, kind):
            # Long format with one row per (text_id, <kind>) that has a
            # positive value.
            melted = both_present[id_columns + value_columns].melt(
                id_vars=id_columns, var_name=kind, value_name='count'
            )
            positive = melted.query("count > 0")
            return positive.drop_duplicates(['text_id', kind])

        cause_data = _to_long(self.cause_columns, 'cause')
        indicator_data = _to_long(self.indicator_columns, 'indicator')

        # Keep only sufficiently frequent causes and indicators.
        cause_frequency = cause_data['cause'].value_counts()
        valid_causes = cause_frequency[cause_frequency >= cause_threshold].index
        indicator_frequency = indicator_data['indicator'].value_counts()
        valid_indicators = indicator_frequency[indicator_frequency >= indicator_threshold].index

        cause_data = cause_data[cause_data['cause'].isin(valid_causes)]
        indicator_data = indicator_data[indicator_data['indicator'].isin(valid_indicators)]

        # Pair every cause with every indicator found under the same
        # text_id/subfolder and count the pairs.
        paired = cause_data.merge(indicator_data, on=id_columns)
        cause_indicator_links = paired.groupby(['cause', 'indicator']).size().reset_index(name='count')

        # Rows per indicator and subfolder.
        indicator_subfolder_links = (
            indicator_data.groupby(['indicator', 'subfolder']).size().reset_index(name='count')
        )

        # Node order: causes, then indicators, then every subfolder in the data.
        all_labels = list(valid_causes) + list(valid_indicators) + self.df['subfolder'].unique().tolist()
        label_to_index = {}
        for position, label in enumerate(all_labels):
            label_to_index[label] = position

        # Displayed labels drop the column-name prefixes.
        all_labels_cleaned = [label.replace("cause_", "").replace("indicator_", "") for label in all_labels]

        # Node colours cycle through Plotly's D3 palette.
        color_palette = px.colors.qualitative.D3
        palette_size = len(color_palette)
        node_colors = [color_palette[position % palette_size] for position in range(len(all_labels))]

        def hex_to_rgba(hex_color, opacity):
            # '#RRGGBB' -> 'rgba(R, G, B, opacity)'
            red = int(hex_color[1:3], 16)
            green = int(hex_color[3:5], 16)
            blue = int(hex_color[5:], 16)
            return f'rgba({red}, {green}, {blue}, {opacity})'

        sources, targets, values, link_colors = [], [], [], []

        def _append_links(link_table, source_column, target_column):
            # Translate labels to node indices; a link takes the colour of
            # its source node.
            rows = zip(link_table[source_column], link_table[target_column], link_table['count'])
            for source_label, target_label, weight in rows:
                if source_label not in label_to_index or target_label not in label_to_index:
                    continue
                source_idx = label_to_index[source_label]
                sources.append(source_idx)
                targets.append(label_to_index[target_label])
                values.append(weight)
                link_colors.append(hex_to_rgba(node_colors[source_idx], link_opacity))

        _append_links(cause_indicator_links, 'cause', 'indicator')
        _append_links(indicator_subfolder_links, 'indicator', 'subfolder')

        node_spec = dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_labels_cleaned,
            color=node_colors
        )
        link_spec = dict(
            source=sources,
            target=targets,
            value=values,
            color=link_colors
        )
        fig = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])

        fig.update_layout(
            autosize=False,
            width=800,
            height=600,
            font=dict(size=10)
        )

        return fig