AIEcosystem committed on
Commit
cb73c3a
·
verified ·
1 Parent(s): 7d708f9

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +890 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,892 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ import os
2
+ os.environ['HF_HOME'] = '/tmp'
3
+ import time
4
  import streamlit as st
5
+ import streamlit.components.v1 as components
6
+ import pandas as pd
7
+ import io
8
+ import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ import numpy as np
11
+ import re
12
+ import string
13
+ import json
14
+ from itertools import cycle
15
+ # --- PPTX Imports (Note: pptx must be installed via 'pip install python-pptx') ---
16
+ from io import BytesIO
17
+ import plotly.io as pio
18
+ # ---------------------------
19
+ # --- Stable Scikit-learn LDA Imports ---
20
+ from sklearn.feature_extraction.text import TfidfVectorizer
21
+ from sklearn.decomposition import LatentDirichletAllocation
22
+ # ------------------------------
23
+ from gliner import GLiNER
24
+ from streamlit_extras.stylable_container import stylable_container
25
+
26
+ # Using a try/except for comet_ml import
27
try:
    from comet_ml import Experiment
except ImportError:
    class Experiment:
        """No-op stand-in used when the optional ``comet_ml`` package is missing.

        Every method accepts arbitrary positional and keyword arguments and
        does nothing, so logging calls written for the real client keep
        working (and stay silent) without the dependency.
        """

        def __init__(self, *args, **kwargs):
            pass

        def log_parameter(self, *args, **kwargs):
            pass

        def log_table(self, *args, **kwargs):
            pass

        def end(self):
            pass
35
+
36
+ # --- Model Home Directory (Fix for deployment environments) ---
37
+ os.environ['HF_HOME'] = '/tmp'
38
+
39
+ # --- Fixed Label Definitions and Mappings (Used as Fallback) ---
40
+ FIXED_LABELS = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
41
+ FIXED_ENTITY_COLOR_MAP = {
42
+ "person": "#10b981", # Green
43
+ "country": "#3b82f6", # Blue
44
+ "city": "#4ade80", # Light Green
45
+ "organization": "#f59e0b", # Orange
46
+ "date": "#8b5cf6", # Purple
47
+ "time": "#ec4899", # Pink
48
+ "cardinal": "#06b6d4", # Cyan
49
+ "money": "#f43f5e", # Red
50
+ "position": "#a855f7", # Violet
51
+ }
52
+
53
+ # --- Fixed Category Mapping ---
54
+ FIXED_CATEGORY_MAPPING = {
55
+ "People & Roles": ["person", "organization", "position"],
56
+ "Locations": ["country", "city"],
57
+ "Time & Dates": ["date", "time"],
58
+ "Numbers & Finance": ["money", "cardinal"]}
59
+ REVERSE_FIXED_CATEGORY_MAPPING = {label: category for category, label_list in FIXED_CATEGORY_MAPPING.items() for label in label_list}
60
+
61
+ # --- Dynamic Color Generator for Custom Labels ---
62
+ # Use Plotly's Alphabet set for a large pool of distinct colors
63
+ COLOR_PALETTE = cycle(px.colors.qualitative.Alphabet)
64
+
65
def extract_label(node_name):
    """Return the label held in the trailing parentheses of ``node_name``.

    A node string looks like ``'Text (Label)'``. When the string does not end
    with a parenthesised group, ``"Unknown"`` is returned instead.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found is None:
        return "Unknown"
    return found.group(1)
69
+
70
def remove_trailing_punctuation(text_string):
    """Strip every ASCII punctuation character from the right end of the string."""
    strippable = string.punctuation
    cleaned = text_string.rstrip(strippable)
    return cleaned
73
+
74
def get_dynamic_color_map(active_labels, fixed_map):
    """Build a ``label -> colour`` mapping for the labels currently in use.

    Args:
        active_labels: Labels to colour (the fixed list or user-defined ones).
        fixed_map: Predefined ``label -> colour`` mapping that takes priority.

    Returns:
        A new dict. When ``active_labels`` equals ``FIXED_LABELS`` it is a copy
        of ``fixed_map``; otherwise each label gets its fixed colour when one
        exists, or the next colour from Plotly's Alphabet palette.
    """
    # Return a copy so callers cannot mutate the shared fixed mapping.
    if active_labels == FIXED_LABELS:
        return dict(fixed_map)

    # A fresh palette iterator per call keeps the assignment deterministic:
    # the same label list always yields the same colours, instead of depending
    # on how far a shared module-level cycle has already been advanced.
    palette = cycle(px.colors.qualitative.Alphabet)
    color_map = {}
    for label in active_labels:
        if label in color_map:
            # Repeated label: keep its first colour, do not burn another one.
            continue
        if label in fixed_map:
            # Prefer the fixed colour when a custom label matches a fixed one.
            color_map[label] = fixed_map[label]
        else:
            color_map[label] = next(palette)
    return color_map
89
+
90
def highlight_entities(text, df_entities, entity_color_map):
    """Render ``text`` as HTML with every entity span highlighted and coloured.

    ``start`` and ``end`` in ``df_entities`` are character offsets into
    ``text``. The output is assembled left to right from escaped pieces, so
    characters such as ``<`` or ``&`` in the input cannot break (or inject
    into) the generated markup.

    Args:
        text: The full analysed document.
        df_entities: DataFrame with at least ``start``, ``end`` and ``label``.
        entity_color_map: ``label -> CSS colour``; unknown labels use black.

    Returns:
        ``text`` unchanged when ``df_entities`` is empty, otherwise an HTML
        ``<div>`` string containing the highlighted document.
    """
    import html  # local import: not part of the file-level imports

    if df_entities.empty:
        return text

    # Earliest span first; for equal starts the longest span wins, so an
    # enclosing entity is kept and the entities nested inside it are skipped.
    ordered = df_entities.sort_values(by=['start', 'end'], ascending=[True, False]).to_dict('records')

    pieces = []
    cursor = 0
    for entity in ordered:
        # Clamp the indices to the bounds of the full text.
        start = max(0, int(entity['start']))
        end = min(len(text), int(entity['end']))
        # Overlapping (nested) or empty spans cannot be wrapped without
        # corrupting markup that was already emitted, so they are skipped.
        if start < cursor or end <= start:
            continue

        pieces.append(html.escape(text[cursor:start], quote=False))
        label = entity['label']
        color = entity_color_map.get(label, '#000000')
        tooltip = html.escape(str(label))
        entity_html = html.escape(text[start:end], quote=False)
        # Span with background colour and the label as a tooltip.
        pieces.append(
            f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{tooltip}">{entity_html}</span>'
        )
        cursor = end

    pieces.append(html.escape(text[cursor:], quote=False))
    highlighted_text = "".join(pieces)
    # Use a div to mimic the Streamlit input box style for the report.
    return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
117
+
118
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """Run a small LDA topic model over the unique entity texts.

    Topic modeling is usually more effective on full sentences/paragraphs;
    here the extracted entity texts are used as the documents.

    Args:
        df_entities: DataFrame with a ``text`` column of entity strings.
        num_topics: Number of LDA components to fit.
        num_top_words: Upper bound on the words reported per topic (also
            capped by the number of unique documents).

    Returns:
        A DataFrame with ``Topic_ID``, ``Word`` and ``Weight`` columns, or
        ``None`` when there is too little input or fitting fails.
    """
    documents = df_entities['text'].unique().tolist()
    if len(documents) < 2:
        return None

    top_n = min(num_top_words, len(documents))

    def _fit_tfidf(max_df, min_df):
        """Fit a TF-IDF vectorizer; return ``(matrix, feature_names)`` or ``(None, [])``."""
        vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words='english', ngram_range=(1, 3))
        try:
            matrix = vectorizer.fit_transform(documents)
        except ValueError:
            # Raised when pruning leaves no vocabulary (or max_df < min_df for
            # tiny corpora). Report "no features" so the caller can fall back
            # instead of aborting the whole analysis.
            return None, []
        return matrix, vectorizer.get_feature_names_out()

    # Strict pruning first; relax it when that leaves too few features.
    tfidf, tfidf_feature_names = _fit_tfidf(0.95, 2)
    if len(tfidf_feature_names) < num_topics:
        tfidf, tfidf_feature_names = _fit_tfidf(1.0, 1)
        if len(tfidf_feature_names) < num_topics:
            return None

    try:
        lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1)
        lda.fit(tfidf)
    except Exception:
        # Best-effort feature: callers treat None as "no topic data".
        return None

    topic_data_list = []
    for topic_idx, topic in enumerate(lda.components_):
        # Indices of the top_n largest weights, highest first.
        top_words_indices = topic.argsort()[:-top_n - 1:-1]
        for i in top_words_indices:
            topic_data_list.append({
                'Topic_ID': f'Topic #{topic_idx + 1}',
                'Word': tfidf_feature_names[i],
                'Weight': topic[i],
            })
    return pd.DataFrame(topic_data_list)
158
+
159
def create_topic_word_bubbles(df_topic_data):
    """Build a Plotly bubble chart of the top words of every topic.

    Expects the ``Topic_ID`` / ``Word`` / ``Weight`` columns; returns ``None``
    when the frame has no rows, otherwise the configured figure.
    """
    column_aliases = {'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'}
    bubbles = df_topic_data.rename(columns=column_aliases)
    # Each word gets its own horizontal slot, taken from the row index.
    bubbles['x_pos'] = bubbles.index
    if bubbles.empty:
        return None

    axis_labels = {'x_pos': 'Entity/Word Index', 'weight': 'Word Weight', 'topic': 'Topic ID'}
    fig = px.scatter(
        bubbles,
        x='x_pos',
        y='weight',
        size='weight',
        color='topic',
        text='word',
        hover_name='word',
        size_max=40,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.qualitative.Bold,
        labels=axis_labels,
        custom_data=['word', 'weight', 'topic'],
    )

    hidden_x_axis = {'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False}
    fig.update_layout(
        xaxis_title="Entity/Word",
        yaxis_title="Word Weight",
        xaxis=hidden_x_axis,
        yaxis={'showgrid': True},
        showlegend=True,
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
    )

    hover_text = "<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<br>Topic: %{customdata[2]}<extra></extra>"
    fig.update_traces(
        textposition='middle center',
        textfont=dict(color='white', size=10),
        hovertemplate=hover_text,
        marker=dict(line=dict(width=1, color='DarkSlateGrey')),
    )
    return fig
189
+
190
def generate_network_graph(df, raw_text, entity_color_map):
    """Build a node plot of entities with edges for same-sentence co-occurrence.

    Args:
        df: Entity DataFrame with ``text``, ``label`` and ``score`` columns.
        raw_text: The analysed document, split into sentences to find edges.
        entity_color_map: ``label -> colour``; unknown labels use ``#cccccc``.

    Returns:
        A ``plotly.graph_objects.Figure``. With fewer than two unique
        (text, label) pairs it is an empty figure whose title says so.
    """
    from itertools import combinations  # local import: only ``cycle`` is imported at file level

    entity_counts = df['text'].value_counts().reset_index()
    entity_counts.columns = ['text', 'frequency']
    unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
    if unique_entities.shape[0] < 2:
        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")

    # Place the nodes on a jittered circle.
    num_nodes = len(unique_entities)
    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
    radius = 10
    unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
    unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)

    # One position per entity text. The same text can appear with several
    # labels, and ``to_dict('index')`` rejects a non-unique index, so edges
    # attach to the first occurrence of each text.
    pos_map = (
        unique_entities.drop_duplicates(subset='text')
        .set_index('text')[['x', 'y']]
        .to_dict('index')
    )

    # Lower-case each entity once instead of once per sentence.
    entity_texts = [(entity_text, entity_text.lower()) for entity_text in unique_entities['text'].unique()]

    edges = set()
    # Simple sentence tokenizer
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
    for sentence in sentences:
        sentence_lower = sentence.lower()
        # Note: this is an inexact but fast (substring) co-occurrence check.
        present = sorted({original for original, lowered in entity_texts if lowered in sentence_lower})
        # Sorted input makes every pair come out in canonical (a, b) order.
        edges.update(combinations(present, 2))

    edge_x = []
    edge_y = []
    for n1, n2 in edges:
        if n1 in pos_map and n2 in pos_map:
            # ``None`` breaks the line so each edge is drawn separately.
            edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
            edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])

    fig = go.Figure()
    edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines', name='Co-occurrence Edges', showlegend=False)
    fig.add_trace(edge_trace)

    fig.add_trace(go.Scatter(
        x=unique_entities['x'], y=unique_entities['y'], mode='markers+text', name='Entities', text=unique_entities['text'], textposition="top center", showlegend=False,
        marker=dict(
            # Marker size grows with how often the entity text occurs.
            size=unique_entities['frequency'] * 5 + 10,
            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
            line_width=1, line_color='black', opacity=0.9
        ),
        textfont=dict(size=10),
        customdata=unique_entities[['label', 'score', 'frequency']],
        hovertemplate=("<b>%{text}</b><br>Label: %{customdata[0]}<br>Score: %{customdata[1]:.2f}<br>Frequency: %{customdata[2]}<extra></extra>")
    ))

    # One invisible marker per label, in first-appearance order, purely to
    # populate the legend.
    for label in unique_entities['label'].unique():
        color = entity_color_map.get(label, '#cccccc')
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True))

    fig.update_layout(
        title='Entity Co-occurrence Network (Edges = Same Sentence)',
        showlegend=True, hovermode='closest',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        plot_bgcolor='#f9f9f9', paper_bgcolor='#f9f9f9',
        margin=dict(t=50, b=10, l=10, r=10), height=600
    )
    return fig
266
+
267
+ # --- CSV GENERATION FUNCTION ---
268
def generate_entity_csv(df):
    """Serialise the extracted entities to UTF-8 CSV inside an in-memory buffer.

    Only the text, label, category, score, start and end columns are exported,
    in that order and without the index. The returned ``BytesIO`` is positioned
    at the beginning, ready to be read.
    """
    export_columns = ['text', 'label', 'category', 'score', 'start', 'end']
    payload = df[export_columns].to_csv(index=False).encode('utf-8')
    return BytesIO(payload)
275
+ # -----------------------------------
276
+
277
+ # --- HTML REPORT GENERATION FUNCTION (MODIFIED FOR WHITE-LABEL) ---
278
def generate_html_report(df, text_input, elapsed_time, df_topic_data, entity_color_map, report_title="Entity and Topic Analysis Report", branding_html=""):
    """Assemble a standalone HTML report with all analysis results and charts.

    Args:
        df: Entity DataFrame with ``text``, ``label``, ``category``, ``score``,
            ``start`` and ``end`` columns.
        text_input: The analysed document.
        elapsed_time: Processing time in seconds, shown in the metadata box.
        df_topic_data: Topic-model output, or ``None``/empty when unavailable.
        entity_color_map: ``label -> colour`` used for highlights and the graph.
        report_title: Plain-text title; HTML-escaped before insertion.
        branding_html: Raw HTML inserted verbatim for white-labelling.

    Returns:
        The complete HTML document as a string.
    """
    import html  # local import: not part of the file-level imports

    # The title is plain text from the caller; branding_html is intentionally raw.
    safe_title = html.escape(str(report_title))

    # 1. Generate Visualizations (Plotly HTML).
    # Every chart is exported with include_plotlyjs='cdn', i.e. it carries its
    # own plotly.js <script> tag, so the page needs no extra loader in <head>.
    # 1a. Treemap
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.qualitative.Dark24
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')

    # 1b. Pie Chart
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis
    fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category', color_discrete_sequence=color_seq)
    fig_pie.update_layout(margin=dict(t=50, b=10))
    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')

    # 1c. Bar Chart (Category Count)
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=color_seq)
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
    bar_category_html = fig_bar_category.to_html(full_html=False, include_plotlyjs='cdn')

    # 1d. Bar Chart (Most Frequent Entities) - only entities seen more than once.
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
    if not repeating_entities.empty:
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', color='Entity', title='Top 10 Most Frequent Entities', color_discrete_sequence=px.colors.sequential.Viridis)
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')

    # 1e. Network Graph HTML - uses the same colour map as the highlights.
    network_fig = generate_network_graph(df, text_input, entity_color_map)
    network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')

    # 1f. Topic Charts HTML
    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
        else:
            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
    else:
        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'

    # 2. Highlighted text; tag the wrapper div so the report CSS applies to it.
    highlighted_text_html = highlight_entities(text_input, df, entity_color_map).replace("div style", "div class='highlighted-text' style")

    # 3. Entity table (pandas to HTML)
    entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
        classes='table table-striped',
        index=False
    )

    # 4. Construct the final HTML. Section 3.4 is emitted with the other
    # section-3 charts, before section 4, so the numbering reads in order.
    html_content = f"""<!DOCTYPE html><html lang="en"><head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <style>
        body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
        .container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
        h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
        h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
        h3 {{ color: #555; margin-top: 20px; }}
        .metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
        .chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
        table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
        table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        table th {{ background-color: #f0f0f0; }}
        .highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
    </style>
    </head>
    <body>
    <div class="container">
        <h1>{safe_title}</h1>
        <div class="metadata">
            {branding_html} <!-- CUSTOM BRANDING INSERTED HERE -->
            <p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
            <p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
        </div>
        <h2>1. Analyzed Text &amp; Extracted Entities</h2>
        <h3>Original Text with Highlighted Entities</h3>
        <div class="highlighted-text-container">
            {highlighted_text_html}
        </div>
        <h2>2. Full Extracted Entities Table
        </h2>
        {entity_table_html}
        <h2>3. Data Visualizations</h2>
        <h3>3.1 Entity Distribution Treemap</h3>
        <div class="chart-box">{treemap_html}</div>
        <h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
        <div class="chart-box">{pie_html}</div>
        <div class="chart-box">{bar_category_html}</div>
        <h3>3.3 Entity Relationship Map (Edges = Same Sentence)</h3>
        <div class="chart-box">{network_html}</div>
        <h3>3.4 Most Frequent Entities</h3>
        <div class="chart-box">{bar_freq_html}</div>
        <h2>4. Topic Modelling</h2>
        {topic_charts_html}
    </div>
    </body>
    </html>
    """
    return html_content
402
+ # -----------------------------------
403
+
404
+ # --- CHUNKING IMPLEMENTATION FOR LARGE TEXT ---
405
def chunk_text(text, max_chunk_size=1500):
    """Split ``text`` into chunks at paragraph/sentence boundaries.

    Args:
        text: The document to split.
        max_chunk_size: Target maximum chunk length in characters. A single
            segment longer than this is kept whole rather than cut mid-sentence.

    Returns:
        A list of ``(chunk, offset)`` tuples, where ``offset`` is the character
        position of the chunk's first character in ``text``.
    """
    # The capturing group keeps the separators in the result, so the chunks
    # concatenate back to the original text and offsets stay exact.
    segments = re.split(r'(\n\n|(?<=[.!?])\s+)', text)
    chunks = []
    current_chunk = ""
    current_offset = 0

    for segment in segments:
        if not segment:
            continue

        if current_chunk and len(current_chunk) + len(segment) > max_chunk_size:
            # Close the current chunk; the next one starts right after it.
            chunks.append((current_chunk, current_offset))
            current_offset += len(current_chunk)
            current_chunk = segment
        else:
            current_chunk += segment

    if current_chunk:
        chunks.append((current_chunk, current_offset))

    return chunks


def process_chunked_text(text, labels, model):
    """Run entity prediction chunk by chunk and map spans back to ``text``.

    Args:
        text: The full document.
        labels: Entity labels passed to ``model.predict_entities``.
        model: Object exposing ``predict_entities(chunk, labels)`` that returns
            a list of dicts with (at least) ``start`` and ``end`` offsets
            relative to the chunk.

    Returns:
        A flat list of the entity dicts from all chunks, with ``start`` and
        ``end`` shifted to be offsets into the original ``text``.
    """
    # Character budget per chunk handed to the model.
    MAX_CHUNK_CHARS = 3500

    chunks = chunk_text(text, max_chunk_size=MAX_CHUNK_CHARS)
    all_entities = []

    # The loop variable must not be named ``chunk_text``: that would make the
    # name local to this function and break the chunk_text() call above with
    # UnboundLocalError.
    for chunk, chunk_offset in chunks:
        chunk_entities = model.predict_entities(chunk, labels)

        # Offset the start and end indices to match the original document.
        for entity in chunk_entities:
            entity['start'] += chunk_offset
            entity['end'] += chunk_offset
            all_entities.append(entity)

    return all_entities
448
+ # -----------------------------------
449
+
450
+ # --- Page Configuration and Styling (No Sidebar) ---
451
+ st.set_page_config(layout="wide", page_title="NER & Topic Report App")
452
+
453
+ # --- Conditional Mobile Warning ---
454
+ st.markdown(
455
+ """
456
+ <style>
457
+ /* CSS Media Query: Only show the content inside this selector when the screen width is 600px or less (typical mobile size) */
458
+ @media (max-width: 600px) {
459
+ #mobile-warning-container {
460
+ display: block; /* Show the warning container */
461
+ background-color: #ffcccc; /* Light red/pink background */
462
+ color: #cc0000; /* Dark red text */
463
+ padding: 10px;
464
+ border-radius: 5px;
465
+ text-align: center;
466
+ margin-bottom: 20px;
467
+ font-weight: bold;
468
+ border: 1px solid #cc0000;
469
+ }
470
+ }
471
+ /* Hide the content by default (for larger screens) */
472
+ @media (min-width: 601px) {
473
+ #mobile-warning-container {
474
+ display: none; /* Hide the warning container on desktop */
475
+ }
476
+ }
477
+ /* --- FIX: Tab Label Colors for Visibility --- */
478
+ [data-testid="stConfigurableTabs"] button {
479
+ color: #333333 !important; /* Dark gray for inactive tabs */
480
+ background-color: #f0f0f0; /* Light gray background for inactive tabs */
481
+ border: 1px solid #cccccc;
482
+ }
483
+ /* Target the ACTIVE tab label */
484
+ [data-testid="stConfigurableTabs"] button[aria-selected="true"] {
485
+ color: #FFFFFF !important; /* White text for active tab */
486
+ background-color: #007bff; /* Blue background for active tab */
487
+ border-bottom: 2px solid #007bff; /* Optional: adds an accent line */
488
+ }
489
+ /* Expander header color fix (since you overwrote it to white) */
490
+ .streamlit-expanderHeader {
491
+ color: #007bff; /* Blue text for Expander header */
492
+ }
493
+ </style>
494
+ <div id="mobile-warning-container">
495
+ ⚠️ **Tip for Mobile Users:** For the best viewing experience of the charts and tables, please switch your browser to **"Desktop Site"** view.
496
+ </div>
497
+ """,
498
+ unsafe_allow_html=True)
499
+ # ----------------------------------
500
+ st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
501
+ # Removed st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") for white-labeling
502
+
503
+ tab1, tab2 = st.tabs(["Embed", "Important Notes"])
504
+ with tab1:
505
+ with st.expander("Embed"):
506
+ st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
507
+ code = '''
508
+ <iframe
509
+ src="https://aiecosystem-dataharvest.hf.space"
510
+ frameborder="0"
511
+ width="850"
512
+ height="450"
513
+ ></iframe>
514
+ '''
515
+ st.code(code, language="html")
516
+
517
+ with tab2:
518
+ expander = st.expander("**Important Notes**")
519
+ expander.markdown("""
520
+ **Named Entities (Fixed Mode):** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position".
521
+
522
+ **Custom Labels Mode:** You can define your own comma-separated labels (e.g., `product, symptom, client_id`) in the input box below.
523
+
524
+ **Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
525
+
526
+ **How to Use:** Type or paste your text into the text area below, then click the 'Results' button.
527
+ """)
528
+ st.markdown("For any errors or inquiries, please contact us at [info@your-company.com](mailto:info@your-company.com)") # Updated contact info
529
+
530
+ # --- Comet ML Setup (Placeholder/Conditional) ---
531
+ COMET_API_KEY = os.environ.get("COMET_API_KEY")
532
+ COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
533
+ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
534
+ comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
535
+
536
+ # --- Model Loading ---
537
@st.cache_resource
def load_ner_model(labels):
    """Load the GLiNER model, cached by Streamlit across script reruns.

    Args:
        labels: Entity labels passed to the model as generation constraints.

    Returns:
        The loaded GLiNER model. If loading fails, an error is shown in the
        app and the script run is stopped instead of returning.
    """
    try:
        # The model requires constraints (labels) to be passed during loading
        return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()
546
+
547
+ # --- LONG DEFAULT TEXT (178 Words) ---
548
+ DEFAULT_TEXT = (
549
+ "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
550
+ "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
551
+ "leap forward for commercial space technology across the entire **European Union**. The agreement, finalized "
552
+ "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
553
+ "software platform. This version of the **Astra** platform is critical for processing and managing the vast amounts of data being sent "
554
+ "back from the recent Mars rover mission. This project underscores the ESA's commitment to advancing "
555
+ "space capabilities within the **European Union**. The core team, including lead engineer Marcus Davies, will hold "
556
+ "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
557
+ "media platform X (under the username @TechCEO) was overwhelmingly positive, with many major tech "
558
+ "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
559
+ "end of the year, further strengthening the technological standing of the **European Union**. The platform is designed to be compatible with both Windows and Linux operating systems. "
560
+ "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
561
+ "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
562
+ "general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026.")
563
+ # -----------------------------------
564
+
565
# --- Session State Initialization ---
# Give each key below a default value in st.session_state, but only when the
# key is not already present, so values stored by earlier reruns are kept.
_SESSION_DEFAULTS = {
    'show_results': False,
    'last_text': "",
    'results_df': pd.DataFrame(),
    'elapsed_time': 0.0,
    'topic_results': None,
    'my_text_area': DEFAULT_TEXT,      # backs the main text area (widget key)
    'custom_labels_input': "",         # backs the custom-labels text area (widget key)
    'active_labels_list': FIXED_LABELS,
    'is_custom_mode': False,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
575
+
576
# --- Clear Button Function ---
def clear_text():
    """Blank the text area and discard the stored analysis results.

    Writes an empty string to the ``my_text_area`` session key and resets
    ``show_results``, ``last_text``, ``results_df``, ``elapsed_time`` and
    ``topic_results`` to their empty values. The custom-label input is not
    touched.
    """
    reset_values = {
        'my_text_area': "",
        'show_results': False,
        'last_text': "",
        'results_df': pd.DataFrame(),
        'elapsed_time': 0.0,
        'topic_results': None,
    }
    for state_key, blank_value in reset_values.items():
        st.session_state[state_key] = blank_value
585
+
586
# --- Text Input and Clear Button ---
word_limit = 10000  # maximum number of whitespace-separated words accepted
text = st.text_area(
    label=f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    key='my_text_area',
    height=250,
)
# Words are counted by splitting on whitespace.
word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")

# --- Custom Labels Input ---
custom_labels_text = st.text_area(
    label="**Optional:** Enter your own comma-separated entity labels here (e.g., `product, symptom, client_id`). Leave blank for default labels.",
    key='custom_labels_input',
    height=60,
    placeholder="e.g., product, symptom, client_id",
)

# Two equal-width columns keep the action buttons side by side.
col_results, col_clear = st.columns([1, 1])
run_button = col_results.button("Results", key='run_results', use_container_width=True)
col_clear.button("Clear text", on_click=clear_text, use_container_width=True)
610
+
611
# --- Results Trigger and Processing (with chunking for long inputs) ---
# Runs only on the rerun triggered by the "Results" button. Results and the
# cache key (last_text / last_active_labels) are written to session state
# together, and only after extraction succeeded, so a failure part-way through
# can never leave stale results marked as belonging to the new input.
if run_button:
    # 1. Determine active labels and mode.
    custom_labels_raw = st.session_state.custom_labels_input
    # Normalise to lower case, drop blanks, and de-duplicate while keeping the
    # order in which the user typed the labels.
    custom_labels_list = list(dict.fromkeys(
        label.strip().lower()
        for label in custom_labels_raw.split(',')
        if label.strip()
    ))

    if custom_labels_list:
        st.session_state.active_labels_list = custom_labels_list
        st.session_state.is_custom_mode = True
    else:
        st.session_state.active_labels_list = FIXED_LABELS
        st.session_state.is_custom_mode = False
        if custom_labels_raw.strip():
            # The user typed something (e.g. only commas) but no usable label.
            st.info("No valid custom labels found. Falling back to default fixed labels.")

    active_labels = st.session_state.active_labels_list

    # 2. Validate the input text before doing any model work.
    if not text.strip():
        st.warning("Please enter some text to extract entities.")
        st.session_state.show_results = False
    elif word_count > word_limit:
        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
        st.session_state.show_results = False
    else:
        # Texts longer than this many words are processed in chunks.
        CHUNKING_THRESHOLD = 500
        should_chunk = word_count > CHUNKING_THRESHOLD

        mode_msg = f"{'custom' if st.session_state.is_custom_mode else 'fixed'} labels"
        if should_chunk:
            mode_msg += " with **chunking** for large text"

        with st.spinner(f"Extracting entities using {mode_msg}...", show_time=True):
            # Re-run prediction only if the text or the active labels changed
            # since the last successful run.
            current_settings = (text, tuple(active_labels))
            last_settings = (
                st.session_state.last_text,
                tuple(st.session_state.get('last_active_labels', [])),
            )

            if current_settings != last_settings:
                start_time = time.time()

                # Load model using the determined active labels.
                model = load_ner_model(active_labels)

                # --- Model prediction (chunked for large texts) ---
                if should_chunk:
                    entities = process_chunked_text(text, active_labels, model)
                    st.info(f"Text was split into {len(chunk_text(text))} chunks for processing.")
                else:
                    entities = model.predict_entities(text, active_labels)

                elapsed_time = time.time() - start_time

                # --- DataFrame construction ---
                df = pd.DataFrame(entities)
                df_topic_data = None
                if not df.empty:
                    # Clean up entity text.
                    df['text'] = df['text'].apply(remove_trailing_punctuation)

                    # Map entities to categories.
                    if st.session_state.is_custom_mode:
                        # Custom labels are all grouped under a single category.
                        df['category'] = "User Defined Entities"
                    else:
                        # Fixed labels use the fixed mapping; unmapped labels become 'Other'.
                        df['category'] = df['label'].map(REVERSE_FIXED_CATEGORY_MAPPING).fillna('Other')

                    # One row per (text, label) pair, keeping the highest score.
                    df_unique_entities = (
                        df.sort_values('score', ascending=False)
                        .drop_duplicates(subset=['text', 'label'])
                    )

                    # --- Topic modeling on the unique entities ---
                    # Never request more topics than there are distinct entity texts.
                    df_topic_data = perform_topic_modeling(
                        df_unique_entities,
                        num_topics=min(3, len(df_unique_entities.text.unique())),
                        num_top_words=10,
                    )

                # Commit results and cache key in one place, after every step
                # above has succeeded.
                st.session_state.elapsed_time = elapsed_time
                st.session_state.results_df = df
                st.session_state.topic_results = df_topic_data
                st.session_state.last_text = text
                # Store a copy so later changes to the active list cannot
                # silently alter the remembered cache key.
                st.session_state['last_active_labels'] = list(active_labels)

            # Either fresh results were stored above, or the settings are
            # unchanged and the previous results are shown again.
            st.session_state.show_results = True
707
+
708
+
709
# --- Display Results and Downloads (with White-Label inputs) ---
# Renders everything derived from the stored results. The empty-result case is
# handled first: an empty results frame has no 'label' column, so any column
# access must happen only in the non-empty branch.
if st.session_state.show_results:
    df = st.session_state.results_df
    df_topic_data = st.session_state.topic_results

    if df.empty:
        st.warning("No entities were found in the provided text with the current label set.")
    else:
        # Generate the color map based on the labels present in the results.
        current_labels_in_df = df['label'].unique().tolist()
        entity_color_map = get_dynamic_color_map(current_labels_in_df, FIXED_ENTITY_COLOR_MAP)

        # Columns and number format shared by every detail table below.
        detail_columns = ['text', 'label', 'score', 'start', 'end']
        score_column_config = {'score': st.column_config.NumberColumn(format="%.4f")}

        st.subheader("Analysis Results", divider="blue")

        # 1. Highlighted Text
        st.markdown(f"### 1. Analyzed Text with Highlighted Entities ({'Custom Mode' if st.session_state.is_custom_mode else 'Fixed Mode'})")
        st.markdown(highlight_entities(st.session_state.last_text, df, entity_color_map), unsafe_allow_html=True)

        # 2. Detailed Entity Analysis Tabs
        st.markdown("### 2. Detailed Entity Analysis")
        tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])

        # Determine which categories to use for the tabs.
        if st.session_state.is_custom_mode:
            unique_categories = ["User Defined Entities"]
            st.markdown(f"**Custom Labels Detected: {', '.join(current_labels_in_df)}**")
        else:
            unique_categories = list(FIXED_CATEGORY_MAPPING.keys())

        # --- Section 2a: Detailed Tables by Category/Label ---
        with tab_category_details:
            st.markdown("#### Detailed Entities Table (Grouped by Category)")

            if st.session_state.is_custom_mode:
                # In custom mode every row has the same category, so the
                # tables are grouped by the actual label instead.
                tabs_category = st.tabs(current_labels_in_df)
                for label, tab in zip(current_labels_in_df, tabs_category):
                    df_label = df[df['label'] == label][detail_columns].sort_values(by='score', ascending=False)
                    with tab:
                        st.markdown(f"##### {label.capitalize()} Entities ({len(df_label)} total)")
                        st.dataframe(
                            df_label,
                            use_container_width=True,
                            column_config=score_column_config,
                        )
            else:
                # In fixed mode, group by the categories of FIXED_CATEGORY_MAPPING.
                tabs_category = st.tabs(unique_categories)
                for category, tab in zip(unique_categories, tabs_category):
                    df_category = df[df['category'] == category][detail_columns].sort_values(by='score', ascending=False)
                    with tab:
                        st.markdown(f"##### {category} Entities ({len(df_category)} total)")
                        if not df_category.empty:
                            st.dataframe(
                                df_category,
                                use_container_width=True,
                                column_config=score_column_config,
                            )
                        else:
                            st.info(f"No entities of category **{category}** were found in the text.")

            # --- Glossary of the table columns ---
            # One markdown bullet per line; a single-line string would render
            # all six entries as one run-on list item.
            with st.expander("See Glossary of tags"):
                st.write("\n".join((
                    "- **text**: ['entity extracted from your text data']",
                    "- **label**: ['label (tag) assigned to a given extracted entity (custom or fixed)']",
                    "- **category**: ['the grouping category (e.g., \"Locations\" or \"User Defined Entities\")']",
                    "- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']",
                    "- **start**: ['index of the start of the corresponding entity']",
                    "- **end**: ['index of the end of the corresponding entity']",
                )))

        # --- Section 2b: Treemap Visualization ---
        with tab_treemap_viz:
            st.markdown("#### Treemap: Entity Distribution")
            fig_treemap = px.treemap(
                df,
                path=[px.Constant("All Entities"), 'category', 'label', 'text'],
                values='score',
                color='category',
                color_discrete_sequence=px.colors.qualitative.Dark24,
            )
            fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
            st.plotly_chart(fig_treemap, use_container_width=True)

        # --- Section 3: Comparative Charts ---
        st.markdown("---")
        st.markdown("### 3. Comparative Charts")
        col1, col2, col3 = st.columns(3)
        grouped_counts = df['category'].value_counts().reset_index()
        grouped_counts.columns = ['Category', 'Count']

        # Qualitative palette when several categories exist, sequential otherwise.
        chart_color_seq = px.colors.qualitative.Pastel if len(grouped_counts) > 1 else px.colors.sequential.Cividis

        with col1:  # Pie chart
            fig_pie = px.pie(
                grouped_counts,
                values='Count',
                names='Category',
                title='Distribution of Entities by Category',
                color_discrete_sequence=chart_color_seq,
            )
            fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
            st.plotly_chart(fig_pie, use_container_width=True)

        with col2:  # Bar chart by category
            st.markdown("#### Entity Count by Category")
            fig_bar_category = px.bar(
                grouped_counts,
                x='Category',
                y='Count',
                color='Category',
                title='Total Entities per Category',
                color_discrete_sequence=chart_color_seq,
            )
            fig_bar_category.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
            st.plotly_chart(fig_bar_category, use_container_width=True)

        with col3:  # Bar chart for most frequent entities
            st.markdown("#### Top 10 Most Frequent Entities")
            word_counts = df['text'].value_counts().reset_index()
            word_counts.columns = ['Entity', 'Count']
            # Only entities that occur more than once are charted.
            repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
            if not repeating_entities.empty:
                fig_bar_freq = px.bar(
                    repeating_entities,
                    x='Entity',
                    y='Count',
                    title='Top 10 Most Frequent Entities',
                    color='Entity',
                    color_discrete_sequence=px.colors.sequential.Viridis,
                )
                fig_bar_freq.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350, showlegend=False)
                st.plotly_chart(fig_bar_freq, use_container_width=True)
            else:
                st.info("No entities were repeated enough for a Top 10 frequency chart.")

        # 4. Network Graph and Topic Modeling
        st.markdown("---")
        st.markdown("### 4. Advanced Analysis")
        col_network, col_topic = st.columns(2)

        with col_network:
            with st.expander("🔗 Entity Co-occurrence Network Graph", expanded=True):
                st.plotly_chart(generate_network_graph(df, st.session_state.last_text, entity_color_map), use_container_width=True)

        with col_topic:
            with st.expander("💡 Topic Modeling (LDA)", expanded=True):
                if df_topic_data is not None and not df_topic_data.empty:
                    st.plotly_chart(create_topic_word_bubbles(df_topic_data), use_container_width=True)
                    st.markdown("This chart visualizes the key words driving the identified topics, based on extracted entities.")
                else:
                    st.info("Topic Modeling requires at least two unique entities with a minimum frequency to perform statistical analysis.")

        # --- 5. White-Label Configuration (custom branding for the HTML report) ---
        st.markdown("---")
        st.markdown("### 5. White-Label Report Configuration 🎨")

        # The default title reflects the mode the results were produced in.
        default_report_title = f"{'Custom' if st.session_state.is_custom_mode else 'Fixed'} Entity Analysis Report"
        custom_report_title = st.text_input(
            "Report Title (for HTML Report)",
            value=default_report_title,
        )

        custom_branding_text = st.text_area(
            "Custom Branding Text/HTML (Appears below title in report)",
            value="<p>Analysis powered by **My Own Brand**.</p>",
            help="You can use basic HTML tags like <p>, <b>, <i>, and <a href='...'>. This replaces the default branding.",
        )

        # 6. Downloads (the custom title and branding are passed to the HTML report)
        st.markdown("---")
        st.markdown("### 6. Downloads")

        col_csv, col_html = st.columns(2)

        # CSV download
        csv_buffer = generate_entity_csv(df)
        with col_csv:
            st.download_button(
                label="⬇️ Download Entities as CSV",
                data=csv_buffer,
                file_name="ner_entities_report.csv",
                mime="text/csv",
                use_container_width=True,
            )

        # HTML download (with the white-label parameters)
        html_content = generate_html_report(
            df,
            st.session_state.last_text,
            st.session_state.elapsed_time,
            df_topic_data,
            entity_color_map,
            report_title=custom_report_title,
            branding_html=custom_branding_text,
        )
        html_bytes = html_content.encode('utf-8')
        with col_html:
            st.download_button(
                label="⬇️ Download Full HTML Report",
                data=html_bytes,
                file_name="ner_topic_full_report.html",
                mime="text/html",
                use_container_width=True,
            )