Spaces:

beppeinthesky
/

pnrr-data-processor

Sleeping

App Files Files Community

pnrr-data-processor / modules /cluster_page.py

beppeinthesky

feat: Enhance cluster analysis flow by initializing session state and displaying results

4c3bc18 13 days ago

raw

history blame contribute delete

16.2 kB

	import os
	import sys
	import logging
	import streamlit as st
	import pandas as pd
	from typing import Dict, Union, Any
	sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
	from modules import cluster_analysis

	# Relative path to the metadata workbook (.xlsx) shipped under modules/fixtures.
	# NOTE(review): nothing in the visible part of this file reads this constant
	# (load_metadata_columns looks for data/metadata.csv instead) — confirm it is
	# used elsewhere before relying on or removing it.
	METADATA_PATH = 'modules/fixtures/Scheda metadatazione_Progetti_Lozalizzazioni_PNRR_Italiadomani_V2.xlsx'


	def set_page_config() -> None:
	    """Apply the Streamlit page settings used by the cluster analysis page.

	    Sets the browser tab title, the page icon and a wide layout.

	    Returns:
	        None
	    """
	    page_settings = {
	        "page_title": "PNRR Cluster Analysis",
	        "page_icon": ":chart_with_upwards_trend:",
	        "layout": "wide",
	    }
	    st.set_page_config(**page_settings)


	def load_metadata_columns() -> Dict[str, str]:
	    """Collect the high-importance variables described in the metadata CSV.

	    The first existing file among a fixed list of candidate locations is
	    read. Rows are kept when their importance ranking is 4 or 5 and the
	    original variable name is present; rows without a description are
	    skipped.

	    Returns:
	        Dict[str, str]: Mapping of original variable name to its description.
	        Empty when no metadata file is found or when loading fails (in the
	        latter case an error is also shown in the Streamlit page).
	    """
	    ranking_col = 'Ranking importanza variabili (da 1, bassa importanza, a 5, massima importanza)'
	    variable_col = 'Variabile dei file originali (Italiadomani/Regione Veneto)'
	    description_col = 'Descrizione'
	    candidate_paths = (
	        '/home/giuseppe/IUAV - PNRR/semantic-filter/data/metadata.csv',
	        'data/metadata.csv',
	        '../data/metadata.csv',
	    )

	    try:
	        # Candidates are probed in order; the first one that exists wins.
	        found_path = next(
	            (candidate for candidate in candidate_paths if os.path.exists(candidate)),
	            None,
	        )
	        if found_path is None:
	            return {}

	        metadata_df = pd.read_csv(found_path)
	        is_important = metadata_df[ranking_col].isin([4, 5])
	        has_variable = metadata_df[variable_col].notna()
	        relevant_rows = metadata_df[is_important & has_variable]

	        # Lazy pairs: the description column is only touched per row, so an
	        # empty selection never looks the column up.
	        name_description_pairs = (
	            (row[variable_col], row[description_col])
	            for _, row in relevant_rows.iterrows()
	        )
	        return {
	            name: text
	            for name, text in name_description_pairs
	            if pd.notna(name) and pd.notna(text)
	        }
	    except Exception as e:
	        st.error(f"Errore nel caricamento dei metadati: {e}")
	        return {}


	def display_cluster_statistics(stats: Dict[str, Union[int, float]]) -> None:
	    """Render the four headline clustering metrics side by side.

	    Args:
	        stats: Clustering statistics; the keys ``total_projects``,
	            ``assigned_projects``, ``num_clusters`` and
	            ``avg_projects_per_cluster`` are read.

	    Returns:
	        None
	    """
	    total_col, assigned_col, count_col, average_col = st.columns(4)

	    # Metrics shown verbatim, in left-to-right order.
	    plain_metrics = (
	        (total_col, "Progetti Totali", 'total_projects'),
	        (assigned_col, "Progetti Assegnati", 'assigned_projects'),
	        (count_col, "Numero Cluster", 'num_clusters'),
	    )
	    for column, label, key in plain_metrics:
	        with column:
	            st.metric(label, stats[key])

	    # The average is the only value formatted (one decimal place).
	    with average_col:
	        average = stats['avg_projects_per_cluster']
	        st.metric("Progetti per Cluster (media)", f"{average:.1f}")


	def _parse_custom_blacklist(raw_text: str) -> list:
	    """Split the blacklist text area content into individual words.

	    Commas and line breaks are both treated as separators, so mixed input
	    (a comma-separated line followed by further lines) no longer produces
	    entries that contain embedded line breaks.

	    Args:
	        raw_text: Raw content of the blacklist text area.

	    Returns:
	        The non-empty, whitespace-stripped words in input order.
	    """
	    tokens = raw_text.replace(',', '\n').splitlines()
	    return [token.strip() for token in tokens if token.strip()]


	def _read_file_bytes(path: str) -> bytes:
	    """Return the full binary content of the file at ``path``."""
	    with open(path, 'rb') as handle:
	        return handle.read()


	def _render_clustering_parameters(row_count: int) -> tuple:
	    """Render the cluster-count widgets and return the analysis parameters.

	    Args:
	        row_count: Number of rows of the uploaded data; bounds the slider.

	    Returns:
	        Tuple ``(n_clusters, min_clusters, max_clusters)``. In automatic mode
	        ``n_clusters`` is ``None`` and the range comes from the two number
	        inputs; in fixed mode the range is ``(2, 20)``.
	    """
	    st.header("⚙️ Parametri Clustering")
	    col1, col2 = st.columns(2)

	    with col1:
	        auto_clusters = st.checkbox(
	            "Determinazione automatica del numero di cluster",
	            value=True,
	            help="Se selezionato, l'algoritmo determinerà automaticamente il numero ottimale di cluster"
	        )

	    n_clusters = None
	    min_clusters = 2
	    max_clusters = 20

	    with col2:
	        if not auto_clusters:
	            # With fewer than 100 rows, row_count // 5 falls below the
	            # default (20), and below min_value (2) for fewer than 10 rows;
	            # Streamlit rejects both. Clamp the upper bound (kept strictly
	            # above min_value) and the default to stay inside the range.
	            slider_max = max(3, min(100, row_count // 5))
	            n_clusters = st.slider(
	                "Numero di cluster",
	                min_value=2,
	                max_value=slider_max,
	                value=min(20, slider_max),
	                help="Numero fisso di cluster da creare"
	            )
	        else:
	            col2_1, col2_2 = st.columns(2)
	            with col2_1:
	                min_clusters = st.number_input(
	                    "Numero minimo di cluster",
	                    min_value=2,
	                    max_value=500,
	                    value=5,
	                    step=1,
	                    help="Numero minimo di cluster per la determinazione automatica"
	                )
	            with col2_2:
	                max_clusters = st.number_input(
	                    "Numero massimo di cluster",
	                    min_value=min_clusters,
	                    max_value=500,
	                    # The default must not drop below min_value once the
	                    # user raises the minimum above 30.
	                    value=max(30, min_clusters),
	                    step=1,
	                    help="Numero massimo di cluster per la determinazione automatica. Valori alti aumentano molto il tempo di calcolo."
	                )

	    return n_clusters, min_clusters, max_clusters


	def _render_blacklist_section() -> list:
	    """Render the custom blacklist input and return the parsed words."""
	    st.header("🚫 Blacklist Parole Personalizzata")
	    st.markdown("""
	Aggiungi parole che vuoi escludere completamente dall'analisi del clustering.
	Queste parole saranno rimosse dall'analisi per evitare che influenzino i risultati.
	""")

	    col1_bl, col2_bl = st.columns([2, 1])
	    with col1_bl:
	        custom_words_input = st.text_area(
	            "Parole da escludere (una per riga o separate da virgola):",
	            height=100,
	            placeholder="digitalizzazione\ninfrastruttura\nsanità\n\noppure: digitalizzazione, infrastruttura, sanità",
	            help="Inserisci parole che ritieni irrilevanti per il tuo contesto di analisi. "
	            "Puoi inserire una parola per riga oppure separare le parole con virgole."
	        )

	    with col2_bl:
	        st.markdown("Esempi di parole da escludere:")
	        st.markdown("- Termini troppo generici")
	        st.markdown("- Nomi di enti frequenti")
	        st.markdown("- Parole tecniche comuni")
	        st.markdown("- Location ricorrenti")

	    custom_blacklist = _parse_custom_blacklist(custom_words_input)
	    if custom_blacklist:
	        # Show at most five words, with an ellipsis when more were given.
	        preview = ', '.join(custom_blacklist[:5])
	        ellipsis = '...' if len(custom_blacklist) > 5 else ''
	        st.success(
	            f"✅ Saranno escluse {len(custom_blacklist)} parole personalizzate: {preview}{ellipsis}")

	    return custom_blacklist


	def _run_cluster_analysis(uploaded_file: Any, selected_columns: list,
	                          n_clusters: Any, min_clusters: int,
	                          max_clusters: int, custom_blacklist: list) -> None:
	    """Run the clustering and store everything needed to render it.

	    On success ``st.session_state['cluster_results']`` is replaced with a
	    dict holding the result frames, embeddings, labels, statistics, the
	    selected columns and the bytes of the two saved Excel files. On failure
	    an error is shown and logged and the previous results are left untouched.
	    """
	    with st.spinner("Analisi in corso... Questo potrebbe richiedere alcuni minuti."):
	        try:
	            # pd.read_excel has already consumed the upload stream on the
	            # first run; rewind it so the analysis reads from the start.
	            uploaded_file.seek(0)

	            cluster_df, data_with_clusters_df, embeddings, cluster_labels = cluster_analysis.analyze_clusters(
	                data_frame_path=uploaded_file,
	                selected_columns=selected_columns,
	                n_clusters=n_clusters,
	                max_clusters=max_clusters,
	                min_clusters=min_clusters,
	                custom_blacklist=custom_blacklist if custom_blacklist else None
	            )

	            cluster_analysis.save_results(cluster_df, data_with_clusters_df)
	            # Capture the saved files now instead of re-reading them on every
	            # rerun: the downloads then always match the results on screen,
	            # even if the files on disk are later replaced or removed.
	            # NOTE(review): assumes save_results writes to SAVE_PATH_CLUSTERS
	            # and SAVE_PATH_ORIGINAL, as the previous download code implied.
	            cluster_bytes = _read_file_bytes(cluster_analysis.SAVE_PATH_CLUSTERS)
	            data_bytes = _read_file_bytes(cluster_analysis.SAVE_PATH_ORIGINAL)
	            stats = cluster_analysis.get_cluster_statistics(cluster_df, data_with_clusters_df)

	            st.session_state['cluster_results'] = {
	                'cluster_df': cluster_df,
	                'data_with_clusters_df': data_with_clusters_df,
	                'embeddings': embeddings,
	                'cluster_labels': cluster_labels,
	                'stats': stats,
	                'selected_columns': selected_columns,
	                'cluster_bytes': cluster_bytes,
	                'data_bytes': data_bytes,
	            }

	        except Exception as e:
	            st.error(f"❌ Errore durante l'analisi: {str(e)}")
	            logging.error(f"Clustering error: {e}", exc_info=True)


	def _render_cluster_results(results: Dict[str, Any]) -> None:
	    """Render statistics, cluster details, downloads, PCA plot and preview.

	    Args:
	        results: The dict stored in ``st.session_state['cluster_results']``
	            by a successful analysis run.
	    """
	    excel_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

	    st.success("✅ Analisi completata con successo!")

	    st.header("📊 Statistiche Clustering")
	    display_cluster_statistics(results['stats'])

	    st.header("🎯 Risultati Cluster")
	    st.markdown(f"Sono stati identificati {len(results['cluster_df'])} cluster tematici:")

	    for _, row in results['cluster_df'].iterrows():
	        # cluster_id is shown 1-based in the expander title.
	        with st.expander(f"Cluster {row['cluster_id'] + 1}: {row['titolo']} ({row['num_progetti']} progetti)"):
	            st.write(f"Descrizione: {row['descrizione']}")
	            st.write(f"Parole chiave: {row['keywords']}")
	            st.write("Progetti di esempio:")
	            st.write(row['progetti_campione'])

	    st.header("📥 Download Risultati")
	    col1, col2 = st.columns(2)

	    with col1:
	        st.download_button(
	            label="📋 Scarica Sommario Cluster",
	            data=results['cluster_bytes'],
	            file_name="cluster_results.xlsx",
	            mime=excel_mime,
	            help="File Excel con titoli, descrizioni e statistiche dei cluster"
	        )

	    with col2:
	        st.download_button(
	            label="📊 Scarica Dati con Cluster ID",
	            data=results['data_bytes'],
	            file_name="data_with_clusters.xlsx",
	            mime=excel_mime,
	            help="File Excel originale con aggiunta colonna cluster_id per ogni progetto"
	        )

	    st.header("📊 Visualizzazione Cluster nello Spazio degli Embeddings")
	    st.markdown("""
	Questo grafico mostra una rappresentazione bidimensionale dei cluster ottenuti tramite PCA (Principal Component Analysis).
	Ogni punto rappresenta un progetto PNRR, colorato secondo il cluster di appartenenza.
	""")

	    # A failing plot must not hide the rest of the results.
	    try:
	        pca_fig = cluster_analysis.create_cluster_pca_plot(
	            results['embeddings'], results['cluster_labels'], results['cluster_df'])
	        st.plotly_chart(pca_fig, use_container_width=True)
	    except Exception as e:
	        st.error(f"❌ Errore nella creazione del plot PCA: {str(e)}")
	        logging.error(f"PCA plot error: {e}", exc_info=True)

	    st.header("👀 Anteprima Risultati")

	    cluster_counts = results['data_with_clusters_df']['cluster_id'].value_counts().sort_index()
	    cluster_counts_df = pd.DataFrame({
	        'Cluster ID': cluster_counts.index,
	        'Numero Progetti': cluster_counts.values
	    })

	    st.subheader("Distribuzione Progetti per Cluster")
	    st.bar_chart(cluster_counts_df.set_index('Cluster ID'))

	    st.subheader("Dati di Esempio con Cluster ID")
	    # Uses the columns of the run that produced the results, not the current
	    # multiselect value, which may have changed since.
	    preview_columns = results['selected_columns'] + ['cluster_id']
	    sample_data = results['data_with_clusters_df'][preview_columns].head(10)
	    st.dataframe(sample_data, use_container_width=True)


	def _render_expected_format() -> None:
	    """Render the help section describing the expected Excel layout."""
	    st.header("📋 Formato File Atteso")
	    st.markdown("""
	Il file Excel dovrebbe contenere i dati dei progetti PNRR con colonne come:
	- Titolo Progetto: Nome del progetto
	- Sintesi Progetto: Descrizione dettagliata
	- Descrizione Missione: Descrizione della missione PNRR
	- Descrizione Componente: Descrizione della componente
	- Soggetto Attuatore: Ente responsabile
	- Descrizione Comune: Località del progetto

	Più colonne testuali descrittive vengono selezionate, migliore sarà la qualità del clustering.
	""")


	def main() -> None:
	    """Render the cluster analysis page.

	    Handles the Excel upload, column selection, clustering parameters, the
	    custom blacklist, the analysis run and the display of stored results.
	    Results are kept in ``st.session_state['cluster_results']`` so they stay
	    visible across Streamlit reruns.

	    Returns:
	        None
	    """
	    if 'cluster_results' not in st.session_state:
	        st.session_state['cluster_results'] = None

	    st.title("🔍 Analisi Cluster Progetti PNRR")
	    st.markdown("""
	Questa sezione permette di identificare automaticamente gruppi tematici di progetti PNRR
	basati sul contenuto delle colonne selezionate. L'algoritmo utilizza tecniche di machine learning
	per raggruppare progetti simili e genera automaticamente titoli e descrizioni per ogni cluster.
	""")

	    st.header("📁 Carica il File Excel")
	    uploaded_file = st.file_uploader(
	        "Seleziona il file Excel contenente i progetti PNRR",
	        type=["xlsx"],
	        help="Carica un file Excel con i dati dei progetti PNRR"
	    )

	    if uploaded_file is None:
	        st.info("👆 Carica un file Excel per iniziare l'analisi cluster.")
	        _render_expected_format()
	        return

	    try:
	        # Cache the parsed DataFrame in session_state keyed by filename+size so
	        # pd.read_excel is only called once per uploaded file, not on every
	        # Streamlit re-run triggered by widget interactions.
	        file_key = f"{uploaded_file.name}_{uploaded_file.size}"
	        if st.session_state.get('cluster_file_key') != file_key:
	            df = pd.read_excel(uploaded_file)
	            st.session_state['cluster_df'] = df
	            st.session_state['cluster_file_key'] = file_key
	            # Results of a previous file no longer apply.
	            st.session_state['cluster_results'] = None
	        else:
	            df = st.session_state['cluster_df']

	        st.success(f"✅ File caricato con successo! Trovate {len(df)} righe e {len(df.columns)} colonne.")

	        st.header("🎯 Selezione Colonne per Clustering")
	        st.markdown("""
	Seleziona le colonne da utilizzare per il clustering. Le colonne testuali con informazioni
	descrittive dei progetti sono generalmente le più efficaci per identificare temi ricorrenti.
	""")

	        selected_columns = st.multiselect(
	            "Seleziona le colonne da utilizzare per il clustering:",
	            list(df.columns),
	            help="Seleziona almeno una colonna. Le colonne con testo descrittivo sono più efficaci."
	        )

	        if not selected_columns:
	            st.warning("⚠️ Seleziona almeno una colonna per procedere con il clustering.")
	            return

	        n_clusters, min_clusters, max_clusters = _render_clustering_parameters(len(df))
	        custom_blacklist = _render_blacklist_section()

	        if st.button("🚀 Avvia Analisi Cluster", type="primary"):
	            _run_cluster_analysis(
	                uploaded_file, selected_columns,
	                n_clusters, min_clusters, max_clusters, custom_blacklist)

	        stored_results = st.session_state.get('cluster_results')
	        if stored_results is not None:
	            _render_cluster_results(stored_results)

	    except Exception as e:
	        # This handler also covers errors raised while rendering the widgets
	        # and results above, so log the traceback to make the real cause
	        # visible even though the user-facing message mentions file loading.
	        st.error(f"❌ Errore nel caricamento del file: {str(e)}")
	        logging.error(f"Cluster page error: {e}", exc_info=True)


	if __name__ == "__main__":
	    # Executed as a script: configure logging first so that anything logged
	    # while the page renders is captured, then apply the page settings
	    # before main() issues its first Streamlit call.
	    logging.basicConfig(level=logging.INFO)
	    set_page_config()
	    main()