| | import streamlit as st |
| | import duckdb |
| | import pandas as pd |
| | import plotly.express as px |
| | import plotly.graph_objects as go |
| | import numpy as np |
| |
|
| | |
# --- Page configuration & header --------------------------------------------
# NOTE: st.set_page_config must be the first Streamlit call in the script.
_PAGE_CONFIG = dict(
    page_title="DuckDB Database Analyzer",
    page_icon="🦆",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_CONFIG)

st.title("🦆 DuckDB Database Analyzer")
st.markdown("**Analysez vos bases de données sans les importer !**")
| |
|
| | |
# --- Sidebar header and session-state bootstrap ------------------------------
st.sidebar.header("⚙️ Paramètres de connexion")

# Seed each session key exactly once; setdefault leaves already-present values
# untouched across reruns (same effect as the `if key not in session_state`
# pattern, written as a single loop).
_SESSION_DEFAULTS = {
    "reset_counter": 0,      # bumped to force the URL widget to remount
    "test_url": "",          # URL injected by the "URLs de test" buttons
    "analysis_done": False,  # True once an analysis result is stored
    "analysis_data": {},     # payload consumed by the results tabs
}
for _key, _default in _SESSION_DEFAULTS.items():
    st.session_state.setdefault(_key, _default)
| |
|
| | |
# URL input. The widget key embeds reset_counter so that a "Reset" click
# remounts the widget (clearing stale user text) instead of reusing it.
_url_widget_key = f"url_input_{st.session_state.reset_counter}"
url_input = st.sidebar.text_input(
    "📍 URL de la base de données",
    value=st.session_state.test_url,
    placeholder="https://example.com/data.parquet",
    help="Formats supportés : Parquet, CSV, JSON, HTTP, S3, etc.",
    key=_url_widget_key,
)
| |
|
| | |
# Reset control, pushed to the narrow right-hand sidebar column.
# FIX: the original called st.sidebar.button() inside `with col2:`, but
# st.sidebar.* always renders in the sidebar root and ignores the active
# column context, so the [4, 1] layout had no effect. st.button() inside
# `with col2:` actually places the widget in the column.
col1, col2 = st.sidebar.columns([4, 1])
with col2:
    if st.button("🗑️ Reset"):
        # Bump the counter to remount the URL input (clears its text),
        # wipe all analysis state, then rerun the script from the top.
        st.session_state.reset_counter += 1
        st.session_state.test_url = ""
        st.session_state.analysis_done = False
        st.session_state.analysis_data = {}
        st.rerun()
| |
|
| | |
# Advanced options: how many rows to fetch for the "Données" preview tab.
with st.sidebar.expander("🔧 Options avancées"):
    max_rows_sample = st.slider(
        "Lignes échantillon",
        min_value=50,
        max_value=2000,
        value=100,
    )
| |
|
| | |
# --- Analysis pipeline -------------------------------------------------------

def _profile_columns(sample: pd.DataFrame) -> pd.DataFrame:
    """Profile each column of *sample*: inferred type, completeness, example.

    Returns a DataFrame with columns Variable / Type / Type_Détaillé /
    Valeurs_Manquantes / Taux_Remplissage / Exemple.

    FIX: the fill rate is computed against the actual sample size instead of a
    hard-coded 1000, which produced wrong percentages (and negative-looking
    completeness) for tables holding fewer than 1000 rows.
    """
    sample_size = len(sample)
    columns_info = []
    for col in sample.columns:
        col_data = sample[col].dropna()

        # Map the pandas dtype of the non-null values to a coarse label.
        if len(col_data) == 0:
            col_type, detail_type = "UNKNOWN", "VIDE"
        elif pd.api.types.is_integer_dtype(col_data):
            col_type, detail_type = "INTEGER", "ENTIER"
        elif pd.api.types.is_float_dtype(col_data):
            col_type, detail_type = "FLOAT", "DÉCIMAL"
        elif pd.api.types.is_datetime64_any_dtype(col_data):
            col_type, detail_type = "DATETIME", "DATE/HEURE"
        elif pd.api.types.is_bool_dtype(col_data):
            col_type, detail_type = "BOOLEAN", "BOOLEEN"
        else:
            # Object dtype: distinguish numeric-looking text from real text.
            col_type = "TEXT"
            try:
                pd.to_numeric(col_data, errors='raise')
                detail_type = "NUMÉRIQUE"
            except (ValueError, TypeError):  # FIX: was a bare except
                detail_type = "TEXTE"

        null_count = int(sample[col].isna().sum())
        fill_rate = ((sample_size - null_count) / sample_size * 100) if sample_size else 0.0
        example = str(col_data.iloc[0])[:30] if len(col_data) > 0 else "N/A"

        columns_info.append({
            'Variable': col,
            'Type': col_type,
            'Type_Détaillé': detail_type,
            'Valeurs_Manquantes': null_count,
            'Taux_Remplissage': round(fill_rate, 1),
            'Exemple': example,
        })
    return pd.DataFrame(columns_info)


if st.sidebar.button("🚀 Analyser la base de données", type="primary"):
    if url_input:
        # Forget any injected test URL so a later widget remount starts empty.
        st.session_state.test_url = ""

        # SECURITY: the URL is interpolated into SQL string literals below;
        # double any single quote so it cannot break out of the '...' literal.
        safe_url = url_input.replace("'", "''")

        analysis_ok = False
        with st.spinner("🔍 Analyse en cours..."):
            con = duckdb.connect()
            try:
                con.execute("INSTALL httpfs; LOAD httpfs;")

                # Probe each reader in turn; DuckDB raises when the format
                # does not match, so the first COUNT(*) that succeeds wins.
                formats_to_try = [
                    ("parquet", f"read_parquet('{safe_url}')"),
                    ("csv", f"read_csv_auto('{safe_url}')"),
                    ("json", f"read_json_auto('{safe_url}')"),
                ]

                read_func = ""
                detected_format = ""
                for fmt_name, fmt in formats_to_try:
                    try:
                        result = con.execute(f"SELECT COUNT(*) FROM {fmt}").fetchone()
                        if result and result[0] is not None:
                            read_func = fmt
                            detected_format = fmt_name
                            st.success(f"✅ Format détecté : {fmt_name}")
                            break
                    except Exception:  # FIX: was a bare except
                        continue

                if not read_func:
                    # FIX: the original called st.stop() here; Streamlit's
                    # stop/rerun control-flow exceptions derive from Exception
                    # and were swallowed by the outer handler, which then
                    # displayed a spurious second error. Plain branching
                    # avoids raising anything.
                    st.error("❌ Impossible de lire le fichier. Vérifiez l'URL.")
                else:
                    total_rows = con.execute(
                        f"SELECT COUNT(*) FROM {read_func}"
                    ).fetchone()[0]

                    # One row is enough to learn the column count cheaply.
                    sample_df = con.execute(
                        f"SELECT * FROM {read_func} LIMIT 1"
                    ).df()
                    num_columns = len(sample_df.columns)

                    # Best-effort size estimate from parquet metadata
                    # (~4.5 MB per row group heuristic); other formats keep "N/A".
                    file_size = "N/A"
                    if detected_format == "parquet":
                        try:
                            metadata_result = con.execute(f"""
                                SELECT COUNT(*) as row_groups
                                FROM parquet_metadata('{safe_url}')
                            """).fetchone()
                            if metadata_result:
                                estimated_mb = metadata_result[0] * 4.5
                                file_size = f"~{estimated_mb:.0f} MB"
                        except Exception:
                            pass  # size is cosmetic; ignore metadata failures

                    # Type/completeness profile over at most 1000 rows.
                    sample_1000 = con.execute(
                        f"SELECT * FROM {read_func} LIMIT 1000"
                    ).df()
                    columns_df = _profile_columns(sample_1000)

                    # Preview rows for the "Données" tab.
                    sample_display = con.execute(
                        f"SELECT * FROM {read_func} LIMIT {max_rows_sample}"
                    ).df()

                    st.session_state.analysis_data = {
                        'total_rows': total_rows,
                        'num_columns': num_columns,
                        'file_size': file_size,
                        'detected_format': detected_format,
                        'columns_df': columns_df,
                        'sample_display': sample_display,
                        'read_func': read_func,
                        'url_input': url_input,
                    }
                    st.session_state.analysis_done = True
                    analysis_ok = True

            except Exception as e:
                st.error(f"❌ Erreur lors de l'analyse : {str(e)}")
                st.info("💡 Vérifiez que l'URL est accessible et publique")
            finally:
                # FIX: the connection leaked whenever an error occurred
                # before con.close(); finally guarantees the release.
                con.close()

        if analysis_ok:
            st.success("✅ **Analyse terminée avec succès !**")
            # FIX: st.rerun() raises a control-flow exception; calling it
            # outside the try block keeps it clear of the broad handler.
            st.rerun()
    else:
        st.warning("⚠️ Veuillez saisir une URL valide")
| |
|
| | |
# --- Sample URLs for quick testing -------------------------------------------
with st.sidebar.expander("🧪 URLs de test"):
    st.markdown("**URL fonctionnelles pour tester :**")

    test_urls = [
        ("SIREN Entreprises France",
         "https://object.files.data.gouv.fr/data-pipeline-open/siren/stock/StockUniteLegale_utf8.parquet"),
        ("NYC Taxi Oct 2025",
         "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-10.parquet"),
        ("Open Data Paris Ilôts de fraîcheur",
         r"https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/ilots-de-fraicheur-equipements-activites/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B"),
    ]

    for idx, (label, sample_url) in enumerate(test_urls):
        clicked = st.button(f"📊 {label}", key=f"test_{idx}", use_container_width=True)
        if clicked:
            # Remount the URL widget so its `value=` picks up the new URL,
            # then rerun to refresh the sidebar.
            st.session_state.reset_counter += 1
            st.session_state.test_url = sample_url
            st.rerun()
| |
|
| | |
# --- Results area: rendered once an analysis is stored in session ------------
if st.session_state.analysis_done:
    data = st.session_state.analysis_data

    tab1, tab2, tab3, tab4 = st.tabs(["📊 Dashboard", "📋 Variables", "💾 Données", "💻 Code"])

    # ---- Tab 1: overview dashboard ----
    with tab1:
        # Completeness statistics derived from the column profile.
        fill_rates = data['columns_df']['Taux_Remplissage']
        avg_fill = fill_rates.mean()
        missing_cols = int((fill_rates < 100).sum())
        complete_cols = len(data['columns_df']) - missing_cols

        # CSS shared by the uniform metric cards below.
        st.markdown("""
        <style>
        .metric-card {
            background-color: #f0f2f6;
            border-radius: 10px;
            padding: 15px;
            text-align: center;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            height: 100px;
            display: flex;
            flex-direction: column;
            justify-content: center;
            align-items: center;
        }
        .metric-label {
            font-size: 0.85em;
            color: #666;
            margin-bottom: 5px;
            line-height: 1.2;
            min-height: 32px;
            display: flex;
            align-items: center;
            justify-content: center;
        }
        .metric-value {
            font-size: 1.8em;
            font-weight: bold;
            color: #262730;
        }
        </style>
        """, unsafe_allow_html=True)

        # One (label, pre-formatted value) pair per card, rendered in a
        # single loop instead of seven copy-pasted column blocks.
        metric_cards = [
            ("📊 Observations", f"{data['total_rows']:,}"),
            ("📋 Colonnes", f"{data['num_columns']}"),
            ("💾 Taille fichier", f"{data['file_size']}"),
            ("📄 Format", data['detected_format'].upper()),
            ("✅ Taux moyen", f"{avg_fill:.1f}%"),
            ("⚠️ Colonnes incomplètes", f"{missing_cols}"),
            ("✔️ Colonnes complètes", f"{complete_cols}"),
        ]
        for slot, (label, value) in zip(st.columns(7), metric_cards):
            with slot:
                st.markdown(f"""
                <div class="metric-card">
                    <div class="metric-label">{label}</div>
                    <div class="metric-value">{value}</div>
                </div>
                """, unsafe_allow_html=True)

        st.markdown("<br>", unsafe_allow_html=True)

        chart_left, chart_right = st.columns([2, 1])

        with chart_left:
            # Horizontal bar chart: one bar per variable, colored by fill rate.
            fill_chart = px.bar(
                data['columns_df'].sort_values('Taux_Remplissage'),
                y='Variable',
                x='Taux_Remplissage',
                title="Taux de remplissage par variable (1000 premières lignes)",
                color='Taux_Remplissage',
                color_continuous_scale='RdYlGn',
                orientation='h',
                range_color=[0, 100],
                height=500,
            )
            fill_chart.update_layout(
                showlegend=False,
                xaxis_title="Taux de Remplissage (%)",
                yaxis_title="",
                margin=dict(t=50, b=50, l=200, r=20),
            )
            fill_chart.update_traces(marker_line_width=0, marker_cornerradius=5)
            # Force a tick (label) for every variable name.
            fill_chart.update_yaxes(tickmode='linear')
            st.plotly_chart(fill_chart, use_container_width=True)

        with chart_right:
            # Pie chart of the detailed type distribution.
            type_counts = data['columns_df']['Type_Détaillé'].value_counts()
            type_chart = px.pie(
                values=type_counts.values,
                names=type_counts.index,
                title="Répartition des types",
            )
            type_chart.update_traces(textposition='inside', textinfo='percent+label')
            type_chart.update_layout(height=500)
            st.plotly_chart(type_chart, use_container_width=True)

    # ---- Tab 2: per-variable structure table ----
    with tab2:
        st.header("📋 Structure des Variables")

        ordered = ['Variable', 'Type', 'Type_Détaillé', 'Valeurs_Manquantes',
                   'Taux_Remplissage', 'Exemple']
        display_df = data['columns_df'][ordered].copy()
        display_df.columns = ['Variable', 'Type', 'Type Détaillé',
                              'Valeurs Manquantes (sur 1000)',
                              'Taux de Remplissage (%)', 'Exemple']
        st.dataframe(display_df, use_container_width=True, height=600)

    # ---- Tab 3: raw data preview ----
    with tab3:
        st.header("💾 Échantillon des Données")

        count_col, caption_col = st.columns([1, 3])
        with count_col:
            st.metric("Lignes affichées", f"{len(data['sample_display']):,}")
        with caption_col:
            st.caption(f"sur {data['total_rows']:,} total")

        st.dataframe(data['sample_display'], use_container_width=True, height=600)

    # ---- Tab 4: ready-to-paste Python snippet ----
    with tab4:
        st.header("💻 Code Python prêt à l'emploi")

        # {{...}} renders as a literal {...} inside the generated snippet.
        snippet = f"""
import duckdb

# Connexion
con = duckdb.connect()
con.execute("INSTALL httpfs; LOAD httpfs;")

# Lecture des données
df = con.execute("SELECT * FROM {data['read_func']} LIMIT 1000").df()
print(f"Forme: {{df.shape}}")
print("Colonnes:", df.columns.tolist())

# Nombre total de lignes
total_rows = con.execute("SELECT COUNT(*) FROM {data['read_func']}").fetchone()[0]
print(f"Total lignes: {{total_rows:,}}")
"""
        st.code(snippet, language="python")

else:
    st.info("👆 Veuillez saisir une URL et cliquer sur **Analyser la base de données** pour commencer l'analyse")