Spaces:
No application file
No application file
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from sklearn.ensemble import IsolationForest | |
| def quality_report(df): | |
| st.markdown(""" | |
| <div style='text-align: center; margin-bottom: 2rem;'> | |
| <h2>π Data Quality Report</h2> | |
| <p style='color: gray;'>Comprehensive data quality assessment</p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Overall quality score | |
| st.subheader("π Overall Data Quality Score") | |
| # Calculate various quality metrics | |
| completeness = (1 - df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100 | |
| uniqueness = (1 - df.duplicated().sum() / df.shape[0]) * 100 | |
| # Data type consistency | |
| type_consistency = 100 | |
| for col in df.columns: | |
| if df[col].dtype == 'object': | |
| # Check if column has consistent types | |
| try: | |
| pd.to_numeric(df[col], errors='raise') | |
| # If convertible to numeric, it might be inconsistent | |
| type_consistency -= 5 | |
| except: | |
| pass | |
| # Outlier impact | |
| outlier_impact = 100 | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns | |
| if len(numeric_cols) > 0: | |
| for col in numeric_cols: | |
| Q1 = df[col].quantile(0.25) | |
| Q3 = df[col].quantile(0.75) | |
| IQR = Q3 - Q1 | |
| outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)] | |
| outlier_pct = len(outliers) / len(df) * 100 | |
| if outlier_pct > 10: | |
| outlier_impact -= 10 | |
| quality_score = (completeness + uniqueness + type_consistency + outlier_impact) / 4 | |
| # Display gauge | |
| fig = go.Figure(go.Indicator( | |
| mode="gauge+number", | |
| value=quality_score, | |
| domain={'x': [0, 1], 'y': [0, 1]}, | |
| title={'text': "Quality Score"}, | |
| gauge={ | |
| 'axis': {'range': [None, 100]}, | |
| 'bar': {'color': "#2E86AB"}, | |
| 'steps': [ | |
| {'range': [0, 50], 'color': "#FF6B6B"}, | |
| {'range': [50, 70], 'color': "#FFD93D"}, | |
| {'range': [70, 85], 'color': "#6BCB77"}, | |
| {'range': [85, 100], 'color': "#4CAF50"} | |
| ], | |
| 'threshold': { | |
| 'line': {'color': "red", 'width': 4}, | |
| 'thickness': 0.75, | |
| 'value': 90 | |
| } | |
| })) | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Quality metrics cards | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric("Completeness", f"{completeness:.1f}%", | |
| delta=None, delta_color="normal") | |
| with col2: | |
| st.metric("Uniqueness", f"{uniqueness:.1f}%", | |
| delta=None, delta_color="normal") | |
| with col3: | |
| st.metric("Type Consistency", f"{type_consistency:.1f}%", | |
| delta=None, delta_color="normal") | |
| with col4: | |
| st.metric("Outlier Impact", f"{outlier_impact:.1f}%", | |
| delta=None, delta_color="inverse") | |
| # Detailed quality report | |
| st.subheader("π Detailed Quality Report") | |
| quality_df = pd.DataFrame({ | |
| 'Column': df.columns, | |
| 'Data Type': df.dtypes, | |
| 'Missing Count': df.isnull().sum().values, | |
| 'Missing %': (df.isnull().sum().values / len(df) * 100).round(2), | |
| 'Unique Values': [df[col].nunique() for col in df.columns], | |
| 'Unique %': [round((df[col].nunique() / len(df) * 100),2) for col in df.columns], | |
| 'Duplicate Values?': [df[col].duplicated().any() for col in df.columns] | |
| }) | |
| # Add outlier info for numeric columns | |
| outlier_info = [] | |
| for col in df.columns: | |
| if df[col].dtype in ['int64', 'float64']: | |
| Q1 = df[col].quantile(0.25) | |
| Q3 = df[col].quantile(0.75) | |
| IQR = Q3 - Q1 | |
| outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)] | |
| outlier_info.append(len(outliers)) | |
| else: | |
| outlier_info.append(0) | |
| quality_df['Outliers'] = outlier_info | |
| st.dataframe(quality_df.style.background_gradient(subset=['Missing %', 'Outliers'], cmap='YlOrRd'), | |
| use_container_width=True) | |
| # Visualizations | |
| st.subheader("π Quality Visualizations") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| # Missing values bar chart | |
| missing_cols = df.isnull().sum()[df.isnull().sum() > 0] | |
| if len(missing_cols) > 0: | |
| fig = px.bar(x=missing_cols.index, y=missing_cols.values, | |
| title="Missing Values by Column", | |
| labels={'x': 'Column', 'y': 'Missing Count'}) | |
| st.plotly_chart(fig, use_container_width=True) | |
| else: | |
| st.success("No missing values found!") | |
| with col2: | |
| # Data type distribution | |
| dtype_counts = df.dtypes.value_counts() | |
| fig = px.pie(values=dtype_counts.values, names=dtype_counts.index.astype(str), | |
| title="Data Type Distribution") | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Outlier detection with Isolation Forest | |
| st.subheader("π΅οΈ Anomaly Detection") | |
| numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() | |
| if len(numeric_cols) > 0: | |
| contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01) | |
| iso_forest = IsolationForest(contamination=contamination, random_state=42) | |
| outliers = iso_forest.fit_predict(df[numeric_cols].fillna(0)) | |
| n_outliers = (outliers == -1).sum() | |
| st.write(f"**Detected Anomalies:** {n_outliers} rows ({n_outliers/len(df)*100:.2f}%)") | |
| # Visualize outliers (if 2 or 3 numeric columns) | |
| if len(numeric_cols) >= 2: | |
| df_with_outliers = df[numeric_cols[:3]].copy() | |
| df_with_outliers['is_outlier'] = outliers | |
| if len(numeric_cols) == 2: | |
| fig = px.scatter(df_with_outliers, x=numeric_cols[0], y=numeric_cols[1], | |
| color='is_outlier', title="Anomaly Detection Results", | |
| color_continuous_scale=['blue', 'red']) | |
| st.plotly_chart(fig, use_container_width=True) | |
| elif len(numeric_cols) >= 3: | |
| fig = px.scatter_3d(df_with_outliers, x=numeric_cols[0], | |
| y=numeric_cols[1], z=numeric_cols[2], | |
| color='is_outlier', title="Anomaly Detection Results (3D)", | |
| color_continuous_scale=['blue', 'red']) | |
| st.plotly_chart(fig, use_container_width=True) | |
| else: | |
| st.info("No numeric columns available for anomaly detection") | |
| # Recommendations | |
| st.subheader("π‘ Quality Improvement Recommendations") | |
| recommendations = [] | |
| # Missing value recommendations | |
| missing_cols = df.columns[df.isnull().any()].tolist() | |
| if missing_cols: | |
| recommendations.append(f"β’ Handle missing values in {len(missing_cols)} columns: {', '.join(missing_cols[:5])}") | |
| # Duplicate recommendations | |
| if df.duplicated().sum() > 0: | |
| recommendations.append(f"β’ Remove {df.duplicated().sum()} duplicate rows") | |
| # Outlier recommendations | |
| outlier_cols = [] | |
| for col in numeric_cols: | |
| Q1 = df[col].quantile(0.25) | |
| Q3 = df[col].quantile(0.75) | |
| IQR = Q3 - Q1 | |
| outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)] | |
| if len(outliers) > len(df) * 0.1: # More than 10% outliers | |
| outlier_cols.append(col) | |
| if outlier_cols: | |
| recommendations.append(f"β’ Investigate outliers in: {', '.join(outlier_cols[:3])}") | |
| # Data type recommendations | |
| for col in df.columns: | |
| if df[col].dtype == 'object': | |
| # Check if column should be numeric | |
| try: | |
| pd.to_numeric(df[col].dropna().iloc[:100]) | |
| recommendations.append(f"β’ Convert '{col}' to numeric type") | |
| except: | |
| pass | |
| if recommendations: | |
| for rec in recommendations: | |
| st.markdown(rec) | |
| else: | |
| st.success("β Dataset quality looks good! No major issues detected.") | |
| # Download quality report | |
| report_text = f""" | |
| DATA QUALITY REPORT | |
| =================== | |
| Overall Quality Score: {quality_score:.1f}/100 | |
| Metrics: | |
| β’ Completeness: {completeness:.1f}% | |
| β’ Uniqueness: {uniqueness:.1f}% | |
| β’ Type Consistency: {type_consistency:.1f}% | |
| β’ Outlier Impact: {outlier_impact:.1f}% | |
| Dataset Statistics: | |
| β’ Rows: {df.shape[0]:,} | |
| β’ Columns: {df.shape[1]} | |
| β’ Missing Values: {df.isnull().sum().sum():,} | |
| β’ Duplicate Rows: {df.duplicated().sum():,} | |
| Recommendations: | |
| {chr(10).join(recommendations)} | |
| """ | |
| st.download_button( | |
| label="π₯ Download Quality Report", | |
| data=report_text, | |
| file_name="data_quality_report.txt", | |
| mime="text/plain", | |
| use_container_width=True | |
| ) |