# dataanalyst / data_quality.py
# RamAi2026's picture
# Upload 13 files
# da8e446 verified
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import IsolationForest
# A numeric column is "outlier-heavy" when more than this share of its rows
# falls outside the 1.5 * IQR fences.
_OUTLIER_HEAVY_FRACTION = 0.10
# Points removed from the type-consistency score per numeric-looking text column.
_TYPE_PENALTY = 5
# Points removed from the outlier-impact score per outlier-heavy column.
_OUTLIER_PENALTY = 10


def _iqr_outlier_count(series):
    """Return how many values of *series* lie outside the 1.5 * IQR fences."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    outside = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
    return int(outside.sum())


def _looks_numeric(series):
    """Return True when every non-null value of *series* parses as a number.

    A column with no non-null values returns False: there is nothing to
    convert, and it is already reported through the missing-value checks.
    """
    values = series.dropna()
    if values.empty:
        return False
    try:
        pd.to_numeric(values, errors='raise')
    except (ValueError, TypeError, OverflowError):
        return False
    return True


def quality_report(df: pd.DataFrame) -> None:
    """Render a data quality report for *df* in the current Streamlit app.

    The report shows an overall score (the mean of completeness, uniqueness,
    type consistency and outlier impact), a per-column table, charts for
    missing values and dtypes, Isolation Forest anomaly detection on the
    numeric columns, a list of recommendations and a download button for a
    plain-text summary.

    Args:
        df: The dataset to assess. It is only read, never modified.

    Returns:
        None. All output is written to the Streamlit page. For a DataFrame
        with no rows or no columns a warning is shown and nothing else.
    """
    # Built by concatenation so the HTML never picks up source indentation
    # (indented lines can be rendered as a Markdown code block).
    st.markdown(
        "<div style='text-align: center; margin-bottom: 2rem;'>"
        "<h2>📋 Data Quality Report</h2>"
        "<p style='color: gray;'>Comprehensive data quality assessment</p>"
        "</div>",
        unsafe_allow_html=True,
    )

    # Every ratio below divides by the row count or the cell count.
    if df.empty:
        st.warning("The dataset is empty; no quality report can be generated.")
        return

    n_rows = len(df)
    missing_counts = df.isnull().sum()
    total_missing = int(missing_counts.sum())
    duplicate_rows = int(df.duplicated().sum())
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Computed once and shared by the score, the table and the
    # recommendations so the three always agree.
    outlier_counts = {col: _iqr_outlier_count(df[col]) for col in numeric_cols}
    outlier_cols = [
        col for col in numeric_cols
        if outlier_counts[col] > n_rows * _OUTLIER_HEAVY_FRACTION
    ]
    numeric_like_cols = [
        col for col in df.columns
        if df[col].dtype == 'object' and _looks_numeric(df[col])
    ]

    # Overall quality score
    st.subheader("📊 Overall Data Quality Score")

    completeness = (1 - total_missing / df.size) * 100
    uniqueness = (1 - duplicate_rows / n_rows) * 100
    # Penalties are floored at 0 so wide frames cannot produce negative scores.
    type_consistency = max(0, 100 - _TYPE_PENALTY * len(numeric_like_cols))
    outlier_impact = max(0, 100 - _OUTLIER_PENALTY * len(outlier_cols))
    quality_score = (completeness + uniqueness + type_consistency + outlier_impact) / 4

    # Display gauge
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=quality_score,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "Quality Score"},
        gauge={
            'axis': {'range': [None, 100]},
            'bar': {'color': "#2E86AB"},
            'steps': [
                {'range': [0, 50], 'color': "#FF6B6B"},
                {'range': [50, 70], 'color': "#FFD93D"},
                {'range': [70, 85], 'color': "#6BCB77"},
                {'range': [85, 100], 'color': "#4CAF50"},
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': 90,
            },
        },
    ))
    st.plotly_chart(fig, use_container_width=True)

    # Quality metrics cards
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Completeness", f"{completeness:.1f}%",
                  delta=None, delta_color="normal")
    with col2:
        st.metric("Uniqueness", f"{uniqueness:.1f}%",
                  delta=None, delta_color="normal")
    with col3:
        st.metric("Type Consistency", f"{type_consistency:.1f}%",
                  delta=None, delta_color="normal")
    with col4:
        st.metric("Outlier Impact", f"{outlier_impact:.1f}%",
                  delta=None, delta_color="inverse")

    # Detailed quality report
    st.subheader("🔍 Detailed Quality Report")
    unique_counts = [df[col].nunique() for col in df.columns]
    quality_df = pd.DataFrame(
        {
            'Column': df.columns,
            # Stringified like the dtype pie chart below, so the table holds
            # plain text rather than dtype objects.
            'Data Type': df.dtypes.astype(str).values,
            'Missing Count': missing_counts.values,
            'Missing %': (missing_counts.values / n_rows * 100).round(2),
            'Unique Values': unique_counts,
            'Unique %': [round(n / n_rows * 100, 2) for n in unique_counts],
            'Duplicate Values?': [df[col].duplicated().any() for col in df.columns],
            # Non-numeric columns have no IQR fences and are reported as 0.
            'Outliers': [outlier_counts.get(col, 0) for col in df.columns],
        },
        index=df.columns,
    )
    st.dataframe(
        quality_df.style.background_gradient(subset=['Missing %', 'Outliers'], cmap='YlOrRd'),
        use_container_width=True,
    )

    # Visualizations
    st.subheader("📊 Quality Visualizations")
    col1, col2 = st.columns(2)
    missing_only = missing_counts[missing_counts > 0]
    with col1:
        # Missing values bar chart
        if not missing_only.empty:
            fig = px.bar(x=missing_only.index, y=missing_only.values,
                         title="Missing Values by Column",
                         labels={'x': 'Column', 'y': 'Missing Count'})
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.success("No missing values found!")
    with col2:
        # Data type distribution
        dtype_counts = df.dtypes.value_counts()
        fig = px.pie(values=dtype_counts.values, names=dtype_counts.index.astype(str),
                     title="Data Type Distribution")
        st.plotly_chart(fig, use_container_width=True)

    # Outlier detection with Isolation Forest
    st.subheader("🕵️ Anomaly Detection")
    if numeric_cols:
        contamination = st.slider("Expected outlier proportion", 0.01, 0.5, 0.1, 0.01)
        iso_forest = IsolationForest(contamination=contamination, random_state=42)
        # fit_predict labels anomalies as -1 and inliers as 1.
        labels = iso_forest.fit_predict(df[numeric_cols].fillna(0))
        n_outliers = (labels == -1).sum()
        st.write(f"**Detected Anomalies:** {n_outliers} rows ({n_outliers/n_rows*100:.2f}%)")

        # Visualize on the first two or three numeric columns only.
        if len(numeric_cols) >= 2:
            df_with_outliers = df[numeric_cols[:3]].copy()
            df_with_outliers['is_outlier'] = labels
            if len(numeric_cols) == 2:
                fig = px.scatter(df_with_outliers, x=numeric_cols[0], y=numeric_cols[1],
                                 color='is_outlier', title="Anomaly Detection Results",
                                 color_continuous_scale=['blue', 'red'])
            else:
                fig = px.scatter_3d(df_with_outliers, x=numeric_cols[0],
                                    y=numeric_cols[1], z=numeric_cols[2],
                                    color='is_outlier', title="Anomaly Detection Results (3D)",
                                    color_continuous_scale=['blue', 'red'])
            st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("No numeric columns available for anomaly detection")

    # Recommendations
    st.subheader("💡 Quality Improvement Recommendations")
    recommendations = []

    # Column labels are stringified before joining: they may be ints or tuples.
    missing_cols = missing_only.index.tolist()
    if missing_cols:
        shown = ', '.join(str(col) for col in missing_cols[:5])
        recommendations.append(
            f"• Handle missing values in {len(missing_cols)} columns: {shown}"
        )

    if duplicate_rows > 0:
        recommendations.append(f"• Remove {duplicate_rows} duplicate rows")

    if outlier_cols:
        shown = ', '.join(str(col) for col in outlier_cols[:3])
        recommendations.append(f"• Investigate outliers in: {shown}")

    # Same columns that were penalized in the type-consistency score.
    for col in numeric_like_cols:
        recommendations.append(f"• Convert '{col}' to numeric type")

    if recommendations:
        for rec in recommendations:
            st.markdown(rec)
    else:
        st.success("✅ Dataset quality looks good! No major issues detected.")

    # Download quality report (assembled line by line so the text does not
    # depend on how this function is indented).
    report_text = "\n".join([
        "",
        "DATA QUALITY REPORT",
        "===================",
        f"Overall Quality Score: {quality_score:.1f}/100",
        "Metrics:",
        f"• Completeness: {completeness:.1f}%",
        f"• Uniqueness: {uniqueness:.1f}%",
        f"• Type Consistency: {type_consistency:.1f}%",
        f"• Outlier Impact: {outlier_impact:.1f}%",
        "Dataset Statistics:",
        f"• Rows: {df.shape[0]:,}",
        f"• Columns: {df.shape[1]}",
        f"• Missing Values: {total_missing:,}",
        f"• Duplicate Rows: {duplicate_rows:,}",
        "Recommendations:",
        "\n".join(recommendations),
        "",
    ])
    st.download_button(
        label="📥 Download Quality Report",
        data=report_text,
        file_name="data_quality_report.txt",
        mime="text/plain",
        use_container_width=True,
    )