# Source: Hugging Face Space by charanKompala
# Commit: "Rename app (3).py to app.py" (d3a5eeb, verified)
import gradio as gr
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
def smart_dataset_analysis(file):
    """Profile an uploaded CSV/Excel dataset and return a markdown report.

    The report covers: basic profiling, heuristic target-column detection,
    ML model recommendations, preprocessing advice, visualization ideas and
    a rough implementation roadmap.

    Args:
        file: Uploaded file object exposing a ``.name`` path (as Gradio's
            ``gr.File`` provides), or a falsy value when nothing was uploaded.

    Returns:
        str: Markdown report, or a short error message if reading/analysis fails.
    """
    if not file:
        return "Upload a dataset to get intelligent analysis."
    try:
        # Smart file reading: CSV by extension, anything else treated as Excel.
        df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)

        # Deep dataset profiling
        profile = {
            'shape': df.shape,
            'numeric_cols': list(df.select_dtypes(include=[np.number]).columns),
            'categorical_cols': list(df.select_dtypes(include=['object', 'category']).columns),
            'datetime_cols': list(df.select_dtypes(include=['datetime64']).columns),
            'missing_data': df.isnull().sum().to_dict(),
            'duplicates': df.duplicated().sum(),
            'memory_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
        }

        # Smart target detection: score every column on keyword match,
        # statistical shape and position (targets are often the last columns).
        target_scores = {}
        target_keywords = ['target', 'label', 'class', 'outcome', 'price', 'sales', 'churn', 'rating', 'status']
        for col in df.columns:
            score = 0
            col_lower = col.lower()
            # Keyword matching
            score += 15 if any(kw in col_lower for kw in target_keywords) else 0
            # Statistical characteristics
            if col in profile['numeric_cols']:
                unique_ratio = df[col].nunique() / len(df)
                if 0.02 < unique_ratio < 0.95:  # Good target range
                    score += 10
                if df[col].std() > 0:  # Has variance
                    score += 5
            elif col in profile['categorical_cols']:
                unique_count = df[col].nunique()
                if 2 <= unique_count <= 20:  # Good classification range
                    score += 12
                score += 5 if unique_count == 2 else 0  # Binary bonus
            # Position bias (targets often at end)
            if list(df.columns).index(col) >= len(df.columns) - 3:
                score += 3
            target_scores[col] = score

        # Keep the three highest-scoring candidates; recommend those above 8.
        top_targets = sorted(target_scores.items(), key=lambda x: x[1], reverse=True)[:3]
        smart_targets = [col for col, score in top_targets if score > 8]

        # Generate ML recommendations for each recommended target (top 2).
        ml_recommendations = []
        for target in smart_targets[:2]:
            target_type = 'numeric' if target in profile['numeric_cols'] else 'categorical'
            unique_vals = df[target].nunique()
            # Model selection driven by the actual distribution/cardinality.
            if target_type == 'numeric':
                skewness = df[target].skew()
                if abs(skewness) > 2:
                    models = ['XGBoost Regressor', 'Random Forest', 'Gradient Boosting']
                    reason = f"Highly skewed data (skew: {skewness:.2f}) - tree models handle non-linearity"
                else:
                    models = ['Linear Regression', 'Ridge/Lasso', 'Random Forest']
                    reason = "Normal distribution - linear models suitable"
            else:
                if unique_vals == 2:
                    models = ['Logistic Regression', 'XGBoost Classifier', 'Random Forest']
                    reason = "Binary classification - balanced approach recommended"
                elif unique_vals <= 5:
                    models = ['Multinomial Logistic', 'XGBoost', 'Random Forest']
                    reason = f"{unique_vals} classes - multi-class classification"
                else:
                    models = ['XGBoost', 'Neural Networks', 'Random Forest']
                    reason = f"{unique_vals} classes - complex multi-class problem"
            ml_recommendations.append({
                'target': target,
                'type': target_type,
                'unique_vals': unique_vals,
                'models': models,
                'reason': reason,
                'features': [c for c in df.columns if c != target]
            })

        # Smart visualization recommendations
        viz_recs = []
        # Executive KPIs: latest value plus a crude trend arrow (correlation
        # of the column against row order).
        for col in profile['numeric_cols'][:4]:
            viz_recs.append(f"📊 KPI Card: {col} (Current: {df[col].iloc[-1]:.2f}, Trend: {'↗️' if df[col].corr(pd.Series(range(len(df)))) > 0 else '↘️'})")
        # Comparative analysis (only for low-cardinality categories)
        for cat in profile['categorical_cols'][:2]:
            for num in profile['numeric_cols'][:2]:
                if df[cat].nunique() <= 10:
                    avg_by_cat = df.groupby(cat)[num].mean().round(2).to_dict()
                    viz_recs.append(f"📊 Bar Chart: {num} by {cat} - Averages: {avg_by_cat}")
        # Correlation insights. BUGFIX: unstack() yields both (a, b) and
        # (b, a) for every pair, so the old head(3) listed each pair twice —
        # deduplicate on the unordered pair and keep up to 3 unique ones.
        if len(profile['numeric_cols']) >= 2:
            corr_matrix = df[profile['numeric_cols']].corr()
            ranked = corr_matrix.abs().unstack().sort_values(ascending=False)
            ranked = ranked[ranked < 1.0]  # drop self-correlations
            seen_pairs = set()
            for (var1, var2), corr_val in ranked.items():
                pair = frozenset((var1, var2))
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)
                viz_recs.append(f"🔗 Scatter Plot: {var1} vs {var2} (Correlation: {corr_val:.3f})")
                if len(seen_pairs) == 3:
                    break
        # Distribution insights: IQR outlier count + skew per numeric column.
        for col in profile['numeric_cols'][:3]:
            q1, q3 = df[col].quantile([0.25, 0.75])
            outliers = len(df[(df[col] < q1 - 1.5*(q3-q1)) | (df[col] > q3 + 1.5*(q3-q1))])
            viz_recs.append(f"📈 Histogram: {col} distribution (Outliers: {outliers}, Skew: {df[col].skew():.2f})")

        # Generate comprehensive report.
        # BUGFIX: quality score is the share of non-missing CELLS; the old
        # formula divided (rows - missing) by (rows * cols), which reported
        # e.g. 20% for a perfectly clean 5-column dataset.
        total_cells = len(df) * len(df.columns)
        total_missing = sum(profile['missing_data'].values())
        report = f"""# 🧠 INTELLIGENT DATASET ANALYSIS
## 📊 Dataset Intelligence
**Rows:** {profile['shape'][0]:,} | **Columns:** {profile['shape'][1]} | **Size:** {profile['memory_mb']} MB | **Duplicates:** {profile['duplicates']}
**Data Quality Score:** {((total_cells - total_missing) / total_cells * 100):.1f}%
## 🎯 SMART TARGET DETECTION
"""
        for i, (target, score) in enumerate(top_targets[:3], 1):
            target_info = f"**{i}. {target}** (Confidence: {score}/20)"
            if target in smart_targets:
                target_info += " ✅ **RECOMMENDED**"
            if target in profile['numeric_cols']:
                target_info += f" | Values: {df[target].min():.2f} - {df[target].max():.2f} | Mean: {df[target].mean():.2f}"
            else:
                top_values = df[target].value_counts().head(3).to_dict()
                target_info += f" | Top categories: {top_values}"
            report += f"{target_info}\n"

        report += "\n## 🤖 ML MODEL RECOMMENDATIONS\n"
        for i, rec in enumerate(ml_recommendations, 1):
            report += f"""### Scenario {i}: Predict `{rec['target']}`
**Problem Type:** {rec['type'].upper()} {'REGRESSION' if rec['type'] == 'numeric' else 'CLASSIFICATION'}
**AI Reasoning:** {rec['reason']}
**Recommended Models:**
1. 🥇 **{rec['models'][0]}** (Primary choice)
2. 🥈 **{rec['models'][1]}** (Alternative)
3. 🥉 **{rec['models'][2]}** (Backup option)
**Features Available:** {len(rec['features'])} variables
**Data Split:** {int(len(df)*0.8):,} train / {int(len(df)*0.2):,} test
"""

        # Data preprocessing recommendations
        missing_cols = [col for col, missing in profile['missing_data'].items() if missing > 0]
        high_cardinality = [col for col in profile['categorical_cols'] if df[col].nunique() > 20]
        report += f"""## ⚙️ PREPROCESSING PIPELINE
**Missing Data:** {len(missing_cols)} columns need attention
"""
        if missing_cols:
            for col in missing_cols[:5]:
                missing_pct = (profile['missing_data'][col] / len(df)) * 100
                # >50% missing: drop; otherwise median (numeric) or mode fill.
                strategy = "Drop column" if missing_pct > 50 else ("Median fill" if col in profile['numeric_cols'] else "Mode fill")
                report += f"- `{col}`: {missing_pct:.1f}% missing → {strategy}\n"
        if high_cardinality:
            report += f"**High Cardinality:** {len(high_cardinality)} categorical columns need encoding\n"
            for col in high_cardinality[:3]:
                report += f"- `{col}`: {df[col].nunique()} categories → Target encoding recommended\n"

        report += "\n## 📊 SMART VISUALIZATIONS\n"
        for viz in viz_recs:
            report += f"{viz}\n"

        # Performance predictions
        sample_size_category = "Large" if len(df) > 50000 else ("Medium" if len(df) > 5000 else "Small")
        feature_ratio = len(df.columns) / len(df)
        report += f"""
## 🚀 IMPLEMENTATION ROADMAP
### Phase 1: Data Preparation (Week 1)
- Handle {len(missing_cols)} missing data issues
- Encode {len(profile['categorical_cols'])} categorical variables
- Feature scaling for {len(profile['numeric_cols'])} numeric variables
### Phase 2: Model Development (Week 2-3)
- {sample_size_category} dataset → Expected training time: {'Minutes' if len(df) < 10000 else 'Hours'}
- Feature importance analysis using top recommended models
- Cross-validation with {5 if len(df) > 1000 else 3}-fold strategy
### Phase 3: Production (Week 4)
- Model deployment pipeline
- Performance monitoring dashboard
- A/B testing framework
## 💡 BUSINESS IMPACT PREDICTION
**Model Accuracy Expectation:** {85 if len(df) > 10000 and len(missing_cols) < 3 else 75}%+
**ROI Timeline:** 3-6 months
**Automation Potential:** {min(90, len(profile['numeric_cols']) * 10)}% of manual analysis
"""
        return report
    except Exception as e:
        return f"Analysis failed: {str(e)}\nEnsure file is valid CSV/Excel format."
def smart_ba_chat(message, history):
    """Intelligent BA assistant with contextual responses.

    Routes the user's message to one of four canned analysis templates
    (requirements, process, visualization, or generic business analysis)
    based on simple keyword matching, then fills the chosen template from
    the message text. ``history`` is accepted for the chat interface but
    not consulted.
    """
    lowered = message.lower()
    tokens = message.split()

    def _mentions(*terms):
        # True when any of the given keywords appears in the message.
        return any(term in lowered for term in terms)

    if _mentions('requirement', 'functional', 'specification'):
        return f"""## 📋 Smart Requirements Analysis for: "{message}"
### 🎯 Identified Requirements Type
**Primary:** {'Functional' if 'function' in lowered else 'Business'} Requirements
**Complexity:** {'High' if len(tokens) > 20 else 'Medium'}
### 📝 Generated Requirements Framework
1. **Must Have (P1)**
- Core functionality: {message.split('.')[0] if '.' in message else message[:50]}
- User authentication and authorization
- Data validation and error handling
2. **Should Have (P2)**
- Reporting and analytics dashboard
- Export/import capabilities
- Audit trail functionality
3. **Could Have (P3)**
- Advanced filtering options
- Mobile responsiveness
- Integration APIs
### ✅ Acceptance Criteria Template
```
Given: User has appropriate permissions
When: User performs {tokens[-1] if tokens else 'action'}
Then: System should respond within 3 seconds
And: Changes should be logged for audit
```
### 🔍 Next Steps
- [ ] Stakeholder validation workshop
- [ ] Technical feasibility assessment
- [ ] Resource estimation and timeline
"""

    if _mentions('process', 'workflow', 'procedure'):
        return f"""## 🔄 Process Analysis for: "{message}"
### 📊 Current State Assessment
**Process Complexity:** {'High' if 'complex' in lowered or 'multiple' in lowered else 'Medium'}
**Stakeholders Involved:** {'Multiple departments' if 'department' in lowered else 'Single team'}
### 🎯 Identified Pain Points
- Manual handoffs and delays
- Lack of visibility and tracking
- Inconsistent execution
- No performance metrics
### 🚀 Recommended Solution
**Automation Level:** {80 if 'automate' in lowered else 60}%
**Expected Efficiency Gain:** {40 if 'improve' in lowered else 25}%
**Implementation Timeline:** {'3-6 months' if 'large' in lowered else '6-12 weeks'}
### 📈 Process Optimization Steps
1. **Map Current State** (Week 1-2)
2. **Identify Bottlenecks** (Week 3)
3. **Design Future State** (Week 4-5)
4. **Pilot Implementation** (Week 6-8)
5. **Full Rollout** (Week 9-12)
### 🎯 Success Metrics
- Process cycle time reduction: 50%+
- Error rate reduction: 80%+
- User satisfaction score: 4.5+/5
"""

    if _mentions('dashboard', 'report', 'visual', 'chart'):
        return f"""## 📊 Visualization Strategy for: "{message}"
### 🎨 Smart Chart Recommendations
**Data Type Detected:** {'Time-series' if 'time' in lowered or 'trend' in lowered else 'Categorical'}
**Audience Level:** {'Executive' if 'executive' in lowered else 'Operational'}
### 📈 Recommended Visualizations
1. **KPI Dashboard**
- Primary metrics with trend indicators
- Color-coded status (Red/Yellow/Green)
- Real-time data refresh
2. **Comparative Analysis**
- Bar charts for category comparison
- Heat maps for correlation analysis
- Scatter plots for relationship insights
3. **Trend Analysis**
- Line charts for time-based data
- Area charts for cumulative metrics
- Waterfall charts for variance analysis
### 🎯 Dashboard Layout Strategy
```
┌──────────────────────────────────────────┐
│ Executive Summary KPIs │
├─────────────────┬────────────────────────
│ Primary Chart │ Filter Controls │
│ (60% width) │ & Drill-downs │
├─────────────────┴────────────────────────
│ Supporting Analytics │
└──────────────────────────────────────────┘
```
### 💡 Business Intelligence Features
- Interactive filtering and drill-down
- Automated insights and anomaly detection
- Mobile-responsive design
- Export capabilities (PDF, Excel, PowerPoint)
"""

    # Fallback: generic intelligent response for anything uncategorized.
    return f"""## 🧠 Smart Business Analysis for: "{message}"
### 🎯 Analysis Summary
**Domain:** {'Technology' if any(tech in lowered for tech in ['system', 'software', 'digital', 'ai', 'ml']) else 'Business Operations'}
**Complexity:** {'High' if len(tokens) > 15 else 'Medium'}
**Urgency:** {'High' if any(urgent in lowered for urgent in ['urgent', 'asap', 'critical']) else 'Normal'}
### 📊 Key Considerations
1. **Stakeholder Impact Analysis**
- Primary users: Business operations team
- Secondary users: Management and IT
- Decision makers: Department heads
2. **Success Criteria Definition**
- Quantitative: ROI > 20%, Time savings > 30%
- Qualitative: User satisfaction, Process efficiency
- Timeline: {'6-8 weeks' if 'quick' in lowered else '3-4 months'}
3. **Risk Assessment**
- Technical: {'Medium' if 'technical' in lowered else 'Low'}
- Business: {'High' if 'change' in lowered else 'Medium'}
- Resource: Based on scope and timeline
### 🚀 Recommended Action Plan
**Phase 1:** Requirement gathering and stakeholder alignment
**Phase 2:** Solution design and prototype development
**Phase 3:** Implementation and testing
**Phase 4:** Deployment and change management
### 💡 Next Steps
- Schedule stakeholder interviews
- Define detailed acceptance criteria
- Create project timeline and milestones
- Identify potential risks and mitigation strategies
"""
# Streamlined Gradio Interface: two tabs — a chat assistant and a
# dataset-analysis upload — wired to the two analysis functions above.
with gr.Blocks(title="Smart Business Analyst GPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Smart Business Analyst GPT\n## AI-Powered Dataset Analysis & Business Intelligence")
    with gr.Tabs():
        with gr.TabItem("💬 Intelligent BA Assistant"):
            chatbot = gr.Chatbot(height=500, label="Smart BA Assistant")
            msg = gr.Textbox(placeholder="Describe your business challenge...", label="Your Question", lines=2)
            with gr.Row():
                submit = gr.Button("🚀 Get Smart Analysis", variant="primary")
                clear = gr.Button("Clear")
        with gr.TabItem("📊 Complete Dataset Analysis"):
            gr.Markdown("### Upload your dataset for comprehensive AI analysis")
            file_upload = gr.File(label="Upload CSV/Excel", file_types=[".csv", ".xlsx", ".xls"])
            analyze_btn = gr.Button("🧠 Complete Analysis", variant="primary", size="lg")
            analysis_output = gr.Textbox(label="Intelligent Analysis Report", lines=30, show_copy_button=True)

    # Handle one chat turn: append the (user, assistant) pair to the
    # transcript and clear the input box.
    def _on_message(user_text, chat_history):
        reply = smart_ba_chat(user_text, chat_history)
        chat_history.append((user_text, reply))
        return "", chat_history

    msg.submit(_on_message, [msg, chatbot], [msg, chatbot])
    submit.click(_on_message, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: [], None, chatbot)
    analyze_btn.click(smart_dataset_analysis, file_upload, analysis_output)

if __name__ == "__main__":
    demo.launch()