| import gradio as gr |
| import pandas as pd |
| import aiohttp |
| import asyncio |
| import json |
| import os |
| import numpy as np |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from plotly.subplots import make_subplots |
| from typing import Optional, Tuple, Dict, Any, List |
| import logging |
| from datetime import datetime, timedelta |
| import re |
| from jinja2 import Template |
| import markdown |
| import zipfile |
| import io |
| import base64 |
| from scipy import stats |
| import seaborn as sns |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
| |
# Process-wide logging setup: INFO threshold with a
# "<time> - <logger name> - <level> - <message>" record layout.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-scoped logger used by the analyzer methods below.
logger = logging.getLogger(__name__)
|
|
| class AdvancedDataAnalyzer: |
| def __init__(self): |
| self.api_base_url = "https://llm.chutes.ai/v1/chat/completions" |
| self.max_file_size = 100 * 1024 * 1024 |
| self.conversation_history = [] |
| self.current_df = None |
| self.current_charts = None |
| self.analysis_cache = {} |
| self.supported_formats = ['.csv', '.xlsx', '.xls', '.json', '.parquet', '.tsv'] |
| |
def validate_api_key(self, api_key: str) -> Tuple[bool, str]:
    """Run a cheap format check on an API key.

    Returns:
        ``(is_valid, message)``. A key passes when, after trimming
        whitespace, it has at least 10 characters and either starts with
        ``sk-``, ``pk-`` or ``Bearer `` or is longer than 20 characters.
    """
    token = api_key.strip() if api_key else ""
    if len(token) < 10:
        return False, "API key must be at least 10 characters long"

    has_known_prefix = token.startswith(('sk-', 'pk-', 'Bearer '))
    if not has_known_prefix and len(token) <= 20:
        return False, "API key format appears invalid"

    return True, "Valid API key format"
| |
def validate_file(self, file) -> Tuple[bool, str]:
    """Check that an uploaded file exists, is non-empty, small enough and supported.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object exposing the path as ``.name``.

    Returns:
        ``(is_valid, message)``. Never raises: unexpected errors (missing
        file, object without ``.name`` ...) are reported in the message.
    """
    if not file:
        return False, "No file uploaded"

    try:
        # Accept a plain path as well as a tempfile-like wrapper with `.name`.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name

        file_size = os.path.getsize(path)
        if file_size > self.max_file_size:
            return False, f"File too large. Maximum size: {self.max_file_size // (1024*1024)}MB"

        if file_size == 0:
            return False, "File is empty"

        file_extension = os.path.splitext(path)[1].lower()
        if file_extension not in self.supported_formats:
            return False, f"Unsupported format. Supported: {', '.join(self.supported_formats)}"

        return True, "File validation passed"

    except Exception as e:
        return False, f"File validation error: {str(e)}"
| |
async def analyze_with_chutes(self, api_token: str, data_summary: str, user_question: str = None, analysis_type: str = "comprehensive") -> str:
    """Request an AI analysis of ``data_summary`` from the chat-completions API.

    The request is sent with ``stream=True``; the ``data:`` lines of the
    response are parsed and their ``delta.content`` fragments concatenated.
    This method does not raise: every failure path returns a user-facing
    markdown message instead.

    Args:
        api_token: API token. Surrounding whitespace and an optional leading
            ``"Bearer "`` prefix are removed before the header is built.
        data_summary: Text summary of the dataset, embedded in the prompt.
        user_question: Optional question. It is the main question for
            ``analysis_type="question"`` and is appended to the prompt for
            the other analysis types.
        analysis_type: ``"comprehensive"``, ``"quick"`` or ``"question"``.
            Unknown values, and ``"question"`` without a question, fall back
            to the comprehensive prompt.

    Returns:
        The complete response text, or an error message string.

    Side effects:
        On success appends ``{"timestamp", "question", "response"}`` (response
        truncated to 500 characters) to ``self.conversation_history``.
    """
    # validate_api_key accepts keys that already start with "Bearer "; strip
    # that prefix so the header is not sent as "Bearer Bearer <token>".
    token = api_token.strip()
    if token.startswith("Bearer "):
        token = token[len("Bearer "):].strip()

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "User-Agent": "SmartDataAnalyzer/2.0"
    }

    # A "question" analysis without a question would embed the text "None"
    # in the prompt; use the general-purpose prompt instead.
    if analysis_type == "question" and not user_question:
        analysis_type = "comprehensive"

    if analysis_type == "quick":
        prompt = f"""Provide a quick but insightful analysis of this dataset:
{data_summary}

Focus on:
- Top 3 most important findings
- Any obvious patterns or anomalies
- Key business insights
- Quick recommendations

Keep it concise but valuable."""
    elif analysis_type == "question":
        prompt = f"""Based on this dataset:
{data_summary}

User's specific question: {user_question}

Provide a detailed, data-driven answer with:
- Direct answer to the question
- Supporting evidence from the data
- Additional related insights
- Specific recommendations
- Follow-up questions to consider"""
    else:
        prompt = f"""You are a senior data scientist with 10+ years of experience. Analyze this dataset comprehensively:

{data_summary}

Provide a thorough analysis with:
1. **Executive Summary**: 3-4 key takeaways for stakeholders
2. **Statistical Insights**: Important numbers, distributions, and what they reveal
3. **Pattern Recognition**: Trends, correlations, seasonality, anomalies
4. **Data Quality Assessment**: Completeness, accuracy, consistency issues
5. **Business Intelligence**: Actionable insights and opportunities
6. **Risk Analysis**: Potential data quality issues or business risks
7. **Recommendations**: Specific, prioritized next steps

Use bullet points, specific numbers, and clear explanations."""

    if user_question and analysis_type != "question":
        prompt += f"\n\nUser's additional question: {user_question}"

    body = {
        "model": "openai/gpt-oss-20b",
        "messages": [
            {
                "role": "system",
                "content": """You are an expert data scientist and business analyst. Provide clear, actionable insights with specific data points. Use markdown formatting for better readability. Always include:
- Specific numbers and percentages
- Clear section headers
- Bullet points for key insights
- Bold text for important findings
- Recommendations with priority levels"""
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "stream": True,
        "max_tokens": 4000,
        "temperature": 0.3,
        "top_p": 0.9
    }

    try:
        timeout = aiohttp.ClientTimeout(total=45)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(self.api_base_url, headers=headers, json=body) as response:
                if response.status == 401:
                    return "β **Authentication Error**: Invalid API key. Please verify your Chutes API token."
                elif response.status == 429:
                    return "β³ **Rate Limit Exceeded**: Too many requests. Please wait 30 seconds and try again."
                elif response.status == 503:
                    return "π§ **Service Unavailable**: API temporarily unavailable. Please try again later."
                elif response.status != 200:
                    error_text = await response.text()
                    return f"β **API Error {response.status}**: {error_text[:200]}"

                fragments = []
                async for raw_line in response.content:
                    # A single undecodable byte should not discard the whole answer.
                    line = raw_line.decode("utf-8", errors="replace").strip()
                    if not line.startswith("data:"):
                        continue
                    data = line[len("data:"):].strip()
                    if data == "[DONE]":
                        break
                    try:
                        chunk_data = json.loads(data)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(chunk_data, dict):
                        continue
                    choices = chunk_data.get("choices") or []
                    if not choices:
                        continue
                    # "delta" and "content" may be absent or null in some chunks.
                    delta = choices[0].get("delta") or {}
                    content = delta.get("content") or ""
                    if content:
                        fragments.append(content)

                full_response = "".join(fragments)
                if not full_response:
                    return "β οΈ **Empty Response**: No analysis received. Please try again."

                self.conversation_history.append({
                    "timestamp": datetime.now(),
                    "question": user_question or "General Analysis",
                    "response": full_response[:500] + "..." if len(full_response) > 500 else full_response
                })

                return full_response

    except asyncio.TimeoutError:
        return "β° **Timeout Error**: Analysis took too long. Try with a smaller file or simpler question."
    except aiohttp.ClientError as e:
        logger.error("HTTP Error: %s", e)
        return "π **Connection Error**: Unable to reach API. Check your internet connection."
    except Exception as e:
        logger.error("Unexpected API Error: %s", e)
        return f"β **Unexpected Error**: {str(e)}"
|
|
def process_file(self, file_path: str, sample_size: int = None) -> Tuple[pd.DataFrame, str, str]:
    """Load a data file, clean it and build its summary and charts.

    Steps: read the file according to its extension, normalise column
    names, drop fully-empty rows and columns, optionally down-sample,
    run ``self.auto_detect_types``, store the frame in ``self.current_df``
    and call ``self.generate_comprehensive_summary`` and
    ``self.generate_advanced_visualizations``.

    Args:
        file_path: Path to a ``.csv``, ``.tsv``, ``.xlsx``, ``.xls``,
            ``.json`` or ``.parquet`` file.
        sample_size: If set and the cleaned frame has more rows, a random
            sample of this many rows (``random_state=42``) is used.

    Returns:
        ``(dataframe, data_summary, charts_html)``.

    Raises:
        Exception: Any failure, re-raised as
            ``"Error processing file: <reason>"`` with the cause chained.
    """
    try:
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            # Try encoding/separator combinations in order. The first parse
            # with more than one column wins; a single-column parse is kept
            # as a fallback so genuine one-column files are still accepted.
            df = None
            single_column_fallback = None
            for encoding in ('utf-8', 'latin-1', 'cp1252', 'iso-8859-1'):
                for sep in (',', ';', '\t', '|'):
                    try:
                        candidate = pd.read_csv(file_path, encoding=encoding, sep=sep, low_memory=False)
                    except (UnicodeDecodeError, pd.errors.ParserError):
                        continue
                    if candidate.shape[1] > 1:
                        df = candidate
                        break
                    if single_column_fallback is None:
                        single_column_fallback = candidate
                if df is not None:
                    break
            if df is None:
                df = single_column_fallback
            if df is None:
                raise ValueError("Could not decode CSV file with any supported encoding/separator")

        elif file_extension == '.tsv':
            df = pd.read_csv(file_path, sep='\t', encoding='utf-8')

        elif file_extension in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, engine='openpyxl' if file_extension == '.xlsx' else 'xlrd')

        elif file_extension == '.json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                df = pd.json_normalize(data)
            else:
                try:
                    df = pd.DataFrame(data)
                except ValueError:
                    # e.g. a flat object of scalars: treat it as one record.
                    df = pd.json_normalize(data)

        elif file_extension == '.parquet':
            df = pd.read_parquet(file_path)

        else:
            raise ValueError(f"Unsupported file format: {file_extension or 'no extension'}")

        # Normalise headers: string type, trimmed, inner whitespace collapsed.
        df.columns = df.columns.astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)

        # Drop rows and columns that contain no data at all.
        df = df.dropna(how='all').dropna(axis=1, how='all')

        original_size = len(df)
        if sample_size and len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)
            logger.info("Sampled %s rows from %s total rows", sample_size, original_size)

        df = self.auto_detect_types(df)

        self.current_df = df
        data_summary = self.generate_comprehensive_summary(df, original_size)
        charts_html = self.generate_advanced_visualizations(df)

        return df, data_summary, charts_html

    except Exception as e:
        logger.error("File processing error: %s", e)
        raise Exception(f"Error processing file: {str(e)}") from e
| |
def auto_detect_types(self, df: pd.DataFrame) -> pd.DataFrame:
    """Convert text columns to datetime, numeric or category where the data supports it.

    For every object/string column, in order:

    1. If the column name contains a date-like keyword, parse it as
       datetimes and keep the result when more than 70% of the non-missing
       values parse.
    2. Otherwise strip currency symbols, thousands separators, percent
       signs and whitespace, and keep the numeric conversion when more than
       70% of the non-missing values are numbers.
    3. Otherwise convert to ``category`` when the column has fewer than 50
       distinct values covering less than 10% of the rows.

    The frame is modified in place and also returned.

    Args:
        df: Frame whose text columns should be inspected.

    Returns:
        The same ``df`` object with converted columns.
    """
    row_count = len(df)
    if row_count == 0:
        # Nothing to infer from, and the ratios below would divide by zero.
        return df

    date_keywords = ('date', 'time', 'created', 'updated', 'timestamp')
    min_valid_ratio = 0.7

    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        if not (pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)):
            continue

        # Ratios are measured against non-missing values so that gaps do not
        # count as "not a date" / "not a number".
        present = int(series.notna().sum())

        if present and any(keyword in str(col).lower() for keyword in date_keywords):
            try:
                parsed = pd.to_datetime(series, errors='coerce')
            except (ValueError, TypeError, OverflowError):
                parsed = None
            if parsed is not None and parsed.notna().sum() / present > min_valid_ratio:
                df[col] = parsed
                continue
            # Not really a date column: fall through to the other checks.

        if present:
            # Only formatting characters are removed; letters are kept so that
            # free text such as "12 Main St" is not mistaken for a number.
            cleaned = series.astype(str).str.replace(r'[$€£¥₹%,\s]', '', regex=True)
            numeric_col = pd.to_numeric(cleaned.where(series.notna()), errors='coerce')
            if numeric_col.notna().sum() / present > min_valid_ratio:
                df[col] = numeric_col
                continue

        try:
            distinct = series.nunique()
        except TypeError:
            # Unhashable cells (lists/dicts) cannot be counted or categorised.
            continue
        if distinct / row_count < 0.1 and distinct < 50:
            df[col] = series.astype('category')

    return df
| |
def generate_comprehensive_summary(self, df: pd.DataFrame, original_size: int = None) -> str:
    """Build a markdown report describing the structure and quality of ``df``.

    Sections, in order: header (size, memory, density), column type
    distribution, missing-value report, numerical analysis (first 8 numeric
    columns), categorical analysis (first 8 object/category columns),
    temporal analysis (first 3 datetime columns), profiling (duplicates and
    correlations above 0.7 in absolute value) and the first three rows.

    Empty frames, all-missing columns and non-integer indexes are handled
    without raising.

    Args:
        df: Frame to describe.
        original_size: Row count before sampling; reported when it differs
            from ``len(df)``.

    Returns:
        The report as a single newline-joined markdown string.
    """
    summary = []
    row_count, col_count = df.shape
    total_cells = row_count * col_count

    missing_data = df.isnull().sum()
    total_missing = int(missing_data.sum())

    # --- Header ---------------------------------------------------------
    summary.append("# π Advanced Dataset Analysis Report")
    summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    summary.append(f"**Dataset Size**: {row_count:,} rows Γ {col_count} columns")
    if original_size and original_size != row_count:
        summary.append(f"**Original Size**: {original_size:,} rows (sampled for performance)")

    memory_usage = df.memory_usage(deep=True).sum() / 1024**2
    summary.append(f"**Memory Usage**: {memory_usage:.2f} MB")
    density = (1 - total_missing / total_cells) if total_cells else 0.0
    summary.append(f"**Data Density**: {density:.1%} complete\n")

    # --- Column types ---------------------------------------------------
    summary.append("## π Column Type Distribution:")
    for dtype, count in df.dtypes.value_counts().items():
        percentage = (count / col_count * 100)
        summary.append(f"- **{dtype}**: {count} columns ({percentage:.1f}%)")

    # --- Missing values -------------------------------------------------
    missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
    if len(missing_summary) > 0:
        # Missing values imply at least one row, so row_count is non-zero here.
        missing_pct = (missing_data / row_count * 100).round(2)
        summary.append("\n## β οΈ Data Quality Issues:")
        summary.append(f"**Total Missing Values**: {total_missing:,} ({total_missing/total_cells*100:.2f}% of all data)")

        for col, count in missing_summary.head(10).items():
            pct = missing_pct[col]
            severity = "π΄ Critical" if pct > 50 else "π‘ Moderate" if pct > 20 else "π’ Minor"
            summary.append(f"- **{col}**: {count:,} missing ({pct}%) - {severity}")
    else:
        summary.append("\n## ✅ Data Quality: Perfect! No missing values detected")

    # --- Numeric columns ------------------------------------------------
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        summary.append(f"\n## π Numerical Analysis ({len(numeric_cols)} columns):")

        for col in numeric_cols[:8]:
            column = df[col]
            stats_data = column.describe()
            skewness = stats.skew(column.dropna().astype(float))

            q1 = stats_data['25%']
            q3 = stats_data['75%']
            iqr = q3 - q1
            outlier_mask = (column < (q1 - 1.5 * iqr)) | (column > (q3 + 1.5 * iqr))
            outliers = int(outlier_mask.sum())
            outlier_pct = outliers / row_count * 100 if row_count else 0.0

            # Skew is NaN for empty or constant columns; do not label those.
            if np.isnan(skewness):
                distribution = "Undetermined"
            elif skewness >= 0.5:
                distribution = "Right-skewed"
            elif skewness <= -0.5:
                distribution = "Left-skewed"
            else:
                distribution = "Normal"

            summary.append(f"- **{col}**:")
            summary.append(f" - Range: {stats_data['min']:.2f} to {stats_data['max']:.2f}")
            summary.append(f" - Central: ΞΌ={stats_data['mean']:.2f}, median={stats_data['50%']:.2f}")
            summary.append(f" - Spread: Ο={stats_data['std']:.2f}, IQR={iqr:.2f}")
            summary.append(f" - Shape: {distribution} (skew={skewness:.2f})")
            summary.append(f" - Outliers: {outliers} ({outlier_pct:.1f}%)")

    # --- Categorical columns --------------------------------------------
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        summary.append(f"\n## π Categorical Analysis ({len(categorical_cols)} columns):")

        for col in categorical_cols[:8]:
            summary.append(f"- **{col}**:")
            try:
                value_counts = df[col].value_counts()
                unique_count = df[col].nunique()
            except TypeError:
                summary.append(" - Skipped: values are not hashable (e.g. nested lists or dicts)")
                continue

            total_count = int(df[col].notna().sum())
            if total_count == 0:
                # All values missing (possible after sampling): nothing to rank.
                summary.append(" - Unique values: 0 (no non-missing values)")
                continue

            cardinality_ratio = unique_count / total_count
            if cardinality_ratio > 0.9:
                cardinality = "π΄ Very High (likely ID field)"
            elif cardinality_ratio > 0.5:
                cardinality = "π‘ High"
            elif cardinality_ratio > 0.1:
                cardinality = "π’ Medium"
            else:
                cardinality = "π΅ Low"

            most_common = value_counts.iloc[0]
            most_common_pct = most_common / total_count * 100

            summary.append(f" - Unique values: {unique_count:,} ({cardinality})")
            summary.append(f" - Most frequent: '{value_counts.index[0]}' ({most_common:,} times, {most_common_pct:.1f}%)")

            if len(value_counts) > 1:
                entropy = stats.entropy(value_counts.values)
                summary.append(f" - Diversity index: {entropy:.2f}")

    # --- Datetime columns -----------------------------------------------
    datetime_cols = df.select_dtypes(include=['datetime64']).columns
    if len(datetime_cols) > 0:
        summary.append(f"\n## 📅 Temporal Analysis ({len(datetime_cols)} columns):")
        for col in datetime_cols[:3]:
            earliest, latest = df[col].min(), df[col].max()
            if pd.isna(earliest) or pd.isna(latest):
                summary.append(f"- **{col}**: no valid timestamps")
                continue
            date_range = latest - earliest
            summary.append(f"- **{col}**: {earliest} to {latest} (span: {date_range.days} days)")

    # --- Profiling ------------------------------------------------------
    summary.append("\n## π Advanced Data Profiling:")

    try:
        duplicate_rows = int(df.duplicated().sum())
    except TypeError:
        # Rows holding unhashable cells cannot be compared for duplicates.
        duplicate_rows = None
    if duplicate_rows is None:
        summary.append("- **Duplicate rows**: not computed (unhashable values present)")
    else:
        duplicate_pct = duplicate_rows / row_count * 100 if row_count else 0.0
        summary.append(f"- **Duplicate rows**: {duplicate_rows:,} ({duplicate_pct:.2f}%)")

    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        names = list(corr_matrix.columns)
        high_corr_pairs = []
        for i, first in enumerate(names):
            for j in range(i + 1, len(names)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:
                    high_corr_pairs.append((first, names[j], corr_val))

        if high_corr_pairs:
            summary.append("- **Strong correlations detected**:")
            for col1, col2, corr_val in sorted(high_corr_pairs, key=lambda x: abs(x[2]), reverse=True)[:5]:
                summary.append(f" - {col1} β {col2}: {corr_val:.3f}")

    # --- Sample rows ----------------------------------------------------
    summary.append("\n## π Enhanced Data Sample (First 3 Rows):")
    # Number rows by position: index labels may be non-numeric or, after
    # sampling, arbitrary.
    for position, (_, row) in enumerate(df.head(3).iterrows(), start=1):
        summary.append(f"\n**Row {position}:**")
        for col, val in row.items():
            if pd.api.types.is_scalar(val) and pd.isna(val):
                formatted_val = "β Missing"
            elif isinstance(val, (bool, np.bool_)):
                formatted_val = str(val)
            elif isinstance(val, (float, np.floating)):
                formatted_val = f"{val:,.2f}"
            elif isinstance(val, (int, np.integer)):
                formatted_val = f"{val:,}"
            else:
                text = str(val)
                formatted_val = text[:50] + ("..." if len(text) > 50 else "")
            summary.append(f" - **{col}**: {formatted_val}")

    return "\n".join(summary)
| |
def generate_advanced_visualizations(self, df: pd.DataFrame) -> str:
    """Render the dashboard charts for ``df`` as one HTML string.

    Sections are built independently (missing data, correlations,
    distributions, categoricals, time series, overview, quality score), so
    a failure in one section is logged and reported inline without
    discarding the charts that did render. The list of HTML pieces is also
    stored in ``self.current_charts``.

    NOTE(review): each figure is serialised with the same ``to_html``
    arguments as before; confirm whether the consumer expects full
    documents or fragments.

    Args:
        df: Frame to visualise.

    Returns:
        Newline-joined HTML, or a placeholder paragraph when no section
        produced any output.
    """
    charts_html = []

    try:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        datetime_cols = df.select_dtypes(include=['datetime64']).columns
    except Exception as e:
        logger.error("Chart generation error: %s", e)
        self.current_charts = charts_html
        return f"<p>β Advanced chart generation failed: {str(e)}</p>"

    section_builders = [
        ("missing data", lambda: self._chart_missing_data(df)),
        ("correlations", lambda: self._chart_correlations(df, numeric_cols)),
        ("distributions", lambda: self._chart_distributions(df, numeric_cols)),
        ("categoricals", lambda: self._chart_categoricals(df, categorical_cols)),
        ("time series", lambda: self._chart_time_series(df, datetime_cols, numeric_cols)),
        ("overview", lambda: self._chart_overview(df, numeric_cols, categorical_cols, datetime_cols)),
        ("quality score", lambda: self._chart_quality_score(df)),
    ]

    for label, build in section_builders:
        try:
            charts_html.extend(build())
        except Exception as e:
            logger.error("Chart generation error (%s): %s", label, e)
            charts_html.append(f"<p>β Advanced chart generation failed: {str(e)}</p>")

    self.current_charts = charts_html
    return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"

def _chart_missing_data(self, df: pd.DataFrame) -> List[str]:
    """Return the missing-value count/percentage bar charts, or [] when nothing is missing."""
    missing_data = df.isnull().sum()
    if missing_data.sum() <= 0:
        return []

    missing_pct = (missing_data / len(df) * 100).round(2)

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=("Missing Values Count", "Missing Values Percentage"),
        specs=[[{"secondary_y": False}, {"secondary_y": False}]]
    )
    fig.add_trace(
        go.Bar(x=missing_data.index, y=missing_data.values, name="Count",
               marker_color='rgb(255, 99, 132)'),
        row=1, col=1
    )
    fig.add_trace(
        go.Bar(x=missing_pct.index, y=missing_pct.values, name="Percentage",
               marker_color='rgb(255, 159, 64)'),
        row=1, col=2
    )
    fig.update_layout(
        title_text="π Comprehensive Missing Data Analysis",
        title_x=0.5,
        height=500,
        showlegend=False
    )
    fig.update_xaxes(tickangle=-45)

    return [
        "<h3>π Data Quality Analysis</h3>",
        fig.to_html(include_plotlyjs='cdn', div_id="missing_data_analysis"),
    ]

def _chart_correlations(self, df: pd.DataFrame, numeric_cols) -> List[str]:
    """Return a lower-triangle correlation heatmap when at least two numeric columns exist."""
    if len(numeric_cols) <= 1:
        return []

    corr_matrix = df[numeric_cols].corr()

    # Hide the diagonal and upper triangle, which only repeat information.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    corr_matrix_masked = corr_matrix.mask(mask)

    fig = px.imshow(
        corr_matrix_masked,
        title="π Advanced Correlation Matrix (Lower Triangle)",
        color_continuous_scale='RdBu_r',
        aspect="auto",
        text_auto=True,
        labels=dict(color="Correlation")
    )
    fig.update_layout(
        height=600,
        title_x=0.5,
        font=dict(size=10)
    )

    return [
        "<h3>π Statistical Relationships</h3>",
        fig.to_html(include_plotlyjs='cdn', div_id="correlation_matrix"),
    ]

def _chart_distributions(self, df: pd.DataFrame, numeric_cols) -> List[str]:
    """Return histogram + box-plot figures for the first four numeric columns."""
    if len(numeric_cols) == 0:
        return []

    pieces = ["<h3>π Statistical Distributions</h3>"]

    for i, col in enumerate(numeric_cols[:4]):
        values = df[col].dropna()

        fig = make_subplots(
            rows=2, cols=1,
            subplot_titles=(f"Distribution of {col}", f"Box Plot - {col}"),
            vertical_spacing=0.12
        )
        fig.add_trace(
            go.Histogram(x=values, name="Frequency",
                         marker_color='rgb(75, 192, 192)', opacity=0.7,
                         nbinsx=30),
            row=1, col=1
        )
        fig.add_trace(
            go.Box(y=values, name="Distribution",
                   marker_color='rgb(153, 102, 255)'),
            row=2, col=1
        )

        # Mark mean and median on the histogram panel.
        mean_val = df[col].mean()
        median_val = df[col].median()
        fig.add_vline(x=mean_val, line_dash="dash", line_color="red",
                      annotation_text=f"Mean: {mean_val:.2f}", row=1, col=1)
        fig.add_vline(x=median_val, line_dash="dot", line_color="blue",
                      annotation_text=f"Median: {median_val:.2f}", row=1, col=1)

        fig.update_layout(
            height=600,
            title_text=f"π Statistical Analysis: {col}",
            title_x=0.5,
            showlegend=False
        )

        pieces.append(fig.to_html(include_plotlyjs='cdn', div_id=f"distribution_{i}"))

    return pieces

def _chart_categoricals(self, df: pd.DataFrame, categorical_cols) -> List[str]:
    """Return bar + pie figures for the first three categorical columns with at most 25 distinct values."""
    if len(categorical_cols) == 0:
        return []

    pieces = ["<h3>π Categorical Data Insights</h3>"]

    for i, col in enumerate(categorical_cols[:3]):
        if df[col].nunique() > 25:
            continue

        value_counts = df[col].value_counts().head(15)

        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=(f"Top Values - {col}", f"Distribution - {col}"),
            specs=[[{"type": "bar"}, {"type": "pie"}]]
        )
        fig.add_trace(
            go.Bar(x=value_counts.values, y=value_counts.index,
                   orientation='h', name="Count",
                   marker_color='rgb(54, 162, 235)'),
            row=1, col=1
        )

        top_10 = value_counts.head(10)
        fig.add_trace(
            go.Pie(labels=top_10.index, values=top_10.values,
                   name="Distribution"),
            row=1, col=2
        )

        fig.update_layout(
            height=500,
            title_text=f"π Category Analysis: {col}",
            title_x=0.5,
            showlegend=False
        )

        pieces.append(fig.to_html(include_plotlyjs='cdn', div_id=f"categorical_{i}"))

    return pieces

def _chart_time_series(self, df: pd.DataFrame, datetime_cols, numeric_cols) -> List[str]:
    """Return monthly mean/sum charts of the first numeric column over the first datetime column."""
    if len(datetime_cols) == 0 or len(numeric_cols) == 0:
        return []

    date_col = datetime_cols[0]
    value_col = numeric_cols[0]

    # Group on a derived period series instead of copying the whole frame
    # just to add a helper column.
    month_periods = df[date_col].dt.to_period('M')
    monthly_data = df.groupby(month_periods)[value_col].agg(['mean', 'sum', 'count'])
    month_labels = monthly_data.index.astype(str)

    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=(f"Monthly Trend - {value_col}", f"Monthly Volume - {value_col}"),
        vertical_spacing=0.1
    )
    fig.add_trace(
        go.Scatter(x=month_labels, y=monthly_data['mean'],
                   mode='lines+markers', name="Average",
                   line=dict(color='rgb(75, 192, 192)', width=3)),
        row=1, col=1
    )
    fig.add_trace(
        go.Bar(x=month_labels, y=monthly_data['sum'],
               name="Total", marker_color='rgb(153, 102, 255)'),
        row=2, col=1
    )
    fig.update_layout(
        height=600,
        title_text="π Time Series Analysis",
        title_x=0.5,
        showlegend=False
    )
    fig.update_xaxes(tickangle=-45)

    return [
        "<h3>β° Temporal Analysis</h3>",
        fig.to_html(include_plotlyjs='cdn', div_id="timeseries_analysis"),
    ]

def _chart_overview(self, df: pd.DataFrame, numeric_cols, categorical_cols, datetime_cols) -> List[str]:
    """Return a bar chart of headline dataset metrics."""
    summary_data = {
        'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns',
                   'DateTime Columns', 'Missing Values', 'Duplicate Rows', 'Memory (MB)'],
        'Count': [
            len(df),
            len(df.columns),
            len(numeric_cols),
            len(categorical_cols),
            len(datetime_cols),
            df.isnull().sum().sum(),
            df.duplicated().sum(),
            round(df.memory_usage(deep=True).sum() / 1024**2, 2)
        ]
    }

    fig = px.bar(
        summary_data,
        x='Metric',
        y='Count',
        title="π Comprehensive Dataset Overview",
        color='Count',
        color_continuous_scale='Viridis',
        text='Count'
    )
    fig.update_traces(texttemplate='%{text}', textposition='outside')
    fig.update_layout(
        height=500,
        title_x=0.5,
        showlegend=False,
        xaxis_tickangle=-45
    )

    return [
        "<h3>π Dataset Dashboard</h3>",
        fig.to_html(include_plotlyjs='cdn', div_id="overview_dashboard"),
    ]

def _chart_quality_score(self, df: pd.DataFrame) -> List[str]:
    """Return a gauge of the 0-100 quality score, or [] for a frame with no cells."""
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        # No data to score; the ratios below would divide by zero.
        return []

    missing_cells = df.isnull().sum().sum()
    duplicate_penalty = df.duplicated().sum() / len(df) * 10

    # Score = 100 minus the missing-cell percentage minus up to 10 points
    # for duplicated rows, floored at zero.
    quality_score = max(0, 100 - (missing_cells / total_cells * 100) - duplicate_penalty)

    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=quality_score,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "π Data Quality Score"},
        delta={'reference': 95},
        gauge={
            'axis': {'range': [None, 100]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, 50], 'color': "lightgray"},
                {'range': [50, 80], 'color': "yellow"},
                {'range': [80, 100], 'color': "lightgreen"}
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': 90
            }
        }
    ))
    fig.update_layout(height=400, title_x=0.5)

    return [
        "<h3>π― Quality Assessment</h3>",
        fig.to_html(include_plotlyjs='cdn', div_id="quality_score"),
    ]
|
|
def generate_insights_summary(self, df: pd.DataFrame) -> str:
    """Produce rule-based markdown insights about ``df`` without calling the AI service.

    Checks dataset size, the overall missing-value percentage, numeric
    columns where more than 10% of rows fall outside the 1.5*IQR fences,
    and text columns whose distinct-value ratio exceeds 0.8 (likely
    identifiers). A frame with no rows or no columns yields a single
    "empty dataset" insight instead of raising.

    Args:
        df: Frame to inspect.

    Returns:
        Newline-joined markdown bullet list with a heading.
    """
    insights = ["## π Quick Automated Insights:"]

    row_count = len(df)
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        # Every ratio below would divide by zero.
        insights.append("- **Empty Dataset**: No data available for automated insights")
        return "\n".join(insights)

    if row_count > 100000:
        insights.append("- π **Large Dataset**: This is a substantial dataset that may reveal enterprise-level patterns")
    elif row_count < 100:
        insights.append("- π **Small Dataset**: Consider collecting more data for robust statistical analysis")

    missing_pct = (df.isnull().sum().sum() / total_cells) * 100
    if missing_pct > 20:
        insights.append("- β οΈ **Data Quality Concern**: High percentage of missing data may impact analysis reliability")
    elif missing_pct < 5:
        insights.append("- ✅ **Excellent Data Quality**: Very low missing data percentage")

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    outlier_cols = []
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        outlier_mask = (df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))
        if outlier_mask.sum() / row_count > 0.1:
            outlier_cols.append(col)

    if outlier_cols:
        insights.append(f"- π― **Outlier Detection**: {len(outlier_cols)} columns have significant outliers")

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    high_cardinality_cols = []
    for col in categorical_cols:
        try:
            distinct = df[col].nunique()
        except TypeError:
            # Unhashable cells (lists/dicts) cannot be counted; skip the column.
            continue
        if distinct / row_count > 0.8:
            high_cardinality_cols.append(col)

    if high_cardinality_cols:
        insights.append(f"- π **ID Fields Detected**: {len(high_cardinality_cols)} columns appear to be identifier fields")

    return "\n".join(insights)
|
|
def export_comprehensive_report(self, analysis_text: str, data_summary: str, file_name: str, format_type: str) -> Tuple[Optional[str], str]:
    """Write the analysis report to disk as HTML or Markdown.

    ``format_type == "HTML"`` renders via ``self.generate_enhanced_html_report``
    into ``<base>_comprehensive_report_<timestamp>.html``; any other value
    renders via ``self.generate_markdown_report`` into
    ``<base>_analysis_report_<timestamp>.md``. ``<base>`` is ``file_name``
    without its extension, or ``"data_analysis"`` when ``file_name`` is empty.

    Args:
        analysis_text: AI analysis text to include.
        data_summary: Statistical summary to include.
        file_name: Name of the analysed file, used for the output name.
        format_type: ``"HTML"`` for HTML output; anything else gives Markdown.

    Returns:
        ``(path, message)`` on success, ``(None, error_message)`` on failure.
        Exceptions are logged, not raised.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    file_base_name = os.path.splitext(file_name)[0] if file_name else "data_analysis"

    try:
        if format_type == "HTML":
            content = self.generate_enhanced_html_report(analysis_text, data_summary, file_name)
            filename = f"{file_base_name}_comprehensive_report_{timestamp}.html"
            message = f"✅ Comprehensive HTML report generated! File: {filename}"
        else:
            content = self.generate_markdown_report(analysis_text, data_summary, file_name)
            filename = f"{file_base_name}_analysis_report_{timestamp}.md"
            message = f"✅ Markdown report generated! File: {filename}"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)
        return filename, message

    except Exception as e:
        logger.error("Report export error: %s", e)
        return None, f"β Error generating {format_type} report: {str(e)}"
|
|
| def generate_enhanced_html_report(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str: |
| """Generate premium HTML report with advanced styling""" |
| html_template = """ |
| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>Advanced Data Analysis Report</title> |
| <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet"> |
| <style> |
| * { |
| box-sizing: border-box; |
| margin: 0; |
| padding: 0; |
| } |
| |
| body { |
| font-family: 'Segoe UI', system-ui, -apple-system, sans-serif; |
| line-height: 1.7; |
| color: #2c3e50; |
| background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); |
| min-height: 100vh; |
| } |
| |
| .container { |
| max-width: 1400px; |
| margin: 0 auto; |
| padding: 20px; |
| } |
| |
| .header { |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| color: white; |
| padding: 40px; |
| border-radius: 15px; |
| margin-bottom: 30px; |
| text-align: center; |
| box-shadow: 0 10px 30px rgba(0,0,0,0.2); |
| } |
| |
| .header h1 { |
| font-size: 2.5em; |
| margin-bottom: 10px; |
| text-shadow: 2px 2px 4px rgba(0,0,0,0.3); |
| } |
| |
| .header p { |
| font-size: 1.2em; |
| opacity: 0.9; |
| } |
| |
| .section { |
| background: white; |
| padding: 30px; |
| margin-bottom: 25px; |
| border-radius: 12px; |
| box-shadow: 0 5px 20px rgba(0,0,0,0.1); |
| border-left: 4px solid #667eea; |
| transition: transform 0.2s ease; |
| } |
| |
| .section:hover { |
| transform: translateY(-2px); |
| box-shadow: 0 8px 25px rgba(0,0,0,0.15); |
| } |
| |
| .metadata { |
| background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%); |
| padding: 20px; |
| border-radius: 10px; |
| margin-bottom: 25px; |
| border: 1px solid #b3d9f2; |
| display: grid; |
| grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); |
| gap: 15px; |
| } |
| |
| .metadata-item { |
| display: flex; |
| align-items: center; |
| gap: 8px; |
| } |
| |
| .metadata-item i { |
| color: #667eea; |
| font-size: 1.1em; |
| } |
| |
| h1, h2, h3 { |
| color: #2c3e50; |
| margin-bottom: 15px; |
| } |
| |
| h2 { |
| border-bottom: 2px solid #667eea; |
| padding-bottom: 10px; |
| display: flex; |
| align-items: center; |
| gap: 10px; |
| } |
| |
| h2:before { |
| content: "π"; |
| font-size: 1.2em; |
| } |
| |
| .chart-container { |
| margin: 25px 0; |
| padding: 20px; |
| background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%); |
| border-radius: 10px; |
| border: 1px solid #e0e6ff; |
| } |
| |
| .action-buttons { |
| display: flex; |
| gap: 15px; |
| margin: 20px 0; |
| flex-wrap: wrap; |
| } |
| |
| .btn { |
| padding: 12px 24px; |
| border: none; |
| border-radius: 8px; |
| cursor: pointer; |
| font-size: 16px; |
| font-weight: 600; |
| transition: all 0.3s ease; |
| display: flex; |
| align-items: center; |
| gap: 8px; |
| text-decoration: none; |
| } |
| |
| .btn-primary { |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| color: white; |
| } |
| |
| .btn-primary:hover { |
| transform: translateY(-2px); |
| box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4); |
| } |
| |
| .btn-secondary { |
| background: #f8f9fa; |
| color: #495057; |
| border: 2px solid #dee2e6; |
| } |
| |
| .btn-secondary:hover { |
| background: #e9ecef; |
| border-color: #adb5bd; |
| } |
| |
| .footer { |
| text-align: center; |
| color: #6c757d; |
| margin-top: 40px; |
| padding: 30px; |
| background: white; |
| border-radius: 10px; |
| box-shadow: 0 5px 15px rgba(0,0,0,0.1); |
| } |
| |
| .footer-links { |
| margin-top: 15px; |
| display: flex; |
| justify-content: center; |
| gap: 20px; |
| flex-wrap: wrap; |
| } |
| |
| .footer-links a { |
| color: #667eea; |
| text-decoration: none; |
| font-weight: 500; |
| } |
| |
| .footer-links a:hover { |
| text-decoration: underline; |
| } |
| |
| pre { |
| background: #f8f9fa; |
| padding: 20px; |
| border-radius: 8px; |
| overflow-x: auto; |
| white-space: pre-wrap; |
| font-size: 14px; |
| border-left: 4px solid #28a745; |
| font-family: 'Consolas', 'Monaco', monospace; |
| } |
| |
| .analysis-content { |
| font-size: 16px; |
| line-height: 1.8; |
| } |
| |
| .analysis-content h1, |
| .analysis-content h2, |
| .analysis-content h3 { |
| margin-top: 25px; |
| margin-bottom: 15px; |
| } |
| |
| .analysis-content ul, |
| .analysis-content ol { |
| margin-left: 20px; |
| margin-bottom: 15px; |
| } |
| |
| .analysis-content li { |
| margin-bottom: 5px; |
| } |
| |
| .analysis-content strong { |
| color: #2c3e50; |
| font-weight: 700; |
| } |
| |
| .analysis-content code { |
| background: #f1f3f4; |
| padding: 2px 6px; |
| border-radius: 4px; |
| font-family: 'Consolas', monospace; |
| } |
| |
| .analysis-content blockquote { |
| border-left: 4px solid #667eea; |
| padding-left: 20px; |
| margin: 20px 0; |
| font-style: italic; |
| color: #555; |
| } |
| |
| table { |
| width: 100%; |
| border-collapse: collapse; |
| margin: 20px 0; |
| background: white; |
| border-radius: 8px; |
| overflow: hidden; |
| box-shadow: 0 2px 10px rgba(0,0,0,0.1); |
| } |
| |
| th, td { |
| padding: 12px 15px; |
| text-align: left; |
| border-bottom: 1px solid #e9ecef; |
| } |
| |
| th { |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| color: white; |
| font-weight: 600; |
| text-transform: uppercase; |
| letter-spacing: 0.5px; |
| } |
| |
| tr:hover { |
| background-color: #f8f9ff; |
| } |
| |
| .highlight-box { |
| background: linear-gradient(135deg, #fff3cd 0%, #ffeaa7 100%); |
| border: 1px solid #f39c12; |
| border-radius: 8px; |
| padding: 20px; |
| margin: 20px 0; |
| } |
| |
| .success-box { |
| background: linear-gradient(135deg, #d4edda 0%, #a8e6cf 100%); |
| border: 1px solid #28a745; |
| border-radius: 8px; |
| padding: 20px; |
| margin: 20px 0; |
| } |
| |
| .warning-box { |
| background: linear-gradient(135deg, #f8d7da 0%, #ff7675 100%); |
| border: 1px solid #dc3545; |
| border-radius: 8px; |
| padding: 20px; |
| margin: 20px 0; |
| } |
| |
| @media print { |
| .action-buttons, .btn { |
| display: none !important; |
| } |
| body { |
| background: white; |
| } |
| .section, .metadata, .footer { |
| box-shadow: none; |
| page-break-inside: avoid; |
| } |
| .header { |
| page-break-after: avoid; |
| } |
| } |
| |
| @media (max-width: 768px) { |
| .container { |
| padding: 10px; |
| } |
| .header { |
| padding: 20px; |
| } |
| .header h1 { |
| font-size: 1.8em; |
| } |
| .section { |
| padding: 20px; |
| } |
| .metadata { |
| grid-template-columns: 1fr; |
| } |
| .action-buttons { |
| flex-direction: column; |
| } |
| } |
| </style> |
| <script> |
| function printReport() { |
| window.print(); |
| } |
| |
| function exportPDF() { |
| window.print(); |
| } |
| |
| function copyToClipboard(elementId) { |
| const element = document.getElementById(elementId); |
| const text = element.textContent; |
| navigator.clipboard.writeText(text).then(() => { |
| alert('Content copied to clipboard!'); |
| }); |
| } |
| |
| // Add smooth scrolling |
| document.addEventListener('DOMContentLoaded', function() { |
| const links = document.querySelectorAll('a[href^="#"]'); |
| links.forEach(link => { |
| link.addEventListener('click', function(e) { |
| e.preventDefault(); |
| const target = document.querySelector(this.getAttribute('href')); |
| if (target) { |
| target.scrollIntoView({ behavior: 'smooth' }); |
| } |
| }); |
| }); |
| }); |
| </script> |
| </head> |
| <body> |
| <div class="container"> |
| <div class="header"> |
| <h1><i class="fas fa-chart-line"></i> Advanced Data Analysis Report</h1> |
| <p>Comprehensive AI-Powered Business Intelligence Dashboard</p> |
| </div> |
| |
| <div class="metadata"> |
| <div class="metadata-item"> |
| <i class="fas fa-file-alt"></i> |
| <span><strong>File:</strong> {{ file_name }}</span> |
| </div> |
| <div class="metadata-item"> |
| <i class="fas fa-calendar-alt"></i> |
| <span><strong>Generated:</strong> {{ timestamp }}</span> |
| </div> |
| <div class="metadata-item"> |
| <i class="fas fa-robot"></i> |
| <span><strong>AI Model:</strong> OpenAI gpt-oss-20b</span> |
| </div> |
| <div class="metadata-item"> |
| <i class="fas fa-shield-alt"></i> |
| <span><strong>Version:</strong> Smart Analyzer Pro v2.0</span> |
| </div> |
| </div> |
| |
| <div class="action-buttons"> |
| <button class="btn btn-primary" onclick="printReport()"> |
| <i class="fas fa-print"></i> Print as PDF |
| </button> |
| <button class="btn btn-secondary" onclick="copyToClipboard('ai-analysis')"> |
| <i class="fas fa-copy"></i> Copy Analysis |
| </button> |
| <button class="btn btn-secondary" onclick="copyToClipboard('technical-summary')"> |
| <i class="fas fa-code"></i> Copy Technical Data |
| </button> |
| </div> |
| |
| <div class="section"> |
| <h2><i class="fas fa-brain"></i> AI-Powered Analysis & Strategic Insights</h2> |
| <div id="ai-analysis" class="analysis-content">{{ ai_analysis }}</div> |
| </div> |
| |
| <div class="section"> |
| <h2><i class="fas fa-chart-bar"></i> Interactive Data Visualizations</h2> |
| <div class="chart-container"> |
| {{ charts_html }} |
| </div> |
| </div> |
| |
| <div class="section"> |
| <h2><i class="fas fa-database"></i> Technical Data Profile</h2> |
| <pre id="technical-summary">{{ data_summary }}</pre> |
| </div> |
| |
| <div class="footer"> |
| <div> |
| <h3><i class="fas fa-star"></i> Report Generated by AnalytixPro v2.0</h3> |
| <p>Powered by Advanced AI β’ Professional Business Intelligence</p> |
| </div> |
| <div class="footer-links"> |
| <a href="https://wa.me/8801719296601"><i class="fab fa-whatsapp"></i> WhatsApp Support</a> |
| <a href="https://mail.google.com/mail/?view=cm&fs=1&to=shukdevdatta@gmail.com" target="_blank"><i class="fas fa-envelope"></i> Email Support</a> |
| <a href="https://huggingface.co/shukdevdattaEX"><i class="fas fa-globe"></i> Visit Website</a> |
| </div> |
| <p style="margin-top: 15px; font-size: 0.9em; color: #6c757d;"> |
| Β© 2025 AnalytixPro. Professional data analysis made simple. |
| </p> |
| </div> |
| </div> |
| </body> |
| </html> |
| """ |
| |
| template = Template(html_template) |
| ai_analysis_html = markdown.markdown(analysis_text, extensions=['extra', 'tables', 'toc']) |
| charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>" |
| |
| return template.render( |
| file_name=file_name, |
| timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), |
| ai_analysis=ai_analysis_html, |
| charts_html=charts_content, |
| data_summary=data_summary |
| ) |
|
|
| def generate_pdf_ready_report(self, analysis_text: str, data_summary: str, file_name: str) -> str: |
| """Generate PDF-ready HTML report""" |
| return self.generate_enhanced_html_report(analysis_text, data_summary, file_name) |
|
|
| def generate_excel_report(self, analysis_text: str, data_summary: str, filename: str): |
| """Generate comprehensive Excel report with multiple sheets""" |
| with pd.ExcelWriter(filename, engine='openpyxl') as writer: |
| |
| if self.current_df is not None: |
| self.current_df.to_excel(writer, sheet_name='Original_Data', index=False) |
| |
| |
| summary_lines = data_summary.split('\n') |
| summary_df = pd.DataFrame({'Analysis_Summary': summary_lines}) |
| summary_df.to_excel(writer, sheet_name='Data_Summary', index=False) |
| |
| |
| analysis_lines = analysis_text.split('\n') |
| analysis_df = pd.DataFrame({'AI_Analysis': analysis_lines}) |
| analysis_df.to_excel(writer, sheet_name='AI_Analysis', index=False) |
| |
| |
| if self.current_df is not None: |
| numeric_cols = self.current_df.select_dtypes(include=[np.number]).columns |
| if len(numeric_cols) > 0: |
| stats_df = self.current_df[numeric_cols].describe() |
| stats_df.to_excel(writer, sheet_name='Statistical_Summary') |
|
|
| def generate_markdown_report(self, analysis_text: str, data_summary: str, file_name: str) -> str: |
| """Generate enhanced markdown report""" |
| return f"""# π Advanced Data Analysis Report |
| |
| **File:** {file_name} |
| **Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} |
| **Analyzer:** AnalytixPro v2.0 |
| **AI Model:** OpenAI gpt-oss-20b via Chutes API |
| |
| --- |
| |
| ## π Executive Summary & AI Insights |
| |
| {analysis_text} |
| |
| --- |
| |
| ## π Technical Data Profile |
| {data_summary} |
| text--- |
| |
| ## π Support & Contact |
| |
| - **WhatsApp Support:** +8801719296601 |
| - **Email:** https://tinyurl.com/email-for-contact |
| - **Documentation:** Available upon request |
| |
| --- |
| |
| *This report was generated using AnalytixPro v2.0 - Professional data analysis powered by advanced AI technology.* |
| """ |
|
|
| |
# Module-level analyzer instance shared by the callback functions below.
analyzer = AdvancedDataAnalyzer()
|
|
def _coerce_sample_size(sample_size):
    """Return ``sample_size`` as a positive ``int``, or ``None`` if unset/unusable.

    Ints, floats and numeric strings are accepted (a numeric UI field may
    deliver ``1000.0`` rather than ``1000``); anything else, and any value
    below 1, means "no sampling".
    """
    if sample_size is None or isinstance(sample_size, bool):
        return None
    try:
        value = int(float(sample_size))
    except (TypeError, ValueError, OverflowError):
        return None
    return value if value > 0 else None


async def comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Validate the inputs, profile the file, query the AI and build the UI payload.

    Args:
        file: Uploaded file object; only its ``name`` (path) attribute is used.
        api_key: API key, checked with ``analyzer.validate_api_key``.
        user_question: Optional question forwarded to the AI call.
        analysis_type: Analysis mode forwarded to the AI call and echoed in
            the response.
        sample_size: Optional row limit; ints, floats and numeric strings
            are accepted, everything else disables sampling.
        progress: Gradio progress callback.

    Returns:
        A 6-tuple ``(response_markdown, data_summary, data_preview_html,
        charts_html, file_path, ai_analysis)``. On any failure the first
        item is the error message and the rest are ``"", "", "", None, ""``.
    """
    # NOTE(review): the emoji in these literals are mis-decoded in this file;
    # they are reproduced as found except where a literal was split in two.
    started_at = datetime.now()
    failure_tail = ("", "", "", None, "")

    progress(0.05, desc="π Validating inputs...")

    if not file:
        return ("β Please upload a data file.",) + failure_tail

    is_valid_key, key_msg = analyzer.validate_api_key(api_key)
    if not is_valid_key:
        return (f"β API Key Issue: {key_msg}",) + failure_tail

    is_valid_file, file_msg = analyzer.validate_file(file)
    if not is_valid_file:
        return (f"β File Issue: {file_msg}",) + failure_tail

    progress(0.15, desc="π Loading and processing file...")

    try:
        # str(1000.0).isdigit() is False, so the previous check silently
        # dropped float sample sizes; coerce numerically instead.
        sample_size_int = _coerce_sample_size(sample_size)
        df, data_summary, charts_html = analyzer.process_file(file.name, sample_size_int)

        progress(0.40, desc="π Generating visualizations...")

        quick_insights = analyzer.generate_insights_summary(df)

        progress(0.60, desc="π€ AI analysis in progress...")

        ai_analysis = await analyzer.analyze_with_chutes(
            api_key,
            data_summary + "\n" + quick_insights,
            user_question,
            analysis_type
        )

        progress(0.90, desc="β¨ Finalizing results...")

        # Real wall-clock duration (previously derived from the clock's
        # seconds digit, which was unrelated to the actual run time).
        elapsed_seconds = (datetime.now() - started_at).total_seconds()

        response = f"""# π― Analysis Complete!

## π Key Findings
{ai_analysis}

{quick_insights}

---

**π Analysis Details:**
- **Processed**: {len(df):,} rows Γ {df.shape[1]} columns
- **Analysis Type**: {analysis_type.title()}
- **Processing Time**: ~{elapsed_seconds:.1f} seconds
- **AI Model**: OpenAI gpt-oss-20b
- **Generated**: {datetime.now().strftime('%H:%M:%S')}

*π‘ Use the tabs below to explore data preview, download reports, or ask specific questions.*
"""

        data_preview_html = analyzer.generate_enhanced_preview(df)

        # This literal was split across two physical lines by the
        # mis-decoded check mark; restored as a single-line string.
        progress(1.0, desc="✅ Analysis complete!")

        return response, data_summary, data_preview_html, charts_html, file.name, ai_analysis

    except Exception as e:
        # Top-level boundary for the UI: log with traceback, report in-band.
        logger.exception("Comprehensive analysis error: %s", e)
        return (f"β **Analysis Failed**: {str(e)}",) + failure_tail
|
|
def sync_comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Run the ``comprehensive_analysis`` coroutine to completion and return its result."""
    analysis_coroutine = comprehensive_analysis(
        file, api_key, user_question, analysis_type, sample_size, progress
    )
    return asyncio.run(analysis_coroutine)
|
|
def quick_question_analysis(file, api_key, question, progress=gr.Progress()):
    """Answer a single user question about the uploaded file.

    Args:
        file: Uploaded file object passed through to ``comprehensive_analysis``.
        api_key: API key passed through to ``comprehensive_analysis``.
        question: The user's question; ``None``, empty or whitespace-only
            input is rejected with a prompt message.
        progress: Gradio progress callback.

    Returns:
        The markdown response (first element of the analysis result), or a
        prompt asking for a question.
    """
    # A cleared textbox can arrive as None, which has no .strip().
    if not question or not question.strip():
        return "β Please enter a specific question about your data."

    result = asyncio.run(comprehensive_analysis(file, api_key, question, "question", None, progress))
    return result[0]
|
|
def generate_enhanced_preview(df: pd.DataFrame, rows: int = 20) -> str:
    """Render an HTML preview of ``df`` with styling and quick statistics.

    Args:
        df: DataFrame to preview.
        rows: Number of leading rows to show; coerced to ``int`` so a float
            coming from a UI control (e.g. ``20.0``) is accepted.

    Returns:
        An HTML fragment containing a ``<style>`` block, a header with row
        and column counts, a ``describe()`` table for the numeric columns
        (omitted when there are none) and the preview table.
    """
    rows = int(rows)
    preview_df = df.head(rows)

    # Summary statistics are only meaningful for numeric columns.
    stats_html = ""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        stats_df = df[numeric_cols].describe().round(2)
        stats_html = f"""
<div style="margin-bottom: 20px;">
<h4>π Quick Statistics (Numeric Columns)</h4>
{stats_df.to_html(classes="table table-striped", table_id="stats-table")}
</div>
"""

    # Cell values come from an uploaded file, so they are HTML-escaped
    # (pandas' default) rather than rendered as raw markup.
    preview_html = preview_df.to_html(
        classes="table table-striped table-hover",
        table_id="data-preview-table"
    )

    return f"""
<style>
.table {{
width: 100%;
border-collapse: collapse;
margin: 20px 0;
font-size: 14px;
background: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}}
.table th {{
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 12px 8px;
text-align: left;
font-weight: bold;
position: sticky;
top: 0;
z-index: 10;
}}
.table td {{
padding: 10px 8px;
border-bottom: 1px solid #dee2e6;
max-width: 200px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}}
.table tr:hover {{
background-color: #f8f9ff;
}}
.table tr:nth-child(even) {{
background-color: #f8f9fa;
}}
#stats-table {{
font-size: 12px;
}}
#stats-table th {{
background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
}}
.preview-header {{
background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
padding: 15px;
border-radius: 8px;
margin-bottom: 15px;
border-left: 4px solid #667eea;
}}
</style>

<div class="preview-header">
<h4>π Data Preview - First {rows} Rows</h4>
<p><strong>Total Rows:</strong> {len(df):,} | <strong>Columns:</strong> {df.shape[1]} | <strong>Showing:</strong> {len(preview_df)} rows</p>
</div>

{stats_html}
{preview_html}
"""
|
|
| |
# Attach the module-level function to the analyzer instance as a plain
# attribute: it is not bound as a method, so callers invoke it as
# analyzer.generate_enhanced_preview(df[, rows]) with no implicit self.
analyzer.generate_enhanced_preview = generate_enhanced_preview
|
|
def clear_all_data():
    """Reset the shared analyzer state and return blank values for the UI.

    The returned 9-tuple lines up with the outputs wired to the clear
    button, in order.
    """
    fresh_state = (
        ("current_df", None),
        ("current_charts", None),
        ("conversation_history", []),
        ("analysis_cache", {}),
    )
    for attribute, empty_value in fresh_state:
        setattr(analyzer, attribute, empty_value)

    blank_outputs = (
        None,  # file_input
        "",    # api_key_input
        "",    # question_input
        "",    # analysis_output
        "",    # question_output
        "",    # data_preview
        "",    # raw_summary
        None,  # current_file_name
        "",    # current_ai_analysis
    )
    return blank_outputs
|
|
def export_report(analysis_text, data_summary, file_name, format_choice, ai_analysis=""):
    """Export the current analysis in the chosen format.

    The raw AI analysis is preferred as report content; the rendered
    analysis text is the fallback. Returns the first two items of
    ``analyzer.export_comprehensive_report``'s result, or ``(None, message)``
    when there is nothing to export.
    """
    content_to_export = ai_analysis or analysis_text
    if not content_to_export:
        return None, "β No analysis data available for download."

    outcome = analyzer.export_comprehensive_report(
        content_to_export, data_summary, file_name, format_choice
    )
    exported_file, status_message = outcome[0], outcome[1]
    return exported_file, status_message
|
|
def batch_analyze_files(files, api_key, progress=gr.Progress()):
    """Run a quick analysis on each uploaded file and concatenate the results.

    Args:
        files: Uploaded file objects; each one's ``name`` attribute is its path.
        api_key: API key passed to ``comprehensive_analysis`` for every file.
        progress: Gradio progress callback for the batch as a whole.

    Returns:
        One markdown section per file (result or error), joined with
        newlines, or a message when no files were supplied.
    """
    if not files:
        return "β No files uploaded for batch analysis."

    results = []
    total_files = len(files)

    for i, file in enumerate(files):
        file_name = os.path.basename(file.name)
        # Report the share already finished; using (i + 1) here showed 100%
        # while the last file was still being processed.
        progress(i / total_files, desc=f"Processing file {i+1}/{total_files}: {file_name}")

        try:
            result = asyncio.run(comprehensive_analysis(file, api_key, "", "quick", 1000, gr.Progress()))
            results.append(f"## π {file_name}\n{result[0]}\n---\n")
        except Exception as e:
            # Best effort: one failing file must not abort the whole batch.
            results.append(f"## β {file_name}\nError: {str(e)}\n---\n")

    progress(1.0, desc=f"Processed {total_files}/{total_files} files")
    return "\n".join(results)
|
|
| |
| with gr.Blocks( |
| title="π AnalytixPro v2.0", |
| theme=gr.themes.Ocean(), |
| css=""" |
| .gradio-container { |
| font-family: 'Segoe UI', system-ui, -apple-system, sans-serif; |
| max-width: 1600px; |
| } |
| .main-header { |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| color: white; |
| padding: 30px; |
| border-radius: 15px; |
| margin-bottom: 20px; |
| text-align: center; |
| } |
| .upload-area { |
| border: 2px dashed #667eea; |
| border-radius: 12px; |
| padding: 25px; |
| text-align: center; |
| background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%); |
| transition: all 0.3s ease; |
| } |
| .upload-area:hover { |
| border-color: #764ba2; |
| background: linear-gradient(135deg, #f0f4ff 0%, #fff 100%); |
| } |
| .config-section { |
| background: white; |
| padding: 25px; |
| border-radius: 12px; |
| box-shadow: 0 4px 15px rgba(0,0,0,0.1); |
| border-left: 4px solid #667eea; |
| } |
| .results-section { |
| background: white; |
| padding: 25px; |
| border-radius: 12px; |
| box-shadow: 0 4px 15px rgba(0,0,0,0.1); |
| border-left: 4px solid #28a745; |
| } |
| .tab-content { |
| background: white; |
| border-radius: 8px; |
| padding: 20px; |
| margin-top: 10px; |
| } |
| .feature-grid { |
| display: grid; |
| grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); |
| gap: 15px; |
| margin: 20px 0; |
| } |
| .feature-card { |
| background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%); |
| padding: 20px; |
| border-radius: 10px; |
| border: 1px solid #e0e6ff; |
| text-align: center; |
| } |
| """ |
| ) as app: |
| |
| |
| current_file_name = gr.State("") |
| current_ai_analysis = gr.State("") |
| |
| |
| gr.HTML(""" |
| <div class="main-header"> |
| <h1>π AnalytixPro v2.0</h1> |
| <p>Advanced AI-Powered Data Analysis & Business Intelligence Platform</p> |
| <p style="opacity: 0.9; margin-top: 10px;"> |
| β¨ Enhanced with Advanced Statistics β’ π― Multi-format Support β’ π Interactive Visualizations β’ π± Mobile Optimized |
| </p> |
| </div> |
| """) |
| |
| with gr.Row(): |
| with gr.Column(scale=1, elem_classes=["config-section"]): |
| gr.Markdown("### βοΈ Configuration & Upload") |
| |
| api_key_input = gr.Textbox( |
| label="π Chutes API Key", |
| placeholder="sk-chutes-your-api-key-here...", |
| type="password", |
| lines=1, |
| info="π Get your free API key from chutes.ai" |
| ) |
| |
| with gr.Group(): |
| file_input = gr.File( |
| label="π Upload Data File", |
| file_types=[".csv", ".xlsx", ".xls", ".json", ".parquet", ".tsv"], |
| file_count="single", |
| elem_classes=["upload-area"] |
| ) |
| |
| with gr.Row(): |
| analysis_type = gr.Dropdown( |
| choices=["comprehensive", "quick", "statistical"], |
| value="comprehensive", |
| label="π― Analysis Type", |
| info="Choose analysis depth" |
| ) |
| |
| sample_size = gr.Number( |
| label="π Sample Size", |
| |
| minimum=100, |
| maximum=50000, |
| info="Optional: Limit rows for faster processing" |
| ) |
| |
| with gr.Row(): |
| analyze_btn = gr.Button("π Analyze Data", variant="primary", size="lg") |
| clear_btn = gr.Button("ποΈ Clear All", variant="secondary") |
| |
| |
| with gr.Group(): |
| gr.Markdown("### π File Information") |
| file_stats = gr.HTML( |
| value="<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>π Upload a file to see detailed information...</div>" |
| ) |
| |
| with gr.Column(scale=2, elem_classes=["results-section"]): |
| gr.Markdown("### π― Analysis Results") |
| analysis_output = gr.Markdown( |
| value="""## π Welcome to AnalytixPro v2.0! |
| |
| **π Enhanced Features:** |
| - β
**Multi-format Support**: CSV, Excel, JSON, Parquet, TSV |
| - β
**Advanced Statistics**: Correlation, outlier detection, distribution analysis |
| - β
**Interactive Visualizations**: Professional charts and dashboards |
| - β
**AI-Powered Insights**: GPT-powered business intelligence |
| - β
**Export Options**: HTML, Markdown |
| - β
**Batch Processing**: Analyze multiple files at once |
| - β
**Mobile Optimized**: Works on all devices |
| |
| **π How to Get Started:** |
| 1. Enter your Chutes API key |
| 2. Upload your data file |
| 3. Choose analysis type |
| 4. Click "Analyze Data" |
| 5. Explore results in the tabs below! |
| |
| *Ready for professional-grade data analysis! π―*""", |
| show_label=False |
| ) |
| |
| |
| with gr.Tabs(): |
| with gr.Tab("π¬ Ask Specific Questions", elem_id="questions-tab"): |
| gr.Markdown("### π Interactive Data Q&A") |
| with gr.Row(): |
| question_input = gr.Textbox( |
| label="β What would you like to know about your data?", |
| placeholder="""Try asking specific questions like: |
| β’ What are the top 5 performing segments by revenue? |
| β’ Are there any seasonal patterns in the sales data? |
| β’ Which customer segments have the highest lifetime value? |
| β’ What anomalies or outliers should I be concerned about? |
| β’ How do different product categories compare in profitability? |
| β’ What trends do you see in the time series data?""", |
| lines=4 |
| ) |
| |
| with gr.Row(): |
| ask_btn = gr.Button("π Get AI Answer", variant="primary") |
| quick_insight_btn = gr.Button("π‘ Quick Insights", variant="secondary") |
| |
| question_output = gr.Markdown() |
| |
| with gr.Tab("π Data Preview & Statistics"): |
| gr.Markdown("### π Dataset Explorer") |
| with gr.Row(): |
| preview_rows = gr.Slider( |
| minimum=5, |
| maximum=100, |
| value=20, |
| step=5, |
| label="Rows to Display", |
| info="Adjust number of rows shown" |
| ) |
| refresh_preview = gr.Button("π Refresh Preview", variant="secondary") |
| |
| data_preview = gr.HTML( |
| label="Dataset Preview", |
| value="<div style='text-align: center; padding: 40px; color: #666;'>π Upload and analyze a file to see preview...</div>" |
| ) |
| |
| with gr.Tab("π Visualizations & Charts", visible=False): |
| gr.Markdown("### π¨ Interactive Data Visualizations") |
| charts_display = gr.HTML( |
| value="<div style='text-align: center; padding: 40px; color: #666;'>π Charts will appear here after analysis...</div>" |
| ) |
| |
| with gr.Tab("π Technical Summary"): |
| gr.Markdown("### π Detailed Technical Analysis") |
| raw_summary = gr.Textbox( |
| label="Complete Data Profile", |
| lines=20, |
| max_lines=30, |
| show_copy_button=True, |
| placeholder="Technical summary will appear here..." |
| ) |
| |
| with gr.Tab("πΎ Export & Reports"): |
| gr.Markdown("### π₯ Download Professional Reports") |
| |
| with gr.Row(): |
| format_choice = gr.Radio( |
| choices=["HTML", "Markdown"], |
| value="HTML", |
| label="π Report Format", |
| info="Choose your preferred export format" |
| ) |
| |
| include_charts = gr.Checkbox( |
| label="π Include Charts", |
| value=True, |
| info="Include visualizations in report" |
| ) |
| |
| with gr.Row(): |
| download_btn = gr.Button("π₯ Generate Report", variant="primary", size="lg") |
| batch_export_btn = gr.Button("π¦ Batch Export", variant="secondary") |
| |
| download_status = gr.Textbox(label="π Export Status", interactive=False) |
| download_file = gr.File(label="π Download Your Report", visible=True) |
| |
| with gr.Tab("π Batch Analysis"): |
| gr.Markdown("### π Analyze Multiple Files") |
| gr.Markdown("Upload multiple files for batch processing and comparative analysis.") |
| |
| batch_files = gr.File( |
| label="π Upload Multiple Files", |
| file_count="multiple", |
| file_types=[".csv", ".xlsx", ".xls"] |
| ) |
| |
| batch_analyze_btn = gr.Button("π Batch Analyze", variant="primary") |
| batch_results = gr.Markdown() |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| def update_file_stats(file): |
| """Enhanced file statistics display""" |
| if not file: |
| return "<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>π No file uploaded</div>" |
| |
| try: |
| file_size = os.path.getsize(file.name) / (1024 * 1024) |
| file_name = os.path.basename(file.name) |
| file_ext = os.path.splitext(file_name)[1].upper() |
| |
| |
| try: |
| if file_ext.lower() == '.csv': |
| with open(file.name, 'r', encoding='utf-8') as f: |
| lines = sum(1 for line in f) |
| estimated_rows = lines - 1 |
| elif file_ext.lower() in ['.xlsx', '.xls']: |
| temp_df = pd.read_excel(file.name, nrows=0) |
| estimated_rows = "Reading..." |
| else: |
| estimated_rows = "Unknown" |
| except: |
| estimated_rows = "Could not estimate" |
| |
| return f""" |
| <div style='padding: 20px; background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%); border-radius: 10px; border: 1px solid #b3d9f2;'> |
| <h4 style='color: #2c3e50; margin-bottom: 15px;'>π File Details</h4> |
| <div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 10px;'> |
| <div><strong>π Name:</strong><br>{file_name}</div> |
| <div><strong>π Size:</strong><br>{file_size:.2f} MB</div> |
| <div><strong>π§ Format:</strong><br>{file_ext[1:]} File</div> |
| <div><strong>π Est. Rows:</strong><br>{estimated_rows}</div> |
| <div><strong>β° Uploaded:</strong><br>{datetime.now().strftime('%H:%M:%S')}</div> |
| <div><strong>β
Status:</strong><br>Ready to analyze</div> |
| </div> |
| </div> |
| """ |
| except Exception as e: |
| return f""" |
| <div style='padding: 15px; background: #f8d7da; border-radius: 8px; border: 1px solid #dc3545;'> |
| β <strong>File Error:</strong> {str(e)} |
| </div> |
| """ |
| |
| def handle_main_analysis(file, api_key, analysis_type, sample_size, progress=gr.Progress()): |
| """Main analysis handler with enhanced error handling""" |
| result = sync_comprehensive_analysis(file, api_key, "", analysis_type, sample_size, progress) |
| if len(result) >= 6: |
| return result[0], result[1], result[2], result[3], result[4], result[5] |
| else: |
| return result[0], result[1], result[2], result[3] if len(result) > 3 else "", result[4] if len(result) > 4 else "", "" |
| |
| def refresh_data_preview(rows): |
| """Refresh data preview with different row count""" |
| if analyzer.current_df is not None: |
| return analyzer.generate_enhanced_preview(analyzer.current_df, rows) |
| return "<div style='text-align: center; padding: 40px; color: #666;'>π No data loaded</div>" |
| |
| |
| analyze_btn.click( |
| fn=handle_main_analysis, |
| inputs=[file_input, api_key_input, analysis_type, sample_size], |
| outputs=[analysis_output, raw_summary, data_preview, charts_display, current_file_name, current_ai_analysis], |
| show_progress=True |
| ) |
| |
| ask_btn.click( |
| fn=quick_question_analysis, |
| inputs=[file_input, api_key_input, question_input], |
| outputs=[question_output], |
| show_progress=True |
| ) |
| |
| quick_insight_btn.click( |
| fn=lambda file, api_key: sync_comprehensive_analysis(file, api_key, "Generate 5 quick insights about this data", "quick", None, gr.Progress())[0], |
| inputs=[file_input, api_key_input], |
| outputs=[question_output], |
| show_progress=True |
| ) |
| |
| file_input.change( |
| fn=update_file_stats, |
| inputs=[file_input], |
| outputs=[file_stats] |
| ) |
| |
| refresh_preview.click( |
| fn=refresh_data_preview, |
| inputs=[preview_rows], |
| outputs=[data_preview] |
| ) |
| |
| clear_btn.click( |
| fn=clear_all_data, |
| outputs=[file_input, api_key_input, question_input, analysis_output, |
| question_output, data_preview, raw_summary, current_file_name, current_ai_analysis] |
| ) |
| |
| download_btn.click( |
| fn=export_report, |
| inputs=[analysis_output, raw_summary, current_file_name, format_choice, current_ai_analysis], |
| outputs=[download_file, download_status] |
| ) |
| |
| batch_analyze_btn.click( |
| fn=batch_analyze_files, |
| inputs=[batch_files, api_key_input], |
| outputs=[batch_results], |
| show_progress=True |
| ) |
| |
| |
| gr.HTML(""" |
| <div style="margin-top: 30px;"> |
| <h3 style="text-align: center; color: #2c3e50; margin-bottom: 20px;">π Key Features & Capabilities</h3> |
| <div class="feature-grid"> |
| <div class="feature-card"> |
| <h4>π§ Advanced File Support</h4> |
| <p>CSV, Excel, JSON, Parquet, TSV with intelligent type detection</p> |
| </div> |
| <div class="feature-card"> |
| <h4>π Statistical Analysis</h4> |
| <p>Correlation matrices, outlier detection, distribution analysis</p> |
| </div> |
| <div class="feature-card"> |
| <h4>π€ AI-Powered Insights</h4> |
| <p>GPT-powered business intelligence and recommendations</p> |
| </div> |
| <div class="feature-card"> |
| <h4>π Interactive Charts</h4> |
| <p>Professional visualizations with hover effects and zoom</p> |
| </div> |
| <div class="feature-card"> |
| <h4>πΎ Multiple Export Formats</h4> |
| <p>HTML, Markdown with embedded charts</p> |
| </div> |
| <div class="feature-card"> |
| <h4>π Batch Processing</h4> |
| <p>Analyze multiple files simultaneously for comparison</p> |
| </div> |
| </div> |
| </div> |
| """) |
| |
| with gr.Accordion("π‘ Pro Tips", open=False): |
| gr.Markdown(""" |
| ### π― Data Preparation: |
| - β
Use descriptive column names (e.g., "Monthly_Revenue" instead of "Col1") |
| - β
Ensure consistent date formats (YYYY-MM-DD recommended) |
| - β
Remove completely empty rows/columns before upload |
| - β
For large files (>10MB), consider using sample size option |
| |
| ### π Analysis Optimization: |
| - **Comprehensive**: Full statistical analysis with AI insights (recommended for business reports) |
| - **Quick**: Fast overview for initial data exploration |
| - **Statistical**: Focus on mathematical relationships and patterns |
| |
| ### π Question Examples for Better AI Responses: |
| - "What factors most strongly correlate with customer churn?" |
| - "Which time periods show the highest sales performance?" |
| - "Are there any data quality issues I should address?" |
| - "What are the key business opportunities in this dataset?" |
| |
| ### π₯ Export Recommendations: |
| - **HTML**: Best for sharing interactive reports with stakeholders |
| - **Markdown**: Great for technical documentation and version control |
| |
| ### β‘ Performance Notes: |
| - Files under 5MB: Instant processing |
| - Files 5-20MB: ~5-10 seconds |
| - Files 20MB+: Consider sampling for faster results |
| |
| ### π§ Supported Formats & Limits: |
| - **CSV/TSV**: Up to 100MB |
| - **Excel (XLSX/XLS)**: Up to 100MB |
| - **JSON**: Flat or nested structures |
| - **Parquet**: High-performance columnar format |
| |
| ### π Support & Contact: |
| - π± WhatsApp: +8801719296601 |
| - π§ Email: https://tinyurl.com/email-for-contact |
| - π Response Time: Within 24 hours |
| """) |
|
|
if __name__ == "__main__":
    # Configure the request queue first, then start the server.
    queue_settings = {
        "max_size": 20,
        "default_concurrency_limit": 5,
        "api_open": False,
    }
    app.queue(**queue_settings)

    # share=True asks Gradio to create a public share link for the app.
    app.launch(share=True)