Spaces:

baconnier
/

csv-plus-plus

Running

App Files Community

baconnier committed on Oct 26, 2024

Commit

b2f41cc

verified ·

1 Parent(s): 8e3aecb

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -30

app.py CHANGED Viewed

@@ -22,6 +22,7 @@ class DataAnalyzer:
         if df is None:
             return "Please upload a dataset first"
         report = sv.analyze(df)
         report_path = os.path.join(self.temp_dir, "report.html")
         report.show_html(report_path, open_browser=False)
@@ -43,7 +44,7 @@ class DataAnalyzer:
         os.remove(report_path)
         return html_with_table
     def generate_autoviz_report(self, df):
         if df is None:
             return "Please upload a dataset first"
@@ -52,34 +53,69 @@ class DataAnalyzer:
         if os.path.exists(viz_temp_dir):
             shutil.rmtree(viz_temp_dir)
         os.makedirs(viz_temp_dir)
         try:
-            # Sample data if it's too large
             if len(df) > 5000:
                 df = df.sample(n=5000, random_state=42)
-            plt.close('all')  # Close any existing plots
-            # Configure AutoViz with correct parameters
             dfte = self.AV.AutoViz(
                 filename='',
                 sep=',',
-                depVar='',  # No target variable
-                dfte=df,    # Pass the dataframe directly
                 header=0,
-                verbose=1,  # Set to 1 to see progress
                 lowess=False,
-                chart_format='html',
                 max_rows_analyzed=5000,
                 max_cols_analyzed=30,
                 save_plot_dir=viz_temp_dir
             )
-            # Collect and combine HTML files
             html_parts = []
             if os.path.exists(viz_temp_dir):
                 for file in sorted(os.listdir(viz_temp_dir)):
-                    if file.endswith('.html'):
                         file_path = os.path.join(viz_temp_dir, file)
                         try:
                             with open(file_path, 'r', encoding='utf-8') as f:
@@ -88,45 +124,57 @@ class DataAnalyzer:
                                     html_parts.append(content)
                         except Exception as e:
                             print(f"Error reading file {file}: {str(e)}")
             if not html_parts:
-                return """
                 <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
                     <h3>No visualizations were generated</h3>
                     <p>This might be due to:</p>
                     <ul>
-                        <li>Data format issues</li>
-                        <li>Too few unique values in columns</li>
-                        <li>All categorical data with high cardinality</li>
                     </ul>
-                    <p>Try with a different dataset or check your data formatting.</p>
                 </div>
                 """
-            # Combine all HTML content with proper styling
             combined_html = f"""
             <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
                 <h2 style="text-align: center;">AutoViz Analysis Report</h2>
-                <p style="text-align: center;">Analysis of {len(df)} rows and {len(df.columns)} columns</p>
                 <hr>
                 {'<hr>'.join(html_parts)}
             </div>
             """
             return combined_html
         except Exception as e:
             error_message = f"""
             <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
                 <h3>Error in AutoViz Analysis</h3>
                 <p>Error details: {str(e)}</p>
-                <p>Troubleshooting steps:</p>
-                <ul>
-                    <li>Check if your data contains valid numerical or categorical values</li>
-                    <li>Ensure there are no completely empty columns</li>
-                    <li>Try with a smaller dataset</li>
-                    <li>Check for any special characters in column names</li>
-                </ul>
             </div>
             """
             return error_message
@@ -134,7 +182,6 @@ class DataAnalyzer:
             if os.path.exists(viz_temp_dir):
                 shutil.rmtree(viz_temp_dir)
 def create_interface():
     analyzer = DataAnalyzer()

         if df is None:
             return "Please upload a dataset first"
+        self.df = df
         report = sv.analyze(df)
         report_path = os.path.join(self.temp_dir, "report.html")
         report.show_html(report_path, open_browser=False)
         os.remove(report_path)
         return html_with_table
     def generate_autoviz_report(self, df):
         if df is None:
             return "Please upload a dataset first"
         if os.path.exists(viz_temp_dir):
             shutil.rmtree(viz_temp_dir)
         os.makedirs(viz_temp_dir)
         try:
+            # Data preprocessing
+            df = df.copy()
+            # Handle datetime columns
+            for col in df.columns:
+                try:
+                    df[col] = pd.to_datetime(df[col], errors='ignore')
+                except:
+                    pass
+            datetime_columns = df.select_dtypes(include=['datetime64']).columns
+            for col in datetime_columns:
+                df[f'{col}_year'] = df[col].dt.year
+                df[f'{col}_month'] = df[col].dt.month
+                df = df.drop(columns=[col])
+            # Try to convert string columns to numeric where possible
+            for col in df.select_dtypes(include=['object']).columns:
+                try:
+                    df[col] = pd.to_numeric(df[col], errors='ignore')
+                except:
+                    pass
+            # Convert remaining string columns to categorical if cardinality is low
+            object_columns = df.select_dtypes(include=['object']).columns
+            for col in object_columns:
+                if df[col].nunique() < 50:
+                    df[col] = df[col].astype('category')
+            # Sample data if needed
             if len(df) > 5000:
                 df = df.sample(n=5000, random_state=42)
+            # Print data info for debugging
+            print("\nDataset Info:")
+            print(df.info())
+            print("\nColumn Types:")
+            print(df.dtypes)
+            plt.close('all')
+            # Run AutoViz
             dfte = self.AV.AutoViz(
                 filename='',
                 sep=',',
+                depVar='',
+                dfte=df,
                 header=0,
+                verbose=1,
                 lowess=False,
+                chart_format='svg',
                 max_rows_analyzed=5000,
                 max_cols_analyzed=30,
                 save_plot_dir=viz_temp_dir
             )
+            # Collect visualizations
             html_parts = []
             if os.path.exists(viz_temp_dir):
                 for file in sorted(os.listdir(viz_temp_dir)):
+                    if file.endswith('.html') or file.endswith('.svg'):
                         file_path = os.path.join(viz_temp_dir, file)
                         try:
                             with open(file_path, 'r', encoding='utf-8') as f:
                                     html_parts.append(content)
                         except Exception as e:
                             print(f"Error reading file {file}: {str(e)}")
             if not html_parts:
+                return f"""
                 <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
+                    <h3>Data Summary</h3>
+                    <p>Total Rows: {len(df)}</p>
+                    <p>Total Columns: {len(df.columns)}</p>
+                    <p>Column Types:</p>
+                    <pre>{df.dtypes.to_string()}</pre>
+                    <hr>
                     <h3>No visualizations were generated</h3>
                     <p>This might be due to:</p>
                     <ul>
+                        <li>All columns being categorical with high cardinality</li>
+                        <li>No numeric columns for analysis</li>
+                        <li>Data format not suitable for visualization</li>
                     </ul>
                 </div>
                 """
             combined_html = f"""
             <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
                 <h2 style="text-align: center;">AutoViz Analysis Report</h2>
+                <div style="margin: 20px;">
+                    <h3>Dataset Summary</h3>
+                    <p>Rows analyzed: {len(df)}</p>
+                    <p>Columns: {len(df.columns)}</p>
+                    <p>Column Types:</p>
+                    <pre>{df.dtypes.to_string()}</pre>
+                </div>
                 <hr>
                 {'<hr>'.join(html_parts)}
             </div>
             """
             return combined_html
         except Exception as e:
+            import traceback
             error_message = f"""
             <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
                 <h3>Error in AutoViz Analysis</h3>
                 <p>Error details: {str(e)}</p>
+                <p>Stack trace:</p>
+                <pre>{traceback.format_exc()}</pre>
+                <p>Dataset Info:</p>
+                <pre>
+                Rows: {len(df)}
+                Columns: {len(df.columns)}
+                Types:\n{df.dtypes.to_string()}
+                </pre>
             </div>
             """
             return error_message
             if os.path.exists(viz_temp_dir):
                 shutil.rmtree(viz_temp_dir)
 def create_interface():
     analyzer = DataAnalyzer()