Spaces:

MicroHealth
/

autodata-visualizer

Paused

App Files Files Community

bluenevus committed on Apr 12

Commit

2e5c04a

verified ·

1 Parent(s): 05370c5

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -22

app.py CHANGED Viewed

@@ -22,20 +22,23 @@ def process_file(file, instructions, api_key):
             Analyze the following dataset and instructions:
             Data columns: {list(df.columns)}
             Instructions: {instructions}
-            Based on this, create 3 appropriate visualizations. For each visualization, provide:
-            1. A title
-            2. The most suitable plot type (choose from: bar, line, scatter, hist)
-            3. The column to use for the x-axis
-            4. The column(s) to use for the y-axis (can be a list for multiple columns, or None for histograms)
-            5. Any necessary data preprocessing steps (e.g., grouping, sorting, etc.)
             Return your response as a Python list of dictionaries:
             [
-                {{"title": "...", "plot_type": "...", "x": "...", "y": "...", "preprocessing": "..."}},
-                {{"title": "...", "plot_type": "...", "x": "...", "y": "...", "preprocessing": "..."}},
-                {{"title": "...", "plot_type": "...", "x": "...", "y": "...", "preprocessing": "..."}}
             ]
         """)
@@ -56,29 +59,41 @@ def process_file(file, instructions, api_key):
         for plot in plots[:3]:  # Ensure max 3 plots
             fig, ax = plt.subplots(figsize=(10, 6))
-            # Apply preprocessing
             plot_df = df.copy()
-            if 'Group data by' in plot['preprocessing']:
-                group_by = plot['x']
-                agg_column = plot['y'][0] if isinstance(plot['y'], list) else plot['y']
-                plot_df = plot_df.groupby(group_by)[agg_column].sum().reset_index()
-            if 'Sort' in plot['preprocessing']:
-                plot_df = plot_df.sort_values(by=plot['y'][0] if isinstance(plot['y'], list) else plot['y'], ascending=False)
-            if 'Filter to keep only the top 5' in plot['preprocessing']:
-                plot_df = plot_df.head(5)
             if plot['plot_type'] == 'bar':
                 plot_df.plot(kind='bar', x=plot['x'], y=plot['y'], ax=ax)
             elif plot['plot_type'] == 'line':
                 plot_df.plot(kind='line', x=plot['x'], y=plot['y'], ax=ax)
             elif plot['plot_type'] == 'scatter':
-                plot_df.plot(kind='scatter', x=plot['x'], y=plot['y'], ax=ax)
             elif plot['plot_type'] == 'hist':
-                plot_df[plot['x']].hist(ax=ax)
             ax.set_title(plot['title'])
-            ax.set_xlabel(plot['x'])
-            ax.set_ylabel(plot['y'][0] if isinstance(plot['y'], list) else plot['y'])
             plt.tight_layout()
             buf = io.BytesIO()

             Analyze the following dataset and instructions:
             Data columns: {list(df.columns)}
+            Data shape: {df.shape}
             Instructions: {instructions}
+            Based on this, create 3 appropriate visualizations that provide meaningful insights. For each visualization:
+            1. Choose the most suitable plot type (bar, line, scatter, hist, pie, heatmap)
+            2. Determine appropriate data aggregation (e.g., top 5 categories, monthly averages)
+            3. Select relevant columns for x-axis, y-axis, and any additional dimensions (color, size)
+            4. Provide a clear, concise title that explains the insight
+            Consider data density and choose visualizations that simplify and clarify the information.
+            Limit the number of data points displayed to ensure readability (e.g., top 5, top 10).
             Return your response as a Python list of dictionaries:
             [
+                {{"title": "...", "plot_type": "...", "x": "...", "y": "...", "agg_func": "...", "top_n": ..., "additional": {{"color": "...", "size": "..."}}}},
+                {{"title": "...", "plot_type": "...", "x": "...", "y": "...", "agg_func": "...", "top_n": ..., "additional": {{"color": "...", "size": "..."}}}},
+                {{"title": "...", "plot_type": "...", "x": "...", "y": "...", "agg_func": "...", "top_n": ..., "additional": {{"color": "...", "size": "..."}}}
             ]
         """)
         for plot in plots[:3]:  # Ensure max 3 plots
             fig, ax = plt.subplots(figsize=(10, 6))
+            # Apply preprocessing and aggregation
             plot_df = df.copy()
+            if plot['agg_func'] == 'sum':
+                plot_df = plot_df.groupby(plot['x'])[plot['y']].sum().reset_index()
+            elif plot['agg_func'] == 'mean':
+                plot_df = plot_df.groupby(plot['x'])[plot['y']].mean().reset_index()
+            elif plot['agg_func'] == 'count':
+                plot_df = plot_df.groupby(plot['x']).size().reset_index(name=plot['y'])
+            if 'top_n' in plot and plot['top_n']:
+                plot_df = plot_df.nlargest(plot['top_n'], plot['y'])
             if plot['plot_type'] == 'bar':
                 plot_df.plot(kind='bar', x=plot['x'], y=plot['y'], ax=ax)
             elif plot['plot_type'] == 'line':
                 plot_df.plot(kind='line', x=plot['x'], y=plot['y'], ax=ax)
             elif plot['plot_type'] == 'scatter':
+                plot_df.plot(kind='scatter', x=plot['x'], y=plot['y'], ax=ax,
+                             c=plot['additional'].get('color'), s=plot_df[plot['additional'].get('size', 'y')])
             elif plot['plot_type'] == 'hist':
+                plot_df[plot['x']].hist(ax=ax, bins=20)
+            elif plot['plot_type'] == 'pie':
+                plot_df.plot(kind='pie', y=plot['y'], labels=plot_df[plot['x']], ax=ax, autopct='%1.1f%%')
+            elif plot['plot_type'] == 'heatmap':
+                pivot_df = plot_df.pivot(index=plot['x'], columns=plot['additional']['color'], values=plot['y'])
+                ax.imshow(pivot_df, cmap='YlOrRd')
+                ax.set_xticks(range(len(pivot_df.columns)))
+                ax.set_yticks(range(len(pivot_df.index)))
+                ax.set_xticklabels(pivot_df.columns)
+                ax.set_yticklabels(pivot_df.index)
             ax.set_title(plot['title'])
+            if plot['plot_type'] != 'pie':
+                ax.set_xlabel(plot['x'])
+                ax.set_ylabel(plot['y'])
             plt.tight_layout()
             buf = io.BytesIO()