shrutisd1003 committed
Commit 0459303
1 Parent(s): 8e5dec6

automated visualizations

.gitignore CHANGED
@@ -1,5 +1,6 @@
 .aider*
 # Byte-compiled / optimized / DLL files
+Modules/__pycache__/
 __pycache__/
 *.py[cod]
 *$py.class
Experimentation/visualizations.py ADDED
@@ -0,0 +1,68 @@
+import streamlit as st
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from litellm import completion
+from dotenv import load_dotenv
+import os
+import pandas as pd
+
+load_dotenv() # take environment variables from .env.
+os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
+
+def define_viz():
+    info = get_info()
+
+    message = f'''
+    You are a data analyst working with a given dataset. Below is the column-wise information about the dataset:
+    {info}
+
+    Each line represents a column name followed by its respective information or statistics. Columns are separated by "*****".
+
+    Your task:
+    - Analyze the dataset to determine the appropriate visualization for each column.
+    - Generate ONLY a Python dictionary where the key is the column name and the value is the visualization suitable for the column.
+    - You can use BAR PLOT, HISTOGRAMS and PIE CHARTS.
+    - Assign the value "NA" to columns that CANNOT have a meaningful count plot, such as ID columns or columns with UNIQUE VALUES FOR EACH ENTRY.
+
+    '''
+    output = completion(
+        model="gemini/gemini-pro",
+        messages=[
+            {"role": "user", "content": message}
+        ]
+    )
+
+    return output.choices[0].message.content
+
+def get_info():
+    file_path = './test_data.csv'
+    data = pd.read_csv(file_path)
+
+    numeric_cols = data.describe()
+    non_numeric_cols = data.describe(include=object)
+
+    formatted_str = ""
+
+    # For numeric columns
+    for col in numeric_cols.columns:
+        formatted_str += f"{col}\n"
+        for stat in numeric_cols.index:
+            formatted_str += f"{stat} = {numeric_cols.loc[stat, col]}\n"
+        formatted_str += "\n*****\n\n"
+
+    # For non-numeric columns
+    for col in non_numeric_cols.columns:
+        formatted_str += f"{col}\n"
+        for stat in non_numeric_cols.index:
+            formatted_str += f"{stat} = {non_numeric_cols.loc[stat, col]}\n"
+        formatted_str += "\n*****\n\n"
+
+    return formatted_str
+
+def main():
+    print(define_viz())
+
+if __name__ == "__main__":
+    main()
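Reviewer note: define_viz() asks the model to reply with ONLY a Python dictionary, but what comes back from completion() is still free-form text. Below is a minimal sketch of how a caller might turn that reply into an actual dict; the brace-matching regex, the ast.literal_eval step, and the parse_viz_dict helper name are assumptions for illustration, not part of this commit.

import ast
import re

def parse_viz_dict(reply: str) -> dict:
    # Hypothetical helper: pull the first {...} literal out of an LLM reply,
    # even if the model wrapped it in extra prose or a code fence.
    match = re.search(r"\{.*\}", reply, re.DOTALL)
    if not match:
        raise ValueError("No dictionary literal found in the model reply")
    # literal_eval only evaluates Python literals, so arbitrary code is not executed.
    return ast.literal_eval(match.group(0))

# Example with a reply shaped the way the prompt requests:
reply = '{"age": "HISTOGRAM", "gender": "PIE CHART", "customer_id": "NA"}'
print(parse_viz_dict(reply))  # {'age': 'HISTOGRAM', 'gender': 'PIE CHART', 'customer_id': 'NA'}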
Modules/__init__.py DELETED
File without changes
Modules/data_analyzer.py CHANGED
@@ -9,10 +9,12 @@ class DataAnalyzer:
         self.data = data
         st.header("Exploratory Data Analysis")

-    def show_eda(self):
+    def show_llm_summary(self):
         st.subheader("Summary")
         summary = LLM_summary()
-        st.write(summary)
+        st.write(summary)
+
+    def show_eda(self):
         st.write("Number of rows:", self.data.shape[0])
         st.write("Number of columns:", self.data.shape[1])
         null_counts = self.data.isnull().sum()
@@ -21,7 +23,7 @@
         null_percentages = (null_counts / total_rows) * 100
         columns_stats = []
         for column_name in self.data.columns:
-            dtype = self.data[column_name].dtype
+            dtype = str(self.data[column_name].dtype)
             null_count = null_counts[column_name]
             null_percentage = null_percentages[column_name]
             columns_stats.append({
@@ -40,14 +42,16 @@
         st.write(self.data.describe(include=object))

     def count_plot(self, column_name):
-        st.write(column_name)
-        unique_values_ratio = self.data[column_name].nunique() / len(self.data)
-        fig, ax = plt.subplots(figsize=(9, 5))
-        if unique_values_ratio <= 0.3:
-            sns.countplot(data=self.data, x=column_name, ax=ax)
-        else:
-            sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
-        st.pyplot(fig)
+        unique_values = self.data[column_name].nunique()
+        unique_values_ratio = unique_values / len(self.data)
+        if unique_values_ratio != 1 and unique_values != 1:
+            st.write(column_name)
+            fig, ax = plt.subplots(figsize=(9, 5))
+            if unique_values_ratio <= 0.3:
+                sns.countplot(data=self.data, x=column_name, ax=ax)
+            else:
+                sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
+            st.pyplot(fig)

     def show_count_plots(self):
         st.subheader("Count Plots")
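The reworked count_plot() now skips columns whose values are either all unique (an ID-like column) or all identical, and keeps the 0.3 cardinality cutoff for choosing a count plot over a histogram. Here is a standalone sketch of the same heuristic outside Streamlit, using a made-up DataFrame and saving figures to disk instead of calling st.pyplot:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame({
    "id": range(100),                              # every value unique -> skipped
    "city": ["A", "B", "C", "D"] * 25,             # low cardinality -> count plot
    "income": (pd.Series(range(100)) // 2) * 1.7,  # many distinct values -> histogram
})

for column in df.columns:
    unique_values = df[column].nunique()
    ratio = unique_values / len(df)
    if ratio == 1 or unique_values == 1:
        continue  # same skip condition as the new count_plot, written as its negation
    fig, ax = plt.subplots(figsize=(9, 5))
    if ratio <= 0.3:
        sns.countplot(data=df, x=column, ax=ax)
    else:
        sns.histplot(data=df, x=column, bins=20, ax=ax)
    fig.savefig(f"{column}.png")
    plt.close(fig)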
Modules/data_transformer.py CHANGED
@@ -5,7 +5,6 @@ import numpy as np
 class DataTransformer:
     def __init__(self, data):
         self.data = data
-

     def perform_column_operation(self):
         column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
@@ -15,22 +14,42 @@
         st.write(self.data)
         return self.data

+    def handle_null_remove(self,col):
+        self.data.dropna(subset=col, inplace=True)
+        self.data.to_csv("data.csv", index=False)
+        return self.data
+
+    def handle_null_impute(self,col,option):
+        if option == "mean":
+            self.data[col] = self.data[col].fillna(self.data[col].mean())
+        elif option == "mode":
+            self.data[col] = self.data[col].fillna(self.data[col].mode())
+        elif option == "0":
+            self.data[col] = self.data[col].fillna(0)
+        elif option == "-Select-":
+            raise ValueError("Select an option")
+        self.data.to_csv("data.csv", index=False)
+        return self.data
+
     def handle_null(self):
         left, right = st.columns([2,1])
         with left:
             st.subheader("Remove Null Values")
             col = st.multiselect('Choose columns to remove nulls', self.data.columns)
             if st.button('Remove Null'):
-                self.handle_null_remove(col)
-                st.success("Null values removed")
+                try:
+                    self.handle_null_remove(col)
+                    st.success("Null values removed")
+                except Exception as e:
+                    st.error(str(e))
             st.subheader("Impute Null Values")
             col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
             option = st.selectbox('Impute nulls with', ('-Select-','mean', 'mode', '0'))
             if st.button('Impute Null'):
                 try:
-                    self.handle_null_impute(col,option)
+                    self.handle_null_impute(col, option)
                     st.success("Null values filled")
-                except ValueError as e:
+                except Exception as e:
                     st.error(str(e))
         with right:
             st.write("Null Stats")
@@ -56,7 +75,10 @@
         st.subheader("Convert Categorical to Numerical")
         columns_to_encode = st.multiselect('Choose columns to convert', self.data.select_dtypes(include=object).columns)
         if st.button('Convert'):
-            self.categorical_to_numerical_func(columns_to_encode)
+            for col in columns_to_encode:
+                one_hot_encoded = pd.get_dummies(self.data[col], prefix=col).astype(int)
+                self.data = pd.concat([self.data, one_hot_encoded], axis=1)
+                self.data.drop(col, axis=1, inplace=True)
             st.success("Converted categoricals variables")
             st.write(self.data.head())
         return self.data
@@ -65,46 +87,16 @@
         st.subheader("Remove Columns")
         col = st.multiselect('Choose columns to remove', self.data.columns)
         if st.button('Remove Columns'):
-            self.remove_columns_func(col)
+            self.data.drop(columns=col, inplace=True)
+            self.data.to_csv("data.csv", index=False)
             st.success("Columns removed")
         return self.data

-
-    #---CORE FUNCTIONALITY---
-    def remove_columns_func(self,col):
-        self.data.drop(columns=col, inplace=True)
-        self.data.to_csv("data.csv", index=False)
-        return self.data
-
-    def handle_null_remove(self,col):
-        self.data.dropna(subset=col, inplace=True)
-        print(self.data)
-        self.data.to_csv("data.csv", index=False)
-
-    def handle_null_impute(self,col,option):
-        if option == "mean":
-            self.data[col] = self.data[col].fillna(self.data[col].mean())
-        elif option == "mode":
-            self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0])
-        elif option == "0":
-            self.data[col] = self.data[col].fillna(0)
-        elif option == "-Select-":
-            raise ValueError("Select an option")
-        self.data.to_csv("data.csv", index=False)
-
-
-    def categorical_to_numerical_func(self,columns_to_encode):
-        for col in columns_to_encode:
-            one_hot_encoded = pd.get_dummies(self.data[col], prefix=col).astype(int)
-            self.data = pd.concat([self.data, one_hot_encoded], axis=1)
-            self.data.drop(col, axis=1, inplace=True)
-            self.data.to_csv("data.csv", index=False)
-
     # PROBLEMS RESOLVED
     #transformed data is not retained
     #null values handling
     #2 options - to remove or to impute that is the question
     #categorical to numerical

-    # PROBLEMS TO BE ADDRESSED
+    # PROBLEMS TO BE ADDRESSED
     #give option to analyse the transformed dataset or save it.
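One behavioural detail worth flagging in the relocated handle_null_impute: it now fills with self.data[col].mode(), while the deleted helper used .mode().iloc[0]. When col is a list of columns, DataFrame.mode() returns a DataFrame of ranked modes, and fillna then aligns it by row index and column instead of broadcasting a single value, so nulls outside the first few rows are left untouched. Below is a minimal sketch of per-column most-frequent imputation under that reading; the toy frame and column names are made up:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2, np.nan, np.nan], "b": [5.0, np.nan, 5.0, 7.0, np.nan]})
cols = ["a", "b"]

# Take the single most frequent value per column and broadcast it into the nulls.
fill_values = {c: df[c].mode().iloc[0] for c in cols}
df[cols] = df[cols].fillna(value=fill_values)
print(df)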
Modules/data_visualizer.py CHANGED
@@ -1,74 +1,76 @@
 import streamlit as st
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sns
+import re
+
+from litellm import completion
+from dotenv import load_dotenv
+import os
+
+load_dotenv() # take environment variables from .env.
+os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")

 class DataVisualizer:
     def __init__(self, data):
         self.data = data
         st.subheader("Data Visualizer")

-    def visualize_data(self):
-        plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
+    def suggestions(self):
+        message = f'''
+        You are a data analyst working with a given dataset. Below is the information about the dataset:
+        ========
+        {self.data.describe(include='all')}
+        ========

-        if plot_type == 'Histogram':
-            numeric_columns = self.data.select_dtypes(include=[np.number]).columns
-            if numeric_columns.empty:
-                st.warning('No numeric columns in the data to visualize.')
-            else:
-                column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
-                fig, ax = plt.subplots()
-                ax.hist(self.data[column_to_visualize])
-                ax.set_title(f'Histogram of {column_to_visualize}')
-                ax.set_xlabel(column_to_visualize)
-                ax.set_ylabel('Frequency')
-                st.pyplot(fig)
+        Here is a sample of the data:
+        {self.data.head()}

-        elif plot_type == 'Box Plot':
-            numeric_columns = self.data.select_dtypes(include=[np.number]).columns
-            if numeric_columns.empty:
-                st.warning('No numeric columns in the data to visualize.')
-            else:
-                column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
-                fig, ax = plt.subplots()
-                ax.boxplot(self.data[column_to_visualize].dropna())
-                ax.set_title(f'Box Plot of {column_to_visualize}')
-                ax.set_ylabel(column_to_visualize)
-                st.pyplot(fig)
+        Number of rows in the dataset: {self.data.shape[0]}

-        elif plot_type == 'Pie Chart':
-            nonnumeric_columns = self.data.select_dtypes(include=['object']).columns
-            if nonnumeric_columns.empty:
-                st.warning('No non numeric columns in the data to visualize.')
-            else:
-                column_to_visualize = st.selectbox('Choose a column to visualize', nonnumeric_columns)
-                fig, ax = plt.subplots()
-                self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
-                ax.set_title(f'Pie Chart of {column_to_visualize}')
-                ax.set_ylabel('')
-                st.pyplot(fig)
-
-        elif plot_type == 'Scatter Plot':
-            left, right = st.columns(2)
-            with left:
-                x_col = st.selectbox('Choose values on X axis', self.data.select_dtypes(include=[np.number]).columns)
-            with right:
-                y_col = st.selectbox('Choose values on Y axis', self.data.select_dtypes(include=[np.number]).columns)
-            if x_col == y_col:
-                st.warning('Please select two different columns for scatter plot.')
+        Your task:
+        Suggest 5 visualizations that can be made in bullet points
+        '''
+        output = completion(
+            model="gemini/gemini-pro",
+            messages=[
+                {"role": "user", "content": message}
+            ]
+        )
+
+        output_str = output.choices[0].message.content
+        st.write("Here are some suggestions")
+        st.write(output_str)
+
+    def generate_viz(self):
+        graph = st.text_input("What graph do you want to generate?")
+        if graph:
+            message = f'''
+            You are a data analyst working with a given dataset. Below is the information about the dataset:
+            {self.data.describe(include='all')}
+
+            Here is a sample of the data:
+            {self.data.head()}
+
+            Your task:
+            Generate a python code to create the following visualization and show it in streamlit - {graph}
+            The data is stored in a csv file named "data.csv"
+            '''
+            output = completion(
+                model="gemini/gemini-pro",
+                messages=[
+                    {"role": "user", "content": message}
+                ]
+            )
+
+            output_str = output.choices[0].message.content
+
+            pattern = r'`python(.*?)`'
+            match = re.search(pattern, output_str, re.DOTALL)
+
+            if match:
+                code_block = match.group(1).strip()
             else:
-                fig, ax = plt.subplots()
-                ax.scatter(self.data[x_col], self.data[y_col])
-                ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
-                ax.set_xlabel(x_col)
-                ax.set_ylabel(y_col)
-                st.pyplot(fig)
-
-        elif plot_type == 'Heatmap':
-            numeric_data = self.data.select_dtypes(include=[np.number])
-            corr = numeric_data.corr()
-            fig, ax = plt.subplots()
-            sns.heatmap(corr, annot=True, ax=ax)
-            ax.set_title('Correlation Heatmap')
-            st.pyplot(fig)
+                code_block = output_str.strip()  # If no code block found, assume entire text is code
+
+            try:
+                exec(code_block)
+            except Exception as e:
+                print(e)
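generate_viz() extracts code from the model reply with the pattern r'`python(.*?)`'. If the model answers with a conventional fenced block delimited by triple backticks, that single-backtick pattern will not capture the body, and exec() then runs whatever raw text came back. Here is a hedged sketch of a stricter extraction step; the triple-backtick patterns and the explicit namespace handed to exec are suggestions, not what this commit ships:

import re

def extract_code(reply: str) -> str:
    # Prefer a ```python fenced block, then any fenced block, then fall back to raw text.
    for pattern in (r"```python\s*(.*?)```", r"```\s*(.*?)```"):
        match = re.search(pattern, reply, re.DOTALL)
        if match:
            return match.group(1).strip()
    return reply.strip()

reply = "Here you go:\n```python\nimport pandas as pd\nprint(pd.__version__)\n```"
code = extract_code(reply)
# Executing generated code is inherently risky; at minimum give it its own namespace.
exec(code, {"__name__": "__generated__"})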
app.py CHANGED
@@ -48,7 +48,9 @@ def main():
             data_analyzer.show_count_plots()

             data_visualizer = DataVisualizer(data)
-            data_visualizer.visualize_data()
+            data_visualizer.suggestions()
+            data_visualizer.generate_viz()
+            # data_visualizer.visualize_data()

         # --- DATA CLEANING ---
         if selected == "Data Cleaning":
@@ -110,10 +112,11 @@

         # --- DATA PARTY ---
         if selected == "Data Party":
-            st.write("To be Added)")
+            st.write("To be Added:)")

-    except:
-        st.write("Please upload a csv file")
+    except Exception as e:
+        # st.write("Please upload a csv file")
+        print(e)


 if __name__ == "__main__":
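The broadened except Exception as e in app.py prints the error to the server console, which is easy to miss when only the Streamlit page is visible. A small alternative sketch, assuming the intent is still to nudge the user toward uploading a CSV while surfacing the real error in the app; st.exception renders the traceback inline:

import streamlit as st

try:
    ...  # load the uploaded CSV and render the selected page
except Exception as e:
    st.warning("Please upload a csv file")
    st.exception(e)  # show the traceback in the app instead of the server console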
requirements.txt CHANGED
@@ -10,4 +10,5 @@ tabulate
 litellm
 streamlit_option_menu
 scikit-learn
-pytest
+pytest
+streamlit-modal