shrutisd1003 committed
Commit 3511f77
1 Parent(s): 5234546

added categorical to numerical feature
Experimentation/Experiments.py CHANGED
@@ -1,60 +1,24 @@
- from litellm import completion
- from dotenv import load_dotenv
  import os
  import pandas as pd
- from python_interpreter import PythonInterpreter, run_interpreter
- from data_code_run import DataCodeRun

- load_dotenv() # take environment variables from .env.
- os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
-
-
- def LLM_summary():
-     file_path = './test_data.csv'
-     df = pd.read_csv(file_path)
-
-     string_data= df.to_string(index=False)
-
-     # Get column names
-     column_names = ", ".join(df.columns.tolist())
-
-     # Get data types
-     data_types = ", ".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
-
-     # Get number of rows and columns
-     num_rows, num_cols = df.shape
-
-     # Construct the dataset information string
-     info_string = f"Dataset Information:\n"
-     info_string += f"Columns: {column_names}\n"
-     info_string += f"Data Types: {data_types}\n"
-     info_string += f"Number of Rows: {num_rows}\n"
-     info_string += f"Number of Columns: {num_cols}\n"
-
-     message = f'''
-     You are a data analyser agent working with a given dataset.
-     Below is the info about the dataset -
-     ========
-     {info_string}
-     ========
-
-     Your task -
-     Write a summary report of the dataset. You have to explain what the dataset is about and what kind of information could be gained from the dataset.
-
-     Do not infer any data based on previous training, strictly use only source text given below as input.
-     '''
-     output = completion(
-         model="gemini/gemini-pro",
-         messages=[
-             {"role": "user", "content": message}
-         ]
-     )
-
-     print(output.choices[0].message.content)
-
-
- LLM_summary()

  import os
  import pandas as pd
+ import streamlit as st
+ import numpy as np
+
+ def categorical_to_numerical(data):
+     st.write(data.head())
+     st.subheader("Convert Categorical to Numerical")
+     columns_to_encode = st.multiselect('Choose columns to convert', data.select_dtypes(include=object).columns)
+     if st.button('Convert'):
+         for col in columns_to_encode:
+             one_hot_encoded = pd.get_dummies(data[col], prefix=col).astype(int)
+             data = pd.concat([data, one_hot_encoded], axis=1)
+             data.drop(col, axis=1, inplace=True)
+             # data = pd.DataFrame(one_hot_encoded)
+         st.success("Converted categoricals variables")
+         # data.to_csv("data.csv", index=False)
+     st.write(data.head())
+     st.write(data.describe())
+     return data
+
+ data = pd.read_csv("data.csv")
+ data = categorical_to_numerical(data)
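
For reference, a minimal sketch (not part of the commit) of what the pd.get_dummies pattern above produces on a toy DataFrame; the column names and values here are illustrative:

import pandas as pd

# Toy frame with one categorical column (illustrative data)
df = pd.DataFrame({"city": ["Delhi", "Mumbai", "Delhi"], "price": [10, 20, 15]})

# Same pattern as categorical_to_numerical: encode, concatenate, drop the original column
encoded = pd.get_dummies(df["city"], prefix="city").astype(int)
df = pd.concat([df, encoded], axis=1).drop("city", axis=1)

print(df)
#    price  city_Delhi  city_Mumbai
# 0     10           1            0
# 1     20           0            1
# 2     15           1            0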
 
Modules/data_analyzer.py CHANGED
@@ -41,9 +41,9 @@ class DataAnalyzer:
  def count_plot(self, column_name):
      st.write(column_name)
-     unique_values = self.data[column_name].nunique()
      fig, ax = plt.subplots(figsize=(9, 5))
-     if unique_values <= 12:
          sns.countplot(data=self.data, x=column_name, ax=ax)
      else:
          sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)

  def count_plot(self, column_name):
      st.write(column_name)
+     unique_values_ratio = self.data[column_name].nunique() / len(self.data)
      fig, ax = plt.subplots(figsize=(9, 5))
+     if unique_values_ratio <= 0.3:
          sns.countplot(data=self.data, x=column_name, ax=ax)
      else:
          sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
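
Switching from a fixed unique-value count to a ratio lets the count-plot/histogram heuristic scale with dataset size. A small standalone sketch of the decision; the helper name and example data are illustrative, only the 0.3 threshold comes from the change:

import pandas as pd

def plot_kind(series: pd.Series, ratio_threshold: float = 0.3) -> str:
    # Count plot for low-cardinality columns, histogram otherwise
    unique_ratio = series.nunique() / len(series)
    return "countplot" if unique_ratio <= ratio_threshold else "histplot"

print(plot_kind(pd.Series(["a", "b", "a", "c"] * 25)))  # 3 unique / 100 rows = 0.03 -> countplot
print(plot_kind(pd.Series(range(100))))                 # 100 unique / 100 rows = 1.0 -> histplot
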
Modules/data_transformer.py CHANGED
@@ -26,15 +26,20 @@ class DataTransformer:
          self.data.to_csv("data.csv", index=False)
          st.subheader("Impute Null Values")
          col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
-         option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
          if st.button('Impute Null'):
-             if option == "mean":
-                 self.data[col] = self.data[col].fillna(self.data[col].mean())
-             elif option == "mode":
-                 self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0]) # mode() returns a DataFrame, so we select the first row
-             elif option == "0":
-                 self.data[col] = self.data[col].fillna(0)
-             st.success("Null values filled")
          self.data.to_csv("data.csv", index=False)
      with right:
          st.write("Null Stats")

          self.data.to_csv("data.csv", index=False)
          st.subheader("Impute Null Values")
          col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
+         option = st.selectbox('Impute nulls with', ('-Select-','mean', 'mode', '0'))
          if st.button('Impute Null'):
+             try:
+                 if option == "mean":
+                     self.data[col] = self.data[col].fillna(self.data[col].mean())
+                 elif option == "mode":
+                     self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0])
+                 elif option == "0":
+                     self.data[col] = self.data[col].fillna(0)
+                 elif option == "-Select-":
+                     raise ValueError("Select a value")
+                 st.success("Null values filled")
+             except ValueError as e:
+                 st.error(str(e))
          self.data.to_csv("data.csv", index=False)
      with right:
          st.write("Null Stats")

@@ -47,15 +52,30 @@ class DataTransformer:
              null_percentage = null_percentages[column_name]
              columns_stats.append({
                  'Column Name': column_name,
-                 'Percentage Null': str(np.round(null_percentage, 2)) + " %"
              })
          null_stats_df = pd.DataFrame(columns_stats)
          st.dataframe(null_stats_df, hide_index=True, use_container_width=True)
-         st.write("Total percentage of null values:", np.round((total_null / (total_rows * self.data.shape[1])) * 100, 2), "%")
          return self.data

      def remove_columns(self):
-         st.header("Remove Columns")
          col = st.multiselect('Choose columns to remove', self.data.columns)
          if st.button('Remove Columns'):
              self.data.drop(columns=col, inplace=True)

              null_percentage = null_percentages[column_name]
              columns_stats.append({
                  'Column Name': column_name,
+                 '% Null': str(np.round(null_percentage, 2)) + " %"
              })
          null_stats_df = pd.DataFrame(columns_stats)
          st.dataframe(null_stats_df, hide_index=True, use_container_width=True)
+         st.write("Total percentage of nulls:", np.round((total_null / (total_rows * self.data.shape[1])) * 100, 2), "%")
+         st.write("Total number of rows:", self.data.shape[0])
+         st.write("Total number of columns:", self.data.shape[1])
+         return self.data
+
+     def categorical_to_numerical(self):
+         st.subheader("Convert Categorical to Numerical")
+         columns_to_encode = st.multiselect('Choose columns to convert', self.data.select_dtypes(include=object).columns)
+         if st.button('Convert'):
+             for col in columns_to_encode:
+                 one_hot_encoded = pd.get_dummies(self.data[col], prefix=col).astype(int)
+                 self.data = pd.concat([self.data, one_hot_encoded], axis=1)
+                 self.data.drop(col, axis=1, inplace=True)
+             st.success("Converted categoricals variables")
+             self.data.to_csv("data.csv", index=False)
+         st.write(self.data.head())
          return self.data

      def remove_columns(self):
+         st.subheader("Remove Columns")
          col = st.multiselect('Choose columns to remove', self.data.columns)
          if st.button('Remove Columns'):
              self.data.drop(columns=col, inplace=True)

@@ -66,8 +86,8 @@ class DataTransformer:
  #transformed data is not retained
  #null values handling
  #2 options - to remove or to impute that is the question

  # PROBLEMS TO BE ADDRESSED
- #categorical to numerical
  #give option to analyse the transformed dataset or save it.

  #transformed data is not retained
  #null values handling
  #2 options - to remove or to impute that is the question
+ #categorical to numerical

  # PROBLEMS TO BE ADDRESSED
  #give option to analyse the transformed dataset or save it.
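
Stripped of the Streamlit widgets, the imputation branch in handle_null reduces to a fillna over the selected numeric columns. A minimal sketch of the three options; the helper name and sample data below are illustrative:

import numpy as np
import pandas as pd

def impute(df: pd.DataFrame, cols: list, option: str) -> pd.DataFrame:
    # Mirrors the mean / mode / 0 choices offered in handle_null
    if option == "mean":
        df[cols] = df[cols].fillna(df[cols].mean())
    elif option == "mode":
        df[cols] = df[cols].fillna(df[cols].mode().iloc[0])  # mode() returns a DataFrame; take its first row
    elif option == "0":
        df[cols] = df[cols].fillna(0)
    else:
        raise ValueError("Select a value")
    return df

df = pd.DataFrame({"age": [25.0, np.nan, 35.0], "fare": [7.5, 8.0, np.nan]})
print(impute(df, ["age", "fare"], "mean"))  # NaNs become the column means: age 30.0, fare 7.75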
 
app.py CHANGED
@@ -46,10 +46,8 @@ def main():

          # modified_data = data_transformer.perform_column_operation()
          data = data_transformer.handle_null()
-
-
-         # modified_data = data_transformer.remove_columns()
-
          # data_filter = DataFilter(modified_data)
          # data = data_filter.filter_rows()

          # modified_data = data_transformer.perform_column_operation()
          data = data_transformer.handle_null()
+         data = data_transformer.categorical_to_numerical()
+         data = data_transformer.remove_columns()
          # data_filter = DataFilter(modified_data)
          # data = data_filter.filter_rows()

@@ -61,9 +59,18 @@ def main():
          # --- DATA PARTY ---
          if selected == "Data Party":
              st.write("To be continued... :)")
      except:
          st.write("Please upload a csv file")


  if __name__ == "__main__":
      main()

          # --- DATA PARTY ---
          if selected == "Data Party":
              st.write("To be continued... :)")
+
      except:
          st.write("Please upload a csv file")


  if __name__ == "__main__":
      main()
+
+
+ # TO DO:
+ # 1. automate categorical to numerical conversion
+ # 2. toggle btn for data (original and modified)
+ # 3. ask to save modified data before saving
+ # 4. streamline prompts in llm_summary
+ # 5. ml models
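
On TO DO item 1, one possible direction is to encode only low-cardinality object columns automatically; a sketch under that assumption, with an illustrative helper name and threshold:

import pandas as pd

def auto_encode(df: pd.DataFrame, max_unique: int = 10) -> pd.DataFrame:
    # Encode each low-cardinality object column; leave high-cardinality ones (e.g. names, IDs) untouched
    for col in df.select_dtypes(include=object).columns:
        if df[col].nunique() <= max_unique:
            encoded = pd.get_dummies(df[col], prefix=col).astype(int)
            df = pd.concat([df, encoded], axis=1).drop(col, axis=1)
    return df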