shrutisd1003 committed
Commit 3511f77
1 Parent(s): 5234546

added categorical to numerical feature
Experimentation/Experiments.py CHANGED
@@ -1,60 +1,24 @@
- from litellm import completion
- from dotenv import load_dotenv
  import os
  import pandas as pd
- from python_interpreter import PythonInterpreter, run_interpreter
- from data_code_run import DataCodeRun

- load_dotenv() # take environment variables from .env.
- os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
-
-
- def LLM_summary():
-     file_path = './test_data.csv'
-     df = pd.read_csv(file_path)
-
-     string_data= df.to_string(index=False)
-
-     # Get column names
-     column_names = ", ".join(df.columns.tolist())
-
-     # Get data types
-     data_types = ", ".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
-
-     # Get number of rows and columns
-     num_rows, num_cols = df.shape
-
-     # Construct the dataset information string
-     info_string = f"Dataset Information:\n"
-     info_string += f"Columns: {column_names}\n"
-     info_string += f"Data Types: {data_types}\n"
-     info_string += f"Number of Rows: {num_rows}\n"
-     info_string += f"Number of Columns: {num_cols}\n"
-
-     message = f'''
-     You are a data analyser agent working with a given dataset.
-     Below is the info about the dataset -
-     ========
-     {info_string}
-     ========
-
-     Your task -
-     Write a summary report of the dataset. You have to explain what the dataset is about and what kind of information could be gained from the dataset.
-
-     Do not infer any data based on previous training, strictly use only source text given below as input.
-     '''
-     output = completion(
-         model="gemini/gemini-pro",
-         messages=[
-             {"role": "user", "content": message}
-         ]
-     )
-
-     print(output.choices[0].message.content)
-
-
- LLM_summary()

  import os
  import pandas as pd
+ import streamlit as st
+ import numpy as np
+
+ def categorical_to_numerical(data):
+     st.write(data.head())
+     st.subheader("Convert Categorical to Numerical")
+     columns_to_encode = st.multiselect('Choose columns to convert', data.select_dtypes(include=object).columns)
+     if st.button('Convert'):
+         for col in columns_to_encode:
+             one_hot_encoded = pd.get_dummies(data[col], prefix=col).astype(int)
+             data = pd.concat([data, one_hot_encoded], axis=1)
+             data.drop(col, axis=1, inplace=True)
+             # data = pd.DataFrame(one_hot_encoded)
+         st.success("Converted categoricals variables")
+         # data.to_csv("data.csv", index=False)
+     st.write(data.head())
+     st.write(data.describe())
+     return data
+
+ data = pd.read_csv("data.csv")
+ data = categorical_to_numerical(data)
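
For reference, a minimal sketch (not part of the commit) of what the pd.get_dummies pattern above produces on a toy DataFrame; the column names and values here are illustrative:

import pandas as pd

# Toy frame with one categorical column (illustrative data)
df = pd.DataFrame({"city": ["Delhi", "Mumbai", "Delhi"], "price": [10, 20, 15]})

# Same pattern as categorical_to_numerical: encode, concatenate, drop the original column
encoded = pd.get_dummies(df["city"], prefix="city").astype(int)
df = pd.concat([df, encoded], axis=1).drop("city", axis=1)

print(df)
#    price  city_Delhi  city_Mumbai
# 0     10           1            0
# 1     20           0            1
# 2     15           1            0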
 
Modules/data_analyzer.py CHANGED
@@ -41,9 +41,9 @@ class DataAnalyzer:
  def count_plot(self, column_name):
      st.write(column_name)
-     unique_values = self.data[column_name].nunique()
      fig, ax = plt.subplots(figsize=(9, 5))
-     if unique_values <= 12:
          sns.countplot(data=self.data, x=column_name, ax=ax)
      else:
          sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)

  def count_plot(self, column_name):
      st.write(column_name)
+     unique_values_ratio = self.data[column_name].nunique() / len(self.data)
      fig, ax = plt.subplots(figsize=(9, 5))
+     if unique_values_ratio <= 0.3:
          sns.countplot(data=self.data, x=column_name, ax=ax)
      else:
          sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
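
Switching from a fixed unique-value count to a ratio lets the count-plot/histogram heuristic scale with dataset size. A small standalone sketch of the decision; the helper name and example data are illustrative, only the 0.3 threshold comes from the change:

import pandas as pd

def plot_kind(series: pd.Series, ratio_threshold: float = 0.3) -> str:
    # Count plot for low-cardinality columns, histogram otherwise
    unique_ratio = series.nunique() / len(series)
    return "countplot" if unique_ratio <= ratio_threshold else "histplot"

print(plot_kind(pd.Series(["a", "b", "a", "c"] * 25)))  # 3 unique / 100 rows = 0.03 -> countplot
print(plot_kind(pd.Series(range(100))))                 # 100 unique / 100 rows = 1.0 -> histplot
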
Modules/data_transformer.py CHANGED
@@ -26,15 +26,20 @@ class DataTransformer:
          self.data.to_csv("data.csv", index=False)
          st.subheader("Impute Null Values")
          col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
-         option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
          if st.button('Impute Null'):
-             if option == "mean":
-                 self.data[col] = self.data[col].fillna(self.data[col].mean())
-             elif option == "mode":
-                 self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0]) # mode() returns a DataFrame, so we select the first row
-             elif option == "0":
-                 self.data[col] = self.data[col].fillna(0)
-             st.success("Null values filled")
          self.data.to_csv("data.csv", index=False)
      with right:
          st.write("Null Stats")

          self.data.to_csv("data.csv", index=False)
          st.subheader("Impute Null Values")
          col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
+         option = st.selectbox('Impute nulls with', ('-Select-','mean', 'mode', '0'))
          if st.button('Impute Null'):
+             try:
+                 if option == "mean":
+                     self.data[col] = self.data[col].fillna(self.data[col].mean())
+                 elif option == "mode":
+                     self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0])
+                 elif option == "0":
+                     self.data[col] = self.data[col].fillna(0)
+                 elif option == "-Select-":
+                     raise ValueError("Select a value")
+                 st.success("Null values filled")
+             except ValueError as e:
+                 st.error(str(e))
          self.data.to_csv("data.csv", index=False)
      with right:
          st.write("Null Stats")

@@ -47,15 +52,30 @@ class DataTransformer:
              null_percentage = null_percentages[column_name]
              columns_stats.append({
                  'Column Name': column_name,
-                 'Percentage Null': str(np.round(null_percentage, 2)) + " %"
              })
          null_stats_df = pd.DataFrame(columns_stats)
          st.dataframe(null_stats_df, hide_index=True, use_container_width=True)
-         st.write("Total percentage of null values:", np.round((total_null / (total_rows * self.data.shape[1])) * 100, 2), "%")
          return self.data

      def remove_columns(self):
-         st.header("Remove Columns")
          col = st.multiselect('Choose columns to remove', self.data.columns)
          if st.button('Remove Columns'):
              self.data.drop(columns=col, inplace=True)

              null_percentage = null_percentages[column_name]
              columns_stats.append({
                  'Column Name': column_name,
+                 '% Null': str(np.round(null_percentage, 2)) + " %"
              })
          null_stats_df = pd.DataFrame(columns_stats)
          st.dataframe(null_stats_df, hide_index=True, use_container_width=True)
+         st.write("Total percentage of nulls:", np.round((total_null / (total_rows * self.data.shape[1])) * 100, 2), "%")
+         st.write("Total number of rows:", self.data.shape[0])
+         st.write("Total number of columns:", self.data.shape[1])
+         return self.data
+
+     def categorical_to_numerical(self):
+         st.subheader("Convert Categorical to Numerical")
+         columns_to_encode = st.multiselect('Choose columns to convert', self.data.select_dtypes(include=object).columns)
+         if st.button('Convert'):
+             for col in columns_to_encode:
+                 one_hot_encoded = pd.get_dummies(self.data[col], prefix=col).astype(int)
+                 self.data = pd.concat([self.data, one_hot_encoded], axis=1)
+                 self.data.drop(col, axis=1, inplace=True)
+             st.success("Converted categoricals variables")
+             self.data.to_csv("data.csv", index=False)
+         st.write(self.data.head())
          return self.data

      def remove_columns(self):
+         st.subheader("Remove Columns")
          col = st.multiselect('Choose columns to remove', self.data.columns)
          if st.button('Remove Columns'):
              self.data.drop(columns=col, inplace=True)

@@ -66,8 +86,8 @@ class DataTransformer:
  #transformed data is not retained
  #null values handling
  #2 options - to remove or to impute that is the question

  # PROBLEMS TO BE ADDRESSED
- #categorical to numerical
  #give option to analyse the transformed dataset or save it.

  #transformed data is not retained
  #null values handling
  #2 options - to remove or to impute that is the question
+ #categorical to numerical

  # PROBLEMS TO BE ADDRESSED
  #give option to analyse the transformed dataset or save it.
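
Stripped of the Streamlit widgets, the imputation branch in handle_null reduces to a fillna over the selected numeric columns. A minimal sketch of the three options; the helper name and sample data below are illustrative:

import numpy as np
import pandas as pd

def impute(df: pd.DataFrame, cols: list, option: str) -> pd.DataFrame:
    # Mirrors the mean / mode / 0 choices offered in handle_null
    if option == "mean":
        df[cols] = df[cols].fillna(df[cols].mean())
    elif option == "mode":
        df[cols] = df[cols].fillna(df[cols].mode().iloc[0])  # mode() returns a DataFrame; take its first row
    elif option == "0":
        df[cols] = df[cols].fillna(0)
    else:
        raise ValueError("Select a value")
    return df

df = pd.DataFrame({"age": [25.0, np.nan, 35.0], "fare": [7.5, 8.0, np.nan]})
print(impute(df, ["age", "fare"], "mean"))  # NaNs become the column means: age 30.0, fare 7.75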
 
app.py CHANGED
@@ -46,10 +46,8 @@ def main():

          # modified_data = data_transformer.perform_column_operation()
          data = data_transformer.handle_null()
-
-
-         # modified_data = data_transformer.remove_columns()
-
          # data_filter = DataFilter(modified_data)
          # data = data_filter.filter_rows()

          # modified_data = data_transformer.perform_column_operation()
          data = data_transformer.handle_null()
+         data = data_transformer.categorical_to_numerical()
+         data = data_transformer.remove_columns()
          # data_filter = DataFilter(modified_data)
          # data = data_filter.filter_rows()

@@ -61,9 +59,18 @@ def main():
          # --- DATA PARTY ---
          if selected == "Data Party":
              st.write("To be continued... :)")
      except:
          st.write("Please upload a csv file")


  if __name__ == "__main__":
      main()

          # --- DATA PARTY ---
          if selected == "Data Party":
              st.write("To be continued... :)")
+
      except:
          st.write("Please upload a csv file")


  if __name__ == "__main__":
      main()
+
+
+ # TO DO:
+ # 1. automate categorical to numerical conversion
+ # 2. toggle btn for data (original and modified)
+ # 3. ask to save modified data before saving
+ # 4. streamline prompts in llm_summary
+ # 5. ml models
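
On TO DO item 1, one possible direction is to encode only low-cardinality object columns automatically; a sketch under that assumption, with an illustrative helper name and threshold:

import pandas as pd

def auto_encode(df: pd.DataFrame, max_unique: int = 10) -> pd.DataFrame:
    # Encode each low-cardinality object column; leave high-cardinality ones (e.g. names, IDs) untouched
    for col in df.select_dtypes(include=object).columns:
        if df[col].nunique() <= max_unique:
            encoded = pd.get_dummies(df[col], prefix=col).astype(int)
            df = pd.concat([df, encoded], axis=1).drop(col, axis=1)
    return df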