Atharva Thakur committed on
Commit
80a90b2
•
1 Parent(s): 372cae1

Added tests for data-transformer module

Browse files
Modules/data_transformer.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
5
  class DataTransformer:
6
  def __init__(self, data):
7
  self.data = data
8
- st.header("Data Cleaning")
9
 
10
  def perform_column_operation(self):
11
  column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
@@ -21,26 +21,17 @@ class DataTransformer:
21
  st.subheader("Remove Null Values")
22
  col = st.multiselect('Choose columns to remove nulls', self.data.columns)
23
  if st.button('Remove Null'):
24
- self.data.dropna(subset=col, inplace=True)
25
  st.success("Null values removed")
26
- self.data.to_csv("data.csv", index=False)
27
  st.subheader("Impute Null Values")
28
  col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
29
  option = st.selectbox('Impute nulls with', ('-Select-','mean', 'mode', '0'))
30
  if st.button('Impute Null'):
31
  try:
32
- if option == "mean":
33
- self.data[col] = self.data[col].fillna(self.data[col].mean())
34
- elif option == "mode":
35
- self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0])
36
- elif option == "0":
37
- self.data[col] = self.data[col].fillna(0)
38
- elif option == "-Select-":
39
- raise ValueError("Select a value")
40
  st.success("Null values filled")
41
  except ValueError as e:
42
  st.error(str(e))
43
- self.data.to_csv("data.csv", index=False)
44
  with right:
45
  st.write("Null Stats")
46
  null_counts = self.data.isnull().sum()
@@ -65,12 +56,8 @@ class DataTransformer:
65
  st.subheader("Convert Categorical to Numerical")
66
  columns_to_encode = st.multiselect('Choose columns to convert', self.data.select_dtypes(include=object).columns)
67
  if st.button('Convert'):
68
- for col in columns_to_encode:
69
- one_hot_encoded = pd.get_dummies(self.data[col], prefix=col).astype(int)
70
- self.data = pd.concat([self.data, one_hot_encoded], axis=1)
71
- self.data.drop(col, axis=1, inplace=True)
72
  st.success("Converted categoricals variables")
73
- self.data.to_csv("data.csv", index=False)
74
  st.write(self.data.head())
75
  return self.data
76
 
@@ -78,17 +65,46 @@ class DataTransformer:
78
  st.subheader("Remove Columns")
79
  col = st.multiselect('Choose columns to remove', self.data.columns)
80
  if st.button('Remove Columns'):
81
- self.data.drop(columns=col, inplace=True)
82
  st.success("Columns removed")
 
 
 
 
 
 
83
  self.data.to_csv("data.csv", index=False)
84
  return self.data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- # PROBLEMS RESOLVED
 
 
 
 
 
 
 
87
  #transformed data is not retained
88
  #null values handling
89
  #2 options - to remove or to impute that is the question
90
  #categorical to numerical
91
 
92
  # PROBLEMS TO BE ADDRESSED
93
- #give option to analyse the transformed dataset or save it.
94
-
 
5
  class DataTransformer:
6
  def __init__(self, data):
7
  self.data = data
8
+
9
 
10
  def perform_column_operation(self):
11
  column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
 
21
  st.subheader("Remove Null Values")
22
  col = st.multiselect('Choose columns to remove nulls', self.data.columns)
23
  if st.button('Remove Null'):
24
+ self.handle_null_remove(col)
25
  st.success("Null values removed")
 
26
  st.subheader("Impute Null Values")
27
  col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
28
  option = st.selectbox('Impute nulls with', ('-Select-','mean', 'mode', '0'))
29
  if st.button('Impute Null'):
30
  try:
31
+ self.handle_null_impute(col,option)
 
 
 
 
 
 
 
32
  st.success("Null values filled")
33
  except ValueError as e:
34
  st.error(str(e))
 
35
  with right:
36
  st.write("Null Stats")
37
  null_counts = self.data.isnull().sum()
 
56
  st.subheader("Convert Categorical to Numerical")
57
  columns_to_encode = st.multiselect('Choose columns to convert', self.data.select_dtypes(include=object).columns)
58
  if st.button('Convert'):
59
+ self.categorical_to_numerical_func(columns_to_encode)
 
 
 
60
  st.success("Converted categoricals variables")
 
61
  st.write(self.data.head())
62
  return self.data
63
 
 
65
  st.subheader("Remove Columns")
66
  col = st.multiselect('Choose columns to remove', self.data.columns)
67
  if st.button('Remove Columns'):
68
+ self.remove_columns_func(col)
69
  st.success("Columns removed")
70
+ return self.data
71
+
72
+
73
+ #---CORE FUNCTIONALITY---
74
def remove_columns_func(self, col):
    """Drop columns from the working DataFrame and persist the result.

    Args:
        col: Column label, or list of column labels, to remove.

    Returns:
        ``self.data`` (the same object, mutated in place) without the
        requested columns.
    """
    frame = self.data
    # In-place so that any other holder of this DataFrame sees the change.
    frame.drop(labels=col, axis="columns", inplace=True)
    frame.to_csv("data.csv", index=False)
    return frame
78
+
79
def handle_null_remove(self, col):
    """Drop every row that has a null in any of the given columns.

    The working DataFrame is modified in place and then written to
    ``data.csv`` (without the index).

    Args:
        col: Column label, or list of column labels, to check for nulls.

    Returns:
        ``self.data`` after the rows have been removed, matching what
        ``remove_columns_func`` returns.
    """
    # No debug print here: dumping the whole frame to stdout on every call
    # floods the console for large datasets.
    self.data.dropna(subset=col, inplace=True)
    self.data.to_csv("data.csv", index=False)
    return self.data
83
+
84
def handle_null_impute(self, col, option):
    """Fill nulls in the given columns and persist the result to ``data.csv``.

    Args:
        col: Column label, or list of column labels, to impute.
        option: One of ``"mean"``, ``"mode"`` or ``"0"``. The placeholder
            ``"-Select-"`` is rejected.

    Raises:
        ValueError: If ``option`` is the ``"-Select-"`` placeholder or is
            not one of the supported strategies.
    """
    if option == "-Select-":
        raise ValueError("Select an option")
    if option not in ("mean", "mode", "0"):
        # Previously an unknown option silently did nothing.
        raise ValueError(f"Unknown impute option: {option!r}")

    # Accept a single label as well as a list of labels.
    columns = list(col) if pd.api.types.is_list_like(col) else [col]

    if columns:
        selected = self.data[columns]
        if option == "mean":
            fill_values = selected.mean()
        elif option == "mode":
            modes = selected.mode()
            # mode() has no rows when every selected column is entirely
            # null; indexing row 0 would raise IndexError, so skip the fill.
            fill_values = None if modes.empty else modes.iloc[0]
        else:
            fill_values = 0

        if fill_values is not None:
            self.data[columns] = selected.fillna(fill_values)

    self.data.to_csv("data.csv", index=False)
94
+
95
 
96
def categorical_to_numerical_func(self, columns_to_encode):
    """One-hot encode the given columns and persist the result to ``data.csv``.

    Each listed column is replaced by integer indicator columns named
    ``<column>_<value>``, appended after the remaining columns.

    Args:
        columns_to_encode: Iterable of column labels to encode.
    """
    for name in columns_to_encode:
        indicators = pd.get_dummies(self.data[name], prefix=name).astype(int)
        remaining = self.data.drop(columns=[name])
        self.data = pd.concat([remaining, indicators], axis=1)
    self.data.to_csv("data.csv", index=False)
102
+
103
+ # PROBLEMS RESOLVED
104
  #transformed data is not retained
105
  #null values handling
106
  #2 options - to remove or to impute that is the question
107
  #categorical to numerical
108
 
109
  # PROBLEMS TO BE ADDRESSED
110
+ #give option to analyse the transformed dataset or save it.
 
test_app.py → Testing/test_app.py RENAMED
@@ -1,4 +1,6 @@
1
  from streamlit.testing.v1 import AppTest
 
 
2
 
3
  def test_smoke():
4
  """Basic smoke test"""
 
1
  from streamlit.testing.v1 import AppTest
2
+ import sys
3
+ sys.path.append("..")
4
 
5
  def test_smoke():
6
  """Basic smoke test"""
Testing/test_data_transformer.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import numpy as np
4
+
5
+
6
+ sys.path.append("..")
7
+
8
+ import unittest
9
+ import pandas as pd
10
+ from Modules.data_transformer import DataTransformer
11
+
12
+
13
class TestDataTransformer(unittest.TestCase):
    """Unit tests for the non-UI core methods of ``DataTransformer``."""

    def setUp(self):
        """Build a sample frame and a transformer inside a scratch directory."""
        import tempfile

        # DataTransformer writes "data.csv" relative to the current working
        # directory. Run each test in a throwaway directory so the suite never
        # overwrites or deletes a real data.csv.
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run last-in-first-out: leave the directory, then delete it.
        self.addCleanup(scratch.cleanup)
        self.addCleanup(os.chdir, os.getcwd())
        os.chdir(scratch.name)

        data = {
            'A': [1, 2, 3, None, 5],
            'B': [4, 5, None, 7, 8],
            'C': ['X', 'Y', 'Z', 'X', 'Y'],
            'D': ['M', 'N', 'O', 'N', 'P'],
            'E': [10.1, 20.2, None, 40.4, 50.5],
            'F': [10.1, 20.2, None, 40.4, None],
            'G': [None, 20.2, None, 40.4, 50.5],
        }
        self.sample_data = pd.DataFrame(data)
        self.sample_data.to_csv("data.csv", index=False)
        self.transformer = DataTransformer(self.sample_data.copy())

    def test_handle_null_remove(self):
        """Rows with a null in the chosen column are dropped and saved."""
        self.transformer.handle_null_remove(['G'])
        # `x in series` tests the index, not the values, so check nulls directly.
        self.assertFalse(self.transformer.data['G'].isnull().any())
        self.assertEqual(len(self.transformer.data), 3)
        # The CSV is written without the index, so compare against a reset index.
        expected = self.transformer.data.reset_index(drop=True)
        self.assertTrue(pd.read_csv("data.csv").equals(expected))

    def test_remove_columns_func(self):
        """The requested column is removed and the others are kept."""
        self.transformer.remove_columns_func(['D'])
        self.assertNotIn('D', self.transformer.data.columns)
        self.assertIn('C', self.transformer.data.columns)
        self.assertNotIn('D', pd.read_csv("data.csv").columns)

    def test_handle_null_impute(self):
        """Nulls are filled for each supported strategy and the result is saved."""
        # Impute with the column mean.
        self.transformer.handle_null_impute('A', 'mean')
        self.assertFalse(self.transformer.data['A'].isnull().any())
        self.assertTrue(pd.read_csv("data.csv").equals(self.transformer.data))

        # Impute with the column mode.
        self.transformer.handle_null_impute('F', 'mode')
        self.assertFalse(self.transformer.data['F'].isnull().any())
        self.assertTrue(pd.read_csv("data.csv").equals(self.transformer.data))

        # Impute with the constant 0.
        self.transformer.handle_null_impute('G', '0')
        self.assertFalse(self.transformer.data['G'].isnull().any())
        self.assertTrue(pd.read_csv("data.csv").equals(self.transformer.data))

    def test_handle_null_impute_requires_option(self):
        """The '-Select-' placeholder is rejected with ValueError."""
        with self.assertRaises(ValueError):
            self.transformer.handle_null_impute('A', '-Select-')

    def test_categorical_to_numerical_func(self):
        """A categorical column is replaced by prefixed indicator columns."""
        self.transformer.categorical_to_numerical_func(['C'])
        self.assertNotIn('C', self.transformer.data.columns)
        self.assertTrue(any(col.startswith('C_') for col in self.transformer.data.columns))
        self.assertTrue(pd.read_csv("data.csv").equals(self.transformer.data))
67
+
68
+ if __name__ == '__main__':
69
+ unittest.main()
app.py CHANGED
@@ -52,6 +52,7 @@ def main():
52
 
53
  # --- DATA CLEANING ---
54
  if selected == "Data Cleaning":
 
55
  data_transformer = DataTransformer(data)
56
 
57
  # modified_data = data_transformer.perform_column_operation()
 
52
 
53
  # --- DATA CLEANING ---
54
  if selected == "Data Cleaning":
55
+ st.header("Data Cleaning")
56
  data_transformer = DataTransformer(data)
57
 
58
  # modified_data = data_transformer.perform_column_operation()