Atharva Thakur committed on
Commit
d14ee80
1 Parent(s): 91d7466

Data retention problem solved

Files changed (6)
  1. .gitignore +2 -0
  2. Experiment.py +57 -0
  3. app.py +18 -9
  4. data_loader.py +24 -17
  5. data_transformer.py +8 -6
  6. test.py +0 -42
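What the commit does, in short: DataLoader.load_data is now wrapped in st.cache_data(experimental_allow_widgets=True) and writes the upload to data.csv (plus an untouched original_data.csv), and app.py re-reads data.csv instead of deleting and recreating it on every run, so the dataset survives Streamlit's reruns. Below is a minimal sketch, not code from this repo, of why unprotected state is lost in the first place: Streamlit re-executes the entire script on every widget interaction, so anything not cached, kept in st.session_state, or written to disk is rebuilt from scratch.

import streamlit as st

# Minimal illustration (hypothetical, not from this repo): each widget
# interaction triggers a full script rerun, so only st.session_state,
# cached functions, or files on disk carry data across runs.
if "reruns" not in st.session_state:
    st.session_state.reruns = 0
st.session_state.reruns += 1

st.button("Interact with me")  # any click causes another rerun
st.write(f"This script has been re-executed {st.session_state.reruns} times in this session.")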
.gitignore CHANGED
@@ -9,6 +9,8 @@ __pycache__/
 
 # data set
 data.csv
+original_data.csv
+
 #Env variables
 .env
 # Distribution / packaging
Experiment.py ADDED
@@ -0,0 +1,57 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+
+# Function to upload dataset
+def upload_dataset():
+    uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
+    if uploaded_file is not None:
+        df = pd.read_csv(uploaded_file)
+        return df
+
+# Function to impute null values
+def impute_null(df):
+    # Implement your logic for null value imputation
+    col = st.multiselect('Choose columns to impute nulls', df.select_dtypes(include=[np.number]).columns)
+    option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
+    if st.button('Impute Null'):
+        if option == "mean":
+            df[col] = df[col].fillna(df[col].mean())
+        elif option == "mode":
+            df[col] = df[col].fillna(df[col].mode().iloc[0]) # mode() returns a DataFrame, so we select the first row
+        elif option == "0":
+            df[col] = df[col].fillna(0)
+        st.success("Null values filled")
+    return df
+
+# Function to display transformed data
+def display_data(df):
+    st.write(df)
+
+def main():
+    st.title("Data Transformation App")
+
+    # Step 1: Upload Dataset
+    st.sidebar.title("Upload Dataset")
+    df = upload_dataset()
+
+    if df is not None:
+        # Step 2: Perform Data Transformation
+        st.sidebar.title("Data Transformation")
+        if st.sidebar.button("Impute Null Values"):
+            df = impute_null(df)
+            st.success("Null values imputed successfully!")
+
+        # Step 3: Display Transformed Data
+        st.sidebar.title("Transformed Data")
+        if st.sidebar.checkbox("Show Transformed Data"):
+            display_data(df)
+
+        # Step 4: Store Transformed Data
+        # You can store the transformed data in a variable or a data structure here
+
+        # Step 5: Use Transformed Data
+        # You can utilize the transformed data for further analysis, visualization, etc.
+
+if __name__ == "__main__":
+    main()
app.py CHANGED
@@ -8,15 +8,18 @@ from data_QA import DataQA
 import os
 from streamlit_option_menu import option_menu
 
+
+import pandas as pd
+
 def main():
-    if os.path.exists("data.csv"):
-        os.remove("data.csv")
-    with open("data.csv", 'w'):
-        pass
     st.title('Insights 📶')
-
+
     data_loader = DataLoader()
-    data = data_loader.load_data()
+    load = data_loader.load_data()
+    if load:
+        data = pd.read_csv('data.csv')
+
+
 
     if os.path.getsize("data.csv") != 0:
         with st.sidebar:
@@ -31,6 +34,7 @@ def main():
 
         # --- EDA ---
        if selected == "Exploratory Data Analysis":
+            data = pd.read_csv("data.csv")
             data_analyzer = DataAnalyzer(data)
             data_analyzer.show_eda()
             data_analyzer.show_null_value_statistics()
@@ -43,13 +47,18 @@ def main():
         # --- DATA CLEANING ---
         if selected == "Data Cleaning":
             data_transformer = DataTransformer(data)
-            data_analyzer = DataAnalyzer(data)
+
             modified_data = data_transformer.perform_column_operation()
-            data_analyzer.show_null_value_statistics()
             modified_data = data_transformer.remove_null()
             modified_data = data_transformer.impute_null()
-            modified_data = data_transformer.remove_columns()
+            data = modified_data
+            data_analyzer = DataAnalyzer(data)
+            data_analyzer.show_null_value_statistics()
+            new_data_analyzer = DataAnalyzer(modified_data)
+            data_analyzer.show_null_value_statistics()
 
+            # modified_data = data_transformer.remove_columns()
+
             # data_filter = DataFilter(modified_data)
             # data = data_filter.filter_rows()
 
data_loader.py CHANGED
@@ -3,21 +3,28 @@ import pandas as pd
 
 class DataLoader:
     def __init__(self):
-        self.data = pd.DataFrame() # Initialize data as an empty DataFrame
+        pass
 
-    def load_data(self):
-        data_source = st.selectbox('Select data source', ['Upload a CSV file', 'Input a URL'])
-        if data_source == 'Upload a CSV file':
-            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-            if uploaded_file is not None:
-                self.data = pd.read_csv(uploaded_file)
-                file_path = './data.csv'
-                self.data.to_csv(file_path, index=False)
-        elif data_source == 'Input a URL':
-            url = st.text_input('Enter the URL of a CSV file')
-            if url:
-                try:
-                    self.data = pd.read_csv(url)
-                except:
-                    st.error('Could not load data from the provided URL. Please make sure the URL is correct and points to a CSV file.')
-        return self.data
+    @st.cache_data(experimental_allow_widgets=True)
+    def load_data(_):
+        if True:
+            data = pd.DataFrame()
+            data_source = st.selectbox('Select data source', ['Upload a CSV file', 'Input a URL'])
+            if data_source == 'Upload a CSV file':
+                uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+                if uploaded_file is not None:
+                    data = pd.read_csv(uploaded_file)
+                    data.to_csv('./original_data.csv', index=False)
+                    data.to_csv('./data.csv',index=False)
+                    return True
+            elif data_source == 'Input a URL':
+                url = st.text_input('Enter the URL of a CSV file')
+                if url:
+                    try:
+                        data = pd.read_csv(url)
+                        data_loaded = True
+                    except:
+                        st.error('Could not load data from the provided URL. Please make sure the URL is correct and points to a CSV file.')
+                    return True
+        print("data loader ran once")
+        return data
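The @st.cache_data(experimental_allow_widgets=True) decorator is the heart of the retention fix: Streamlit caches the function's return value and only re-executes the body when the widgets inside it change, so the branch that writes data.csv and original_data.csv is not re-run (and the files are not clobbered) on unrelated interactions. A stripped-down sketch of the same pattern with hypothetical names, assuming a Streamlit release that still supports the experimental_allow_widgets flag:

import streamlit as st
import pandas as pd

# Hypothetical sketch of the cache-and-persist pattern used by DataLoader.load_data.
# experimental_allow_widgets lets widgets live inside a cached function; the body
# re-runs only when those widget values change, so the saved CSVs survive other reruns.
@st.cache_data(experimental_allow_widgets=True)
def load_once():
    uploaded = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded is not None:
        df = pd.read_csv(uploaded)
        df.to_csv("data.csv", index=False)           # working copy the rest of the app reads
        df.to_csv("original_data.csv", index=False)  # pristine backup
        return True
    return False

if load_once():
    data = pd.read_csv("data.csv")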
data_transformer.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
 import pandas as pd
+import numpy as np
 
 class DataTransformer:
     def __init__(self, data):
@@ -22,16 +23,17 @@ class DataTransformer:
         return self.data
 
     def impute_null(self):
+        col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
+        option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
         if st.button('Impute Null'):
-            col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
-            option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
             if option == "mean":
-                self.data.fillna(df.mean())
+                self.data[col] = self.data[col].fillna(self.data[col].mean())
             elif option == "mode":
-                self.data.fillna(df.mode())
+                self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0]) # mode() returns a DataFrame, so we select the first row
             elif option == "0":
-                self.data.fillna("0")
-            st.toast("Null values filled")
+                self.data[col] = self.data[col].fillna(0)
+            st.success("Null values filled")
+            self.data.to_csv("data.csv", index=False)
         return self.data
 
     def remove_columns(self):
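impute_null now fills nulls per selected column and assigns the result back (the old body called fillna without assigning the result and referenced an undefined df), then persists the cleaned frame to data.csv. The pandas half of that logic can be checked outside Streamlit on a toy frame; the column names below are illustrative only:

import pandas as pd
import numpy as np

# Toy frame exercising the same per-column imputation calls used in impute_null.
df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 2.0, 2.0], "c": ["x", None, "y"]})
cols = ["a", "b"]  # numeric columns chosen for imputation

df[cols] = df[cols].fillna(df[cols].mean())            # mean: a -> 2.0, b -> 2.0
# df[cols] = df[cols].fillna(df[cols].mode().iloc[0])  # mode() returns a DataFrame, so take its first row
# df[cols] = df[cols].fillna(0)                        # or fill with a constant
print(df)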
test.py DELETED
@@ -1,42 +0,0 @@
-import streamlit as st
-from data_loader import DataLoader
-from data_analyzer import DataAnalyzer
-from data_filter import DataFilter
-from data_transformer import DataTransformer
-from data_visualizer import DataVisualizer
-from data_QA import DataQA
-import os
-
-def main():
-    if os.path.exists("data.csv"):
-        os.remove("data.csv")
-    with open("data.csv", 'w'):
-        pass
-    st.title('Insights 📶')
-
-    data_loader = DataLoader()
-    data = data_loader.load_data()
-
-    if os.path.getsize("data.csv") != 0:
-        data_analyzer = DataAnalyzer(data)
-        data_analyzer.show_summary_statistics()
-        data_analyzer.show_data_types()
-        data_analyzer.show_null_value_statistics()
-
-        data_filter = DataFilter(data)
-        data = data_filter.filter_rows()
-
-        data_transformer = DataTransformer(data)
-        data = data_transformer.perform_column_operation()
-        data = data_transformer.remove_null()
-        data = data_transformer.impute_null()
-        data = data_transformer.remove_columns()
-
-        data_visualizer = DataVisualizer(data)
-        data_visualizer.visualize_data()
-
-        data_QA = DataQA(data)
-        data_QA.ask_csv()
-
-if __name__ == "__main__":
-    main()