Atharva Thakur committed on
Commit
ce41758
1 Parent(s): 2243073

Code gen and runner added

Browse files
.gitignore CHANGED
@@ -13,7 +13,8 @@ data.csv
13
  original_data.csv
14
 
15
  #code files
16
- ./code.py
 
17
 
18
  #Env variables
19
  .env
 
13
  original_data.csv
14
 
15
  #code files
16
+ code.py
17
+ data.pdf
18
 
19
  #Env variables
20
  .env
Experimentation/MLtoolkit.py DELETED
@@ -1,239 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from sklearn import datasets
4
- from sklearn.model_selection import train_test_split
5
- from sklearn.svm import SVC, SVR
6
- from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
7
- from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
8
- from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
9
- from sklearn.naive_bayes import GaussianNB
10
- from sklearn.linear_model import LinearRegression, LogisticRegression
11
- from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
12
- from sklearn.decomposition import PCA
13
- import matplotlib.pyplot as plt
14
- import seaborn as sns
15
- import streamlit as st
16
- from sklearn.preprocessing import LabelEncoder
17
-
18
- st.title("ML Algorithms on Inbuilt and Kaggle Datasets")
19
-
20
- algorithm = st.selectbox("Select Supervised Learning Algorithm", ("KNN", "SVM", "Decision Tree", "Naive Bayes", "Random Forest", "Linear Regression", "Logistic Regression"))
21
-
22
- if algorithm != 'Linear Regression' and algorithm != 'Logistic Regression' and algorithm != "Naive Bayes":
23
- algorithm_type = st.selectbox("Select Algorithm Type", ("Classifier", "Regressor"))
24
- else:
25
- st.write(f"In {algorithm} Classifier and Regressor dosen't exist separately")
26
- if algorithm == "Linear Regression":
27
- algorithm_type = "Regressor"
28
- st.write("{} only does Regression".format(algorithm))
29
- else:
30
- algorithm_type = "Classifier"
31
- st.write(f"{algorithm} only does Classification")
32
-
33
-
34
-
35
- data = pd.read_csv("test_data.csv")
36
-
37
- def one_hot_encode_categorical(df, threshold=0.05):
38
- categorical_columns = df.select_dtypes(include=['object', 'category']).columns
39
-
40
- unique_ratio = df[categorical_columns].nunique() / len(df)
41
-
42
- selected_categorical_columns = unique_ratio[unique_ratio < threshold].index
43
-
44
- df_encoded = pd.get_dummies(df, columns=selected_categorical_columns)
45
-
46
- return df_encoded
47
-
48
-
49
- data = one_hot_encode_categorical(data, threshold=0.05)
50
-
51
- def select_features_and_target(df, algorithm):
52
- st.write("### Select Features and Target Variable")
53
-
54
- # Display available columns based on the algorithm
55
- st.write("#### Available Columns:")
56
-
57
- if algorithm in ["Linear Regression", "Logistic Regression"]:
58
- numerical_columns = df.select_dtypes(include=[np.number]).columns
59
- selected_features = st.multiselect("Select Numerical Features (X)", numerical_columns)
60
- else:
61
- selected_features = st.multiselect("Select Features (X)", df.columns)
62
-
63
- if algorithm == "Naive Bayes":
64
- target_variable = st.selectbox("Select Target Variable (y)", df.columns)
65
- elif algorithm == "Linear Regression":
66
- numerical_columns = df.select_dtypes(include=[np.number]).columns
67
- target_variable = st.selectbox("Select Target Variable (y)", numerical_columns)
68
- else:
69
- target_variable = st.selectbox("Select Target Variable (y)", df.columns)
70
-
71
- # Ensure at least one feature and one target variable is selected
72
- if len(selected_features) < 1 or target_variable is None:
73
- st.error("Please select at least one feature (X) and a target variable (y).")
74
- return None, None
75
-
76
- return df[selected_features], df[target_variable]
77
-
78
- X, Y = select_features_and_target(data,algorithm)
79
-
80
- def add_parameter_classifier_general(algorithm):
81
-
82
- params = dict()
83
-
84
- if algorithm == 'SVM':
85
-
86
- c_regular = st.slider('C (Regularization)', 0.01, 10.0)
87
- kernel_custom = st.selectbox('Kernel', ('linear', 'poly ', 'rbf', 'sigmoid'))
88
- params['C'] = c_regular
89
- params['kernel'] = kernel_custom
90
-
91
- elif algorithm == 'KNN':
92
-
93
- k_n = st.slider('Number of Neighbors (K)', 1, 20,key="k_n_slider")
94
- params['K'] = k_n
95
- weights_custom = st.selectbox('Weights', ('uniform', 'distance'))
96
- params['weights'] = weights_custom
97
-
98
- elif algorithm == 'Naive Bayes':
99
- st.info("This is a simple Algorithm. It doesn't have Parameters for Hyper-tuning.")
100
-
101
- elif algorithm == 'Decision Tree':
102
-
103
- max_depth = st.slider('Max Depth', 2, 17)
104
- criterion = st.selectbox('Criterion', ('gini', 'entropy'))
105
- splitter = st.selectbox("Splitter", ("best", "random"))
106
- params['max_depth'] = max_depth
107
- params['criterion'] = criterion
108
- params['splitter'] = splitter
109
-
110
- try:
111
- random = st.text_input("Enter Random State")
112
- params['random_state'] = int(random)
113
- except:
114
- params['random_state'] = 4567
115
-
116
- elif algorithm == 'Random Forest':
117
-
118
- max_depth = st.slider('Max Depth', 2, 17)
119
- n_estimators = st.slider('Number of Estimators', 1, 90)
120
- criterion = st.selectbox('Criterion', ('gini', 'entropy', 'log_loss'))
121
- params['max_depth'] = max_depth
122
- params['n_estimators'] = n_estimators
123
- params['criterion'] = criterion
124
-
125
-
126
- try:
127
- random = st.text_input("Enter Random State")
128
- params['random_state'] = int(random)
129
- except:
130
- params['random_state'] = 4567
131
-
132
- else:
133
-
134
- c_regular = st.slider('C (Regularization)', 0.01, 10.0)
135
- params['C'] = c_regular
136
- fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
137
- params['fit_intercept'] = bool(fit_intercept)
138
- penalty = st.selectbox("Penalty", ('l2', None))
139
- params['penalty'] = penalty
140
- n_jobs = st.selectbox("Number of Jobs", (None, -1))
141
- params['n_jobs'] = n_jobs
142
-
143
- return params
144
-
145
- def add_parameter_regressor(algorithm):
146
- params = dict()
147
- if algorithm == 'Decision Tree':
148
- max_depth = st.slider('Max Depth', 2, 17)
149
- criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
150
- splitter = st.selectbox("Splitter", ("best", "random"))
151
- params['max_depth'] = max_depth
152
- params['criterion'] = criterion
153
- params['splitter'] = splitter
154
- try:
155
- random = st.text_input("Enter Random State")
156
- params['random_state'] = int(random)
157
- except:
158
- params['random_state'] = 4567
159
- elif algorithm == 'Linear Regression':
160
- fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
161
- params['fit_intercept'] = bool(fit_intercept)
162
- n_jobs = st.selectbox("Number of Jobs", (None, -1))
163
- params['n_jobs'] = n_jobs
164
- else:
165
- max_depth = st.slider('Max Depth', 2, 17)
166
- n_estimators = st.slider('Number of Estimators', 1, 90)
167
- criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
168
- params['max_depth'] = max_depth
169
- params['n_estimators'] = n_estimators
170
- params['criterion'] = criterion
171
- try:
172
- random = st.text_input("Enter Random State")
173
- params['random_state'] = int(random)
174
- except:
175
- params['random_state'] = 4567
176
- return params
177
-
178
- if (algorithm_type == "Regressor") and (algorithm == 'Decision Tree' or algorithm == 'Random Forest' or algorithm_type == "Linear Regression"):
179
- params = add_parameter_regressor(algorithm)
180
- else:
181
- params = add_parameter_classifier_general(algorithm)
182
-
183
- def model_classifier(algorithm, params):
184
- if algorithm == 'KNN':
185
- return KNeighborsClassifier(n_neighbors=params['K'], weights=params['weights'])
186
- elif algorithm == 'SVM':
187
- return SVC(C=params['C'], kernel=params['kernel'])
188
- elif algorithm == 'Decision Tree':
189
- return DecisionTreeClassifier(
190
- criterion=params['criterion'], splitter=params['splitter'],
191
- random_state=params['random_state'])
192
- elif algorithm == 'Naive Bayes':
193
- return GaussianNB()
194
- elif algorithm == 'Random Forest':
195
- return RandomForestClassifier(n_estimators=params['n_estimators'],
196
- max_depth=params['max_depth'],
197
- criterion=params['criterion'],
198
- random_state=params['random_state'])
199
- elif algorithm == 'Linear Regression':
200
- return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
201
- else:
202
- return LogisticRegression(fit_intercept=params['fit_intercept'],
203
- penalty=params['penalty'], C=params['C'], n_jobs=params['n_jobs'])
204
-
205
- def model_regressor(algorithm, params):
206
- if algorithm == 'KNN':
207
- return KNeighborsRegressor(n_neighbors=params['K'], weights=params['weights'])
208
- elif algorithm == 'SVM':
209
- return SVR(C=params['C'], kernel=params['kernel'])
210
- elif algorithm == 'Decision Tree':
211
- return DecisionTreeRegressor(
212
- criterion=params['criterion'], splitter=params['splitter'],
213
- random_state=params['random_state'])
214
- elif algorithm == 'Random Forest':
215
- return RandomForestRegressor(n_estimators=params['n_estimators'],
216
- max_depth=params['max_depth'],
217
- criterion=params['criterion'],
218
- random_state=params['random_state'])
219
- else:
220
- return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
221
-
222
-
223
- if algorithm_type == "Regressor":
224
- algo_model = model_regressor(algorithm, params)
225
- else:
226
- algo_model = model_classifier(algorithm, params)
227
-
228
- x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)
229
-
230
- algo_model.fit(x_train, y_train)
231
-
232
- predict = algo_model.predict(x_test)
233
-
234
- if algorithm != 'Linear Regression' and algorithm_type != 'Regressor':
235
- st.write("Training Accuracy is:", algo_model.score(x_train, y_train) * 100)
236
- st.write("Testing Accuracy is:", accuracy_score(y_test, predict) * 100)
237
- else:
238
- st.write("Mean Squared error is:", mean_squared_error(y_test, predict))
239
- st.write("Mean Absolute error is:", mean_absolute_error(y_test, predict))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Experimentation/dataCodeTest.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import pandas as pd
4
+
5
+ sys.path.append("..")
6
+
7
+ from Modules.data_code_run import DataCodeRun
8
+
9
+ # data = pd.read_csv("test_data.csv")
10
+
11
+ code_runner = DataCodeRun()
12
+
13
+ message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset.
14
+ The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .'''
15
+
16
+ response= code_runner.generate_code(message)
17
+ # print("Response:", response)
18
+
19
+
20
+ plan, python_code = code_runner.extract_code(response)
21
+
22
+ print(python_code)
Modules/data_QA.py CHANGED
@@ -5,17 +5,31 @@ import pandas as pd
5
  from dotenv import load_dotenv
6
  import os
7
 
 
 
 
8
  load_dotenv() # take environment variables from .env.
9
 
10
  class DataQA:
11
- def __init__(self, data):
12
- self.data = data
 
 
 
 
 
 
 
 
13
 
14
  def ask_csv(self):
15
- GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
16
- llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
17
- csv_agent = create_csv_agent(llm,"data.csv", verbose=True)
18
  question = st.text_input("Ask your question:")
 
19
  if question:
20
- response = csv_agent.invoke(question)
21
- st.write(response)
 
 
 
 
 
 
5
  from dotenv import load_dotenv
6
  import os
7
 
8
+ from Modules.data_code_run import DataCodeRun
9
+ from Modules.python_interpreter import PythonInterpreter, run_interpreter
10
+
11
  load_dotenv() # take environment variables from .env.
12
 
13
  class DataQA:
14
+ def __init__(self):
15
+ print("dataQA")
16
+ # def ask_csv(self):
17
+ # GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
18
+ # llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
19
+ # csv_agent = create_csv_agent(llm,"data.csv", verbose=True)
20
+ # question = st.text_input("Ask your question:")
21
+ # if question:
22
+ # response = csv_agent.invoke(question)
23
+ # st.write(response)
24
 
25
  def ask_csv(self):
 
 
 
26
  question = st.text_input("Ask your question:")
27
+ code_runner = DataCodeRun()
28
  if question:
29
+ response= code_runner.generate_code(question)
30
+ plan, python_code = code_runner.extract_code(response)
31
+ st.write(plan)
32
+ st.code(python_code)
33
+ if st.button("Run Code") and python_code:
34
+ interpreter_code_output = run_interpreter(python_code)
35
+ print("Python code output:\n", interpreter_code_output)
Modules/data_code_run.py CHANGED
@@ -11,50 +11,50 @@ class DataCodeRun:
11
  def __init__(self):
12
  pass
13
 
14
- def run_code(self,message):
15
- os.environ['GEMINI_API_KEY'] = os.getenv("GOOGLE_API_KEY")
 
16
 
17
  # message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset. The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .
18
  # '''
19
  output = completion(
20
  model="gemini/gemini-pro",
21
  messages=[
22
- {"role": "user", "content": "You are a computer with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with a plan, The commands you provide should be in a single code block encapsulated in '''python and ''' for Python and should be valid Python programs."},
23
- {"role": "assistant", "content": "I am a computer with the ability to run any code I want when I am given a prompt and return a response with a plan of what code I want to run I will start my response with a plan. The commands I provide should be in a single code block encapulated in '''python and ''' and should be a valid Python program."},
24
  {"role": "user", "content": message}
25
  ]
26
  )
27
 
28
  response = output.choices[0].message.content
 
29
 
30
- if response:
31
-
32
- # if True:
33
- print("Response:", response)
34
- plan = response.split("```python")[0]
35
- plan = plan.replace("'", "")
36
- plan = plan.replace('`', "")
37
- print("plan:", plan)
38
- # else:
39
- # print(response.choices[0].message.content)
40
- # # Extract plan from the response
41
- # plan = response.choices[0].message.content.split("```python")[0]
42
- # plan = plan.replace("'", "")
43
- # plan = plan.replace('`', "")
44
- # print("plan:", plan)
45
-
46
- if "```python" in response:
47
- python_code = response.split("```python")[1].split("```")[0].strip()
48
- print("Python code:", python_code)
49
- elif "```" in response:
50
- python_code = response.split("```")[1].split("```")[0].strip()
51
- print("Code found in the response but not Left out the word python:", python_code)
52
- elif "```python" in response.choices[0].message.content:
53
- python_code = response.choices[0].message.content.split(
54
- "```python")[1].split("```")[0].strip()
55
- print("Python code:", python_code)
56
-
57
 
58
- if python_code:
59
- interpreter_code_output = run_interpreter(python_code)
60
- print("Python code output:\n", interpreter_code_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def __init__(self):
12
  pass
13
 
14
+ @st.cache_data(experimental_allow_widgets=True)
15
+ def generate_code(_,message):
16
+ os.environ['GEMINI_API_KEY'] = "AIzaSyAPlmL2oeRaldWRf2viQINPd92_vm3QN6o"
17
 
18
  # message = '''generate the code to find the relation between 'Air temperature [K]' and 'Target' columns of the given dataset. The 'Target' column holds failure prediction values as 0 (no failure) and 1 (failure). the name of the dataset is test_data.csv .
19
  # '''
20
  output = completion(
21
  model="gemini/gemini-pro",
22
  messages=[
23
+ {"role": "user", "content": "You are a computer with the ability to run any code you want when you are given a prompt and return a response with a plan of what code you want to run. You should start your response with a plan that describes what the code is going do in detail, The commands you provide should be in a single code block encapsulated in '''python and ''' for Python and should be valid Python programs."},
24
+ {"role": "assistant", "content": "I am a computer with the ability to run any code I want when I am given a prompt and return a response with a plan of what code I want to run I will start my response with a plan that would be encapsulated in ```plan and ```. Afterwards, The commands I provide should be in a single code block encapulated in ```python and ``` and should be a valid Python program."},
25
  {"role": "user", "content": message}
26
  ]
27
  )
28
 
29
  response = output.choices[0].message.content
30
+ return response
31
 
32
+
33
+ def extract_code(self,response):
34
+ plan = response.split("```python")[0]
35
+ plan = plan.replace("'", "")
36
+ plan = plan.replace('`', "")
37
+ # else:
38
+ # print(response.choices[0].message.content)
39
+ # # Extract plan from the response
40
+ # plan = response.choices[0].message.content.split("```python")[0]
41
+ # plan = plan.replace("'", "")
42
+ # plan = plan.replace('`', "")
43
+ # print("plan:", plan)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
+ if "```python" in response:
46
+ python_code = response.split("```python")[1].split("```")[0].strip()
47
+ return plan,python_code
48
+ elif "```" in response:
49
+ python_code = response.split("```")[1].split("```")[0].strip()
50
+ print("Code found in the response but not Left out the word python:", python_code)
51
+ return plan,python_code
52
+ elif "```python" in response.choices[0].message.content:
53
+ python_code = response.choices[0].message.content.split(
54
+ "```python")[1].split("```")[0].strip()
55
+ return plan,python_code
56
+
57
+
58
+ # if python_code:
59
+ # interpreter_code_output = run_interpreter(python_code)
60
+ # print("Python code output:\n", interpreter_code_output)
app.py CHANGED
@@ -33,7 +33,7 @@ def main():
33
  with st.sidebar:
34
  selected = option_menu(
35
  menu_title="Main Menu",
36
- options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "MLtoolkit", "Data Party"])
37
 
38
  # --- DATA LOADER ---
39
  if selected == "Data Loader":
@@ -66,8 +66,13 @@ def main():
66
 
67
  # --- QUESTION AND ANSWER ---
68
  if selected == "Q/A":
69
- data_QA = DataQA(data)
70
- data_QA.ask_csv()
 
 
 
 
 
71
 
72
  if selected == "MLtoolkit":
73
  try:
 
33
  with st.sidebar:
34
  selected = option_menu(
35
  menu_title="Main Menu",
36
+ options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "MLtoolkit"])
37
 
38
  # --- DATA LOADER ---
39
  if selected == "Data Loader":
 
66
 
67
  # --- QUESTION AND ANSWER ---
68
  if selected == "Q/A":
69
+ try:
70
+ data_QA = DataQA()
71
+ data_QA.ask_csv()
72
+ except Exception as e:
73
+ # Handle the exception (e.g., logging, printing an error message, etc.)
74
+ print(f"An error occurred: {e}")
75
+
76
 
77
  if selected == "MLtoolkit":
78
  try: