AshmithaIRRI committed on commit 7e44a95 (verified) · 1 parent: c120492

Update app.py

Files changed (1):
  1. app.py +175 -101

app.py CHANGED
@@ -1,15 +1,27 @@
  # -*- coding: utf-8 -*-
  """
- Created on Wed Jan 15 10:25:34 2025

  @author: Ashmitha
  """

- #-----------------------------------------------------------Libraries----------------------------------------------------------------------------
  import pandas as pd
  import numpy as np
  import gradio as gr
- #! pip install scikit-learn
  from sklearn.metrics import mean_squared_error,r2_score
  from scipy.stats import pearsonr
  from sklearn.preprocessing import StandardScaler
@@ -30,104 +42,157 @@ import io
  from sklearn.feature_selection import SelectFromModel
  import tempfile

- #--------------------------------Random Forest for Feature selection-------------------------------------------
- def RandomForestFeatureSelection(trainX, trainy,num_features=60):
-     rf=RandomForestRegressor(n_estimators=1000,random_state=50)
-     rf.fit(trainX,trainy)
-     importances=rf.feature_importances_
-     indices=np.argsort(importances)[-num_features:]
-     return indices
- #------------------------------------------------------------------GRU model--------------------------------------------------
- def GRUModel(trainX,trainy,testX,testy,epochs=1000,batch_size=64,learning_rate=0.0001,l1_reg=0.001,l2_reg=0.001,dropout_rate=0.2,feature_selection=True):
      if feature_selection:
-         rf=RandomForestRegressor(n_estimators=100,random_state=42)
-         rf.fit(trainX,trainy)
-         selector=SelectFromModel(rf,threshold="mean",prefit=True)
-         trainX=selector.transform(trainX)
          if testX is not None:
-             testX=selector.transform(testX)
-         print(f"Selected {trainX.shape[1]} features based on feature importance")
-     scaler=MinMaxScaler()
-     trainX_scaled=scaler.fit_transform(trainX)
      if testX is not None:
-         testX_scaled=scaler.transform(testX)
-     target_scaler=MinMaxScaler()
-     trainy_scaled=target_scaler.fit_transform(trainy.reshape(-1,1))
-     trainX=trainX_scaled.reshape((trainX.shape[0],1,trainX.shape[1]))
      if testX is not None:
-         testX=testX_scaled.reshape((testX.shape[0],1,testX.shape[1]))
-     model=Sequential()
-     model.add(GRU(512, input_shape=(trainX.shape[1],trainX.shape[2]), return_sequences=False,kernel_regularizer=regularizers.l1_l2(l1=l1_reg,l2=l2_reg)))
-     model.add(Dense(256,kernel_initializer='he_normal',kernel_regularizer=regularizers.l1_l2(l1=l1_reg,l2=l2_reg)))
      model.add(BatchNormalization())
      model.add(Dropout(dropout_rate))
      model.add(LeakyReLU(alpha=0.1))
-
-     model.add(Dense(128,kernel_initializer="he_normal",kernel_regularizer=regularizers.l1_l2(l1=l1_reg,l2=l2_reg)))
      model.add(BatchNormalization())
      model.add(Dropout(dropout_rate))
      model.add(LeakyReLU(alpha=0.1))

-     model.add(Dense(64,kernel_initializer='he_normal',kernel_regularizer=regularizers.l1_l2(l1=l1_reg,l2=l2_reg)))
      model.add(BatchNormalization())
      model.add(Dropout(dropout_rate))
      model.add(LeakyReLU(alpha=0.1))

-     model.add(Dense(32,kernel_initializer='he_normal',kernel_regularizer=regularizers.l1_l2(l1=l1_reg,l2=l2_reg)))
      model.add(BatchNormalization())
      model.add(Dropout(dropout_rate))
      model.add(LeakyReLU(alpha=0.1))

-     model.add(Dense(1,activation="relu"))
-     model.compile(loss="mse",optimizer=Adam(learning_rate=learning_rate),metrics=["mse"])
-     learning_rate_reduction=ReduceLROnPlateau(monitor="val_loss",patience=10,verbose=1,factor=0.5,min_lr=1e-6)
-     early_stopping=EarlyStopping(monitor='val_loss',verbose=1,restore_best_weights=True,patience=10)
      history = model.fit(trainX, trainy_scaled, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1,
                          callbacks=[learning_rate_reduction, early_stopping])
-     predicted_train=model.predict(trainX)
-     predicted_test=model.predict(testX) if testX is not None else None
-     predicted_train=model.predict(trainX)
-     predicted_test=model.predict(testX) if testX is not None else None
-     predicted_train=predicted_train.flatten()
      if predicted_test is not None:
-         predicted_test =predicted_test.flatten()
      else:
-         predicted_test=np.zeros_like(predicted_train)
-     predicted_train=target_scaler.inverse_transform(predicted_train.reshape(-1,1)).flatten()
      if predicted_test is not None:
-         predicted_test=target_scaler.inverse_transform(predicted_test.reshape(-1,1).flatten())
-     return predicted_train.predicted_test,history
- #----------------------------------------------------CNN-----------------------------------------------
- def CNNModel(trainX,trainy,testX,testy,epochs=1000,batch_size=64,learning_rate=0.0001,l1_reg=0.0001,l2_reg=0.0001,dropout_rate=0.3,feature_selection=True):
      if feature_selection:
          rf=RandomForestRegressor(n_estimators=100,random_state=42)
          rf.fit(trainX,trainy)
-         selector=SelectFromModel(rf,threshold="mean",prefit=True)
          trainX=selector.transform(trainX)
          if testX is not None:
              testX=selector.transform(testX)
-         print(f"Selected {trainX.shape[1]} feature based on the importance feature")
-     scaler=MinMaxScaler()
-     trainX_scaled=scaler.fit.transform(trainX)
      if testX is not None:
-         testX_scaled=scaler.transfom(testX)
-     trainX=trainX_scaled.reshape((trainX.shape[0], trainX.shape[1],1))
      if testX is not None:
-         testX = testX_scaled.reshape((testX.shape[0]),testX.shape[1],1)
-     model=Sequential()
-     model.add(Conv1D(512, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
-     model.add(MaxPooling1D(pool_size=2))
-     model.add(Dropout(dropout_rate))

      model.add(Conv1D(256, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(MaxPooling1D(pool_size=2))
      model.add(Dropout(dropout_rate))

-     model.add(Conv1D(128, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(MaxPooling1D(pool_size=2))
      model.add(Dropout(dropout_rate))

      model.add(Flatten())
      model.add(Dense(64, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(LeakyReLU(alpha=0.1))
@@ -135,14 +200,14 @@ def CNNModel(trainX,trainy,testX,testy,epochs=1000,batch_size=64,learning_rate=0

      model.add(Dense(1, activation='linear'))

-
      model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate), metrics=['mse'])

-
      learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=1, factor=0.5, min_lr=1e-6)
      early_stopping = EarlyStopping(monitor='val_loss', verbose=1, restore_best_weights=True, patience=10)

-
      history = model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1,
                          callbacks=[learning_rate_reduction, early_stopping])

@@ -150,8 +215,8 @@ def CNNModel(trainX,trainy,testX,testy,epochs=1000,batch_size=64,learning_rate=0
      predicted_test = model.predict(testX).flatten() if testX is not None else None

      return predicted_train, predicted_test, history
- #-------------------------------------------------------------------RFModel---------------------------------------------------------

  def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None,feature_selection=True):
      if feature_selection:
          rf=RandomForestRegressor(n_estimators=100, random_state=42)
@@ -163,20 +228,20 @@ def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None,featu
          print(f"Selected {trainX.shape[1]} feature based on the feature selection")


-
-
      scaler = MinMaxScaler()
      trainX_scaled = scaler.fit_transform(trainX)
      if testX is not None:
          testX_scaled = scaler.transform(testX)

-
      rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
      history=rf_model.fit(trainX_scaled, trainy)

-
      predicted_train = rf_model.predict(trainX_scaled)
      predicted_test = rf_model.predict(testX_scaled) if testX is not None else None

@@ -193,14 +258,20 @@ def XGBoostModel(trainX, trainy, testX, testy,learning_rate,min_child_weight,fea
          print(f"Selected {trainX.shape[1]} features based on feature importance")

-
-
      scaler = MinMaxScaler()
      trainX_scaled = scaler.fit_transform(trainX)
      if testX is not None:
          testX_scaled = scaler.transform(testX)

-
      xgb_model=XGBRegressor(objective="reg:squarederror",random_state=42)
      history=xgb_model.fit(trainX, trainy)
      param_grid={
@@ -229,12 +300,12 @@ def XGBoostModel(trainX, trainy, testX, testy,learning_rate,min_child_weight,fea


-
  def read_csv_file(uploaded_file):
      if uploaded_file is not None:
-         if hasattr(uploaded_file, 'data'):
              return pd.read_csv(io.BytesIO(uploaded_file.data))
-         elif hasattr(uploaded_file, 'name'):
              return pd.read_csv(uploaded_file.name)
      return None

@@ -243,31 +314,31 @@ def read_csv_file(uploaded_file):

  def calculate_topsis_score(df):
-
      metrics = df[['Train_MSE', 'Train_RMSE', 'Train_R2', 'Train_Corr']].dropna() # Ensure no NaN values
      norm_metrics = metrics / np.sqrt((metrics ** 2).sum(axis=0))

-
      ideal_best = pd.Series(index=norm_metrics.columns)
      ideal_worst = pd.Series(index=norm_metrics.columns)

-
      for col in ['Train_MSE', 'Train_RMSE']:
          ideal_best[col] = norm_metrics[col].min()
          ideal_worst[col] = norm_metrics[col].max()

-
      for col in ['Train_R2', 'Train_Corr']:
          ideal_best[col] = norm_metrics[col].max()
          ideal_worst[col] = norm_metrics[col].min()

-
      dist_to_best = np.sqrt(((norm_metrics - ideal_best) ** 2).sum(axis=1))
      dist_to_worst = np.sqrt(((norm_metrics - ideal_worst) ** 2).sum(axis=1))

-
      topsis_score = dist_to_worst / (dist_to_best + dist_to_worst)
-     df['TOPSIS_Score'] = np.nan
      df.loc[metrics.index, 'TOPSIS_Score'] = topsis_score # Assign TOPSIS scores
      return df

@@ -281,7 +352,7 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
      if 'phenotypes' not in training_data.columns:
          raise ValueError("Training data does not contain the 'phenotypes' column.")

-
      training_additive = training_additive.iloc[:, 1:]
      testing_additive = testing_additive.iloc[:, 1:]
      training_dominance = training_dominance.iloc[:, 1:]
@@ -298,7 +369,7 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
      training_genotypic_data_merged = training_data_merged.iloc[:, 2:].values
      testing_genotypic_data_merged = testing_data_merged.iloc[:, 2:].values

-
      if feature_selection:
          rf = RandomForestRegressor(n_estimators=100, random_state=42)
          rf.fit(training_genotypic_data_merged, phenotypic_info)
@@ -307,7 +378,7 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
          testing_genotypic_data_merged = selector.transform(testing_genotypic_data_merged)
          print(f"Selected {training_genotypic_data_merged.shape[1]} features based on importance.")

-
      scaler = StandardScaler()
      training_genotypic_data_merged = scaler.fit_transform(training_genotypic_data_merged)
      testing_genotypic_data_merged = scaler.transform(testing_genotypic_data_merged)
@@ -348,7 +419,7 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
              predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy,learning_rate,min_child_weight)

-
              mse_train, rmse_train, r2_train, corr_train = calculate_metrics(outer_trainy, predicted_train)
              mse_test, rmse_test, r2_test, corr_test = calculate_metrics(outer_testy, predicted_test) if outer_testy is not None else (None, None, None, None)

@@ -373,9 +444,10 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
              })
              all_predicted_phenotypes.append(predicted_test_df)

      results_df = pd.DataFrame(results)

-
      avg_results_df = results_df.groupby('Model').agg({
          'Train_MSE': 'mean',
          'Train_RMSE': 'mean',
@@ -387,33 +459,33 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
          'Test_Corr': 'mean'
      }).reset_index()

-
      def calculate_topsis_score(df):
-
          norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())

-
          ideal_positive = norm_df.max(axis=0)
          ideal_negative = norm_df.min(axis=0)

-
          dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
          dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))

-
          topsis_score = dist_negative / (dist_positive + dist_negative)

-
          df['TOPSIS_Score'] = topsis_score

          return df

      avg_results_df = calculate_topsis_score(avg_results_df)

-
      avg_results_df.to_csv(output_file, index=False)

-
      if all_predicted_phenotypes:
          predicted_all_df = pd.concat(all_predicted_phenotypes, axis=0, ignore_index=True)
          predicted_all_df.to_csv(predicted_phenotype_file, index=False)
@@ -425,16 +497,17 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
  def run_cross_validation(training_file, training_additive_file, testing_file, testing_additive_file,
                           training_dominance_file, testing_dominance_file,feature_selection,learning_rate,min_child_weight):

-
      epochs = 1000
      batch_size = 64
      outer_n_splits = 2
      inner_n_splits = 2
      min_child_weight=5
      learning_rate=0.001
-

-
      training_data = pd.read_csv(training_file.name)
      training_additive = pd.read_csv(training_additive_file.name)
      testing_data = pd.read_csv(testing_file.name)
@@ -442,7 +515,7 @@ def run_cross_validation(training_file, training_additive_file, testing_file, te
      training_dominance = pd.read_csv(training_dominance_file.name)
      testing_dominance = pd.read_csv(testing_dominance_file.name)

-
      results, predicted_phenotypes = NestedKFoldCrossValidation(
          training_data=training_data,
          training_additive=training_additive,
@@ -459,7 +532,7 @@ def run_cross_validation(training_file, training_additive_file, testing_file, te
          feature_selection=feature_selection
      )

-
      results_file = "cross_validation_results.csv"
      predicted_file = "predicted_phenotype.csv"
      results.to_csv(results_file, index=False)
@@ -467,6 +540,7 @@ def run_cross_validation(training_file, training_additive_file, testing_file, te

      return results_file, predicted_file

  with gr.Blocks() as interface:
      gr.Markdown("# DeepMap - An Integrated GUI for Genotype to Phenotype Prediction")

@@ -497,8 +571,8 @@ with gr.Blocks() as interface:
          outputs=[output1, output2]
      )

-
- interface.launch(share=True)

  # -*- coding: utf-8 -*-
  """
+ Created on Sun Nov 24 12:47:37 2024
+
+ @author: Ashmitha
+ """
+
+ # -*- coding: utf-8 -*-
+ """
+ Created on Sun Nov 24 12:25:57 2024
+
+ @author: Ashmitha
+ """
+
+ # -*- coding: utf-8 -*-
+ """
+ Created on Sat Nov 9 15:44:40 2024

  @author: Ashmitha
  """

  import pandas as pd
  import numpy as np
  import gradio as gr
  from sklearn.metrics import mean_squared_error,r2_score
  from scipy.stats import pearsonr
  from sklearn.preprocessing import StandardScaler
  from sklearn.feature_selection import SelectFromModel
  import tempfile

+ #-------------------------------------Feature selection---------------------------------------------------------------------------------------------
+ def RandomForestFeatureSelection(trainX, trainy, num_features=60):
+     rf = RandomForestRegressor(n_estimators=1000, random_state=50)
+     rf.fit(trainX, trainy)
+
+     # Get feature importances
+     importances = rf.feature_importances_
+
+     # Select the top N important features
+     indices = np.argsort(importances)[-num_features:]
+     return indices
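
A minimal usage sketch for the helper above, which returns column indices rather than a fitted transformer (hypothetical names; X_train, y_train, X_test are assumed NumPy arrays, not part of this commit):

    # Keep only the 60 highest-importance markers in both splits
    top_idx = RandomForestFeatureSelection(X_train, y_train, num_features=60)
    X_train_sel = X_train[:, top_idx]
    X_test_sel = X_test[:, top_idx]

Note that np.argsort(importances)[-num_features:] yields the top indices in ascending order of importance; the ordering is harmless for column selection.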
+ #----------------------------------------------------------GRU Model---------------------------------------------------------------------
+ import numpy as np
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.layers import GRU, Dense, BatchNormalization, Dropout, LeakyReLU
+ from tensorflow.keras.optimizers import Adam
+ from tensorflow.keras import regularizers
+ from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
+ from sklearn.preprocessing import MinMaxScaler
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.feature_selection import SelectFromModel
+
+ def GRUModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2, feature_selection=True):
+
+     # Apply feature selection using Random Forest Regressor
      if feature_selection:
+         # Use RandomForestRegressor to rank features by importance
+         rf = RandomForestRegressor(n_estimators=100, random_state=42)
+         rf.fit(trainX, trainy)
+
+         # Select features with importance greater than a threshold (e.g., mean importance)
+         selector = SelectFromModel(rf, threshold="mean", prefit=True)
+         trainX = selector.transform(trainX)
          if testX is not None:
+             testX = selector.transform(testX)
+         print(f"Selected {trainX.shape[1]} features based on feature importance.")
+
+     # Scale the input data using MinMaxScaler to normalize the feature range
+     scaler = MinMaxScaler()
+     trainX_scaled = scaler.fit_transform(trainX)
      if testX is not None:
+         testX_scaled = scaler.transform(testX)
+
+     # Scale the target variable using MinMaxScaler
+     target_scaler = MinMaxScaler()
+     trainy_scaled = target_scaler.fit_transform(trainy.reshape(-1, 1))  # Reshape to 2D for scaler
+
+     # Reshape trainX and testX to be 3D: (samples, timesteps, features)
+     trainX = trainX_scaled.reshape((trainX.shape[0], 1, trainX.shape[1]))  # Adjusted for general feature count
      if testX is not None:
+         testX = testX_scaled.reshape((testX.shape[0], 1, testX.shape[1]))  # Reshape testX if it exists
+
+     model = Sequential()
+
+     # GRU Layer
+     model.add(GRU(512, input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=False, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+
+     # Dense Layers with Batch Normalization, Dropout, LeakyReLU
+     model.add(Dense(256, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(BatchNormalization())
      model.add(Dropout(dropout_rate))
      model.add(LeakyReLU(alpha=0.1))
+
+     model.add(Dense(128, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(BatchNormalization())
      model.add(Dropout(dropout_rate))
      model.add(LeakyReLU(alpha=0.1))

+     model.add(Dense(64, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(BatchNormalization())
      model.add(Dropout(dropout_rate))
      model.add(LeakyReLU(alpha=0.1))

+     model.add(Dense(32, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(BatchNormalization())
      model.add(Dropout(dropout_rate))
      model.add(LeakyReLU(alpha=0.1))

+     # Output Layer with ReLU activation to prevent negative predictions
+     model.add(Dense(1, activation="relu"))
+
+     # Compile the model
+     model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate), metrics=['mse'])
+
+     # Callbacks for learning rate reduction and early stopping
+     learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=10, verbose=1, factor=0.5, min_lr=1e-6)
+     early_stopping = EarlyStopping(monitor='val_loss', verbose=1, restore_best_weights=True, patience=10)
+
+     # Train the model
      history = model.fit(trainX, trainy_scaled, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1,
                          callbacks=[learning_rate_reduction, early_stopping])
+
+     # Predict train and test
+     predicted_train = model.predict(trainX)
+     predicted_test = model.predict(testX) if testX is not None else None
+
+     # Flatten predictions
+     predicted_train = predicted_train.flatten()
      if predicted_test is not None:
+         predicted_test = predicted_test.flatten()
      else:
+         predicted_test = np.zeros_like(predicted_train)
+
+     # Inverse scale the predictions to get them back to original range
+     predicted_train = target_scaler.inverse_transform(predicted_train.reshape(-1, 1)).flatten()
      if predicted_test is not None:
+         predicted_test = target_scaler.inverse_transform(predicted_test.reshape(-1, 1)).flatten()
+
+     return predicted_train, predicted_test, history
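
A minimal smoke-test call for the GRU path above (illustrative sketch; X_train, y_train, X_test, y_test are assumed NumPy arrays, and the small epoch count is only for a quick check):

    pred_train, pred_test, hist = GRUModel(
        X_train, y_train, X_test, y_test,
        epochs=50, batch_size=64, learning_rate=1e-4,
        l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2,
        feature_selection=True,
    )
    print(hist.history["val_loss"][-1])  # validation_split=0.1 makes val_loss available

Since EarlyStopping is configured with restore_best_weights=True, the returned predictions come from the best-validation-loss weights rather than the final epoch.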
+
+
+
+
+ #-----------------------------------------------------------DeepMap-------------------------------------------------------------------------------
+ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.0001, l2_reg=0.0001, dropout_rate=0.3,feature_selection=True):
      if feature_selection:
          rf=RandomForestRegressor(n_estimators=100,random_state=42)
          rf.fit(trainX,trainy)
+
+         selector=SelectFromModel(rf, threshold="mean",prefit=True)
          trainX=selector.transform(trainX)
          if testX is not None:
              testX=selector.transform(testX)
+         print(f"Selected {trainX.shape[1]} feature based on the important feature")
+
+     # Scaling the inputs
+     scaler = MinMaxScaler()
+     trainX_scaled = scaler.fit_transform(trainX)
      if testX is not None:
+         testX_scaled = scaler.transform(testX)
+
+     # Reshape for CNN input (samples, features, channels)
+     trainX = trainX_scaled.reshape((trainX.shape[0], trainX.shape[1], 1))
      if testX is not None:
+         testX = testX_scaled.reshape((testX.shape[0], testX.shape[1], 1))
+
+     model = Sequential()

+     # Convolutional layers
      model.add(Conv1D(256, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(MaxPooling1D(pool_size=2))
      model.add(Dropout(dropout_rate))

+     model.add(Conv1D(128, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(MaxPooling1D(pool_size=2))
      model.add(Dropout(dropout_rate))

+     # Flatten and Dense layers
      model.add(Flatten())
      model.add(Dense(64, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
      model.add(LeakyReLU(alpha=0.1))

      model.add(Dense(1, activation='linear'))

+     # Compile the model
      model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate), metrics=['mse'])

+     # Callbacks
      learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=1, factor=0.5, min_lr=1e-6)
      early_stopping = EarlyStopping(monitor='val_loss', verbose=1, restore_best_weights=True, patience=10)

+     # Train the model
      history = model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1,
                          callbacks=[learning_rate_reduction, early_stopping])

      predicted_test = model.predict(testX).flatten() if testX is not None else None

      return predicted_train, predicted_test, history

+ #-------------------------------------------------------------------------Random Forest----------------------------------------------------
  def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None,feature_selection=True):
      if feature_selection:
          rf=RandomForestRegressor(n_estimators=100, random_state=42)

          print(f"Selected {trainX.shape[1]} feature based on the feature selection")

+     # Log transformation of the target variable
+
+     # Scaling the feature data
      scaler = MinMaxScaler()
      trainX_scaled = scaler.fit_transform(trainX)
      if testX is not None:
          testX_scaled = scaler.transform(testX)

+     # Define and train the RandomForest model
      rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
      history=rf_model.fit(trainX_scaled, trainy)

+     # Predictions
      predicted_train = rf_model.predict(trainX_scaled)
      predicted_test = rf_model.predict(testX_scaled) if testX is not None else None

          print(f"Selected {trainX.shape[1]} features based on feature importance")

+     #trainy_log = np.log1p(trainy)  # Log-transform to handle large phenotypic values
+     #if testy is not None:
+     #    testy_log = np.log1p(testy)
+
+     # Scale the features
      scaler = MinMaxScaler()
      trainX_scaled = scaler.fit_transform(trainX)
      if testX is not None:
          testX_scaled = scaler.transform(testX)

+     # Define and train the XGBoost model
+     # xgb_model = XGBRegressor(n_estimators=n_estimators, max_depth=100, random_state=42)
+     #xgb_model = XGBRegressor(objective ='reg:linear',
+     #                         n_estimators = 100, seed = 100)
      xgb_model=XGBRegressor(objective="reg:squarederror",random_state=42)
      history=xgb_model.fit(trainX, trainy)
      param_grid={

+ # Helper function to read the uploaded CSV file
  def read_csv_file(uploaded_file):
      if uploaded_file is not None:
+         if hasattr(uploaded_file, 'data'):  # For NamedBytes
              return pd.read_csv(io.BytesIO(uploaded_file.data))
+         elif hasattr(uploaded_file, 'name'):  # For NamedString
              return pd.read_csv(uploaded_file.name)
      return None
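
A hedged usage note: in recent Gradio versions a gr.File input is typically delivered to the callback as an object exposing a .name temp-file path, so the .data branch mainly guards older upload types. A defensive caller might look like this (sketch, not part of the commit; 'uploaded' is a hypothetical gr.File value):

    df = read_csv_file(uploaded)
    if df is None:
        raise gr.Error("Please upload a CSV file.")  # Gradio's user-facing error type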

  def calculate_topsis_score(df):
+     # Normalize the metrics
      metrics = df[['Train_MSE', 'Train_RMSE', 'Train_R2', 'Train_Corr']].dropna() # Ensure no NaN values
      norm_metrics = metrics / np.sqrt((metrics ** 2).sum(axis=0))

+     # Define ideal best and worst for each metric
      ideal_best = pd.Series(index=norm_metrics.columns)
      ideal_worst = pd.Series(index=norm_metrics.columns)

+     # For RMSE and MSE (minimization criteria): min is best, max is worst
      for col in ['Train_MSE', 'Train_RMSE']:
          ideal_best[col] = norm_metrics[col].min()
          ideal_worst[col] = norm_metrics[col].max()

+     # For R2 and Corr (maximization criteria): max is best, min is worst
      for col in ['Train_R2', 'Train_Corr']:
          ideal_best[col] = norm_metrics[col].max()
          ideal_worst[col] = norm_metrics[col].min()

+     # Calculate Euclidean distance to ideal best and worst
      dist_to_best = np.sqrt(((norm_metrics - ideal_best) ** 2).sum(axis=1))
      dist_to_worst = np.sqrt(((norm_metrics - ideal_worst) ** 2).sum(axis=1))

+     # Calculate TOPSIS score
      topsis_score = dist_to_worst / (dist_to_best + dist_to_worst)
+     df['TOPSIS_Score'] = np.nan  # Initialize with NaN
      df.loc[metrics.index, 'TOPSIS_Score'] = topsis_score # Assign TOPSIS scores
      return df
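
A tiny worked example of this scoring on fabricated numbers (illustrative only, not data from the app):

    demo = pd.DataFrame({
        'Model':      ['GRU', 'CNN'],
        'Train_MSE':  [0.10, 0.20],
        'Train_RMSE': [0.32, 0.45],
        'Train_R2':   [0.85, 0.70],
        'Train_Corr': [0.92, 0.84],
    })
    demo = calculate_topsis_score(demo)
    print(demo[['Model', 'TOPSIS_Score']])  # scores lie in [0, 1]; higher = closer to the ideal

Because the first row is better on every criterion here, it sits at the ideal point and scores 1.0 while the second row scores 0.0; with mixed wins the scores fall strictly between.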

      if 'phenotypes' not in training_data.columns:
          raise ValueError("Training data does not contain the 'phenotypes' column.")

+     # Remove Sample ID columns from additive and dominance data
      training_additive = training_additive.iloc[:, 1:]
      testing_additive = testing_additive.iloc[:, 1:]
      training_dominance = training_dominance.iloc[:, 1:]

      training_genotypic_data_merged = training_data_merged.iloc[:, 2:].values
      testing_genotypic_data_merged = testing_data_merged.iloc[:, 2:].values

+     # Feature selection
      if feature_selection:
          rf = RandomForestRegressor(n_estimators=100, random_state=42)
          rf.fit(training_genotypic_data_merged, phenotypic_info)

          testing_genotypic_data_merged = selector.transform(testing_genotypic_data_merged)
          print(f"Selected {training_genotypic_data_merged.shape[1]} features based on importance.")

+     # Standardize the genotypic data
      scaler = StandardScaler()
      training_genotypic_data_merged = scaler.fit_transform(training_genotypic_data_merged)
      testing_genotypic_data_merged = scaler.transform(testing_genotypic_data_merged)

              predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy,learning_rate,min_child_weight)

+             # Calculate metrics
              mse_train, rmse_train, r2_train, corr_train = calculate_metrics(outer_trainy, predicted_train)
              mse_test, rmse_test, r2_test, corr_test = calculate_metrics(outer_testy, predicted_test) if outer_testy is not None else (None, None, None, None)

              })
              all_predicted_phenotypes.append(predicted_test_df)

+     # Compile results
      results_df = pd.DataFrame(results)

+     # Calculate the average metrics for each model
      avg_results_df = results_df.groupby('Model').agg({
          'Train_MSE': 'mean',
          'Train_RMSE': 'mean',

          'Test_Corr': 'mean'
      }).reset_index()

+     # Calculate the TOPSIS score for the average metrics (considering only MSE, RMSE, R², and Correlation)
      def calculate_topsis_score(df):
+         # Normalize the data
          norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())

+         # Calculate the positive and negative ideal solutions
          ideal_positive = norm_df.max(axis=0)
          ideal_negative = norm_df.min(axis=0)

+         # Calculate the Euclidean distances
          dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
          dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))

+         # Calculate the TOPSIS score
          topsis_score = dist_negative / (dist_positive + dist_negative)

+         # Add the TOPSIS score to the dataframe
          df['TOPSIS_Score'] = topsis_score

          return df

      avg_results_df = calculate_topsis_score(avg_results_df)

+     # Save the results with TOPSIS scores to the file
      avg_results_df.to_csv(output_file, index=False)

+     # Save predicted phenotypes
      if all_predicted_phenotypes:
          predicted_all_df = pd.concat(all_predicted_phenotypes, axis=0, ignore_index=True)
          predicted_all_df.to_csv(predicted_phenotype_file, index=False)

  def run_cross_validation(training_file, training_additive_file, testing_file, testing_additive_file,
                           training_dominance_file, testing_dominance_file,feature_selection,learning_rate,min_child_weight):

+     # Default parameters
      epochs = 1000
      batch_size = 64
      outer_n_splits = 2
      inner_n_splits = 2
      min_child_weight=5
      learning_rate=0.001
+     #learning_rate=learning_rate
+     # min_child_weight=min_child_weight

+     # Load datasets
      training_data = pd.read_csv(training_file.name)
      training_additive = pd.read_csv(training_additive_file.name)
      testing_data = pd.read_csv(testing_file.name)

      training_dominance = pd.read_csv(training_dominance_file.name)
      testing_dominance = pd.read_csv(testing_dominance_file.name)

+     # Call the cross-validation function
      results, predicted_phenotypes = NestedKFoldCrossValidation(
          training_data=training_data,
          training_additive=training_additive,

          feature_selection=feature_selection
      )

+     # Save outputs
      results_file = "cross_validation_results.csv"
      predicted_file = "predicted_phenotype.csv"
      results.to_csv(results_file, index=False)

      return results_file, predicted_file

+ # Gradio interface
  with gr.Blocks() as interface:
      gr.Markdown("# DeepMap - An Integrated GUI for Genotype to Phenotype Prediction")

          outputs=[output1, output2]
      )

+ # Launch the interface
+ interface.launch()