Sadjad Alikhani committed on
Commit
bef8ac3
·
verified ·
1 Parent(s): 798b3c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -4
app.py CHANGED
@@ -112,6 +112,31 @@ def plot_confusion_matrix(y_true, y_pred, title):
112
  plt.savefig(f"{title}.png")
113
  return Image.open(f"{title}.png")
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  # Store the original working directory when the app starts
116
  original_dir = os.getcwd()
117
 
@@ -163,13 +188,20 @@ def process_hdf5_file(uploaded_file, percentage_idx):
163
  labels = np.array(f['labels']) # Assuming 'labels' dataset in the HDF5 file
164
  print(f"Loaded dataset with {channels.shape[0]} samples.")
165
 
166
- # Step 6: Split the dataset into training and test sets
167
- train_data_raw, test_data_raw, train_labels, test_labels = split_dataset(channels, labels, percentage_idx)
168
-
169
  # Step 7: Tokenize the data using the tokenizer from input_preprocess
170
  preprocessed_chs = input_preprocess.tokenizer(manual_data=channels)
171
- train_data_emb, test_data_emb, _, _ = split_dataset(preprocessed_chs, labels, percentage_idx)
172
 
 
 
 
 
 
 
 
 
 
 
 
173
  # Step 8: Perform classification using the Euclidean distance for both raw and embeddings
174
  pred_raw = classify_based_on_distance(train_data_raw, train_labels, test_data_raw)
175
  pred_emb = classify_based_on_distance(train_data_emb, train_labels, test_data_emb)
 
112
  plt.savefig(f"{title}.png")
113
  return Image.open(f"{title}.png")
114
 
115
def identical_train_test_split(output_emb, output_raw, percentage, labels=None):
    """Shuffle and split embeddings, raw samples, and labels with one shared permutation.

    Using a single random permutation for all three tensors keeps the rows of
    ``output_emb``, ``output_raw``, and ``labels`` aligned sample-for-sample
    across the train/test split.

    Args:
        output_emb: Tensor of embedding samples; first dimension is N samples.
        output_raw: Tensor of raw samples; must also have N rows.
        percentage: Fraction in [0, 1] of samples assigned to the training split.
        labels: Tensor of N labels. If omitted, falls back to the module-level
            ``labels`` (the original implementation read that global implicitly);
            pass it explicitly to avoid the hidden dependency.

    Returns:
        Tuple ``(train_emb, test_emb, train_raw, test_raw, train_labels,
        test_labels)``.
    """
    if labels is None:
        # Backward compatible: the original body referenced the module-level
        # ``labels`` loaded from the HDF5 file instead of taking a parameter.
        labels = globals()["labels"]

    n_samples = output_emb.shape[0]

    # One shared shuffled index set so all three tensors stay row-aligned.
    indices = torch.randperm(n_samples)

    # First `percentage` fraction for training, the remainder for testing.
    split_index = int(n_samples * percentage)
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]

    train_emb = output_emb[train_indices]
    test_emb = output_emb[test_indices]

    train_raw = output_raw[train_indices]
    test_raw = output_raw[test_indices]

    train_labels = labels[train_indices]
    test_labels = labels[test_indices]

    return train_emb, test_emb, train_raw, test_raw, train_labels, test_labels
139
+
140
  # Store the original working directory when the app starts
141
  original_dir = os.getcwd()
142
 
 
188
  labels = np.array(f['labels']) # Assuming 'labels' dataset in the HDF5 file
189
  print(f"Loaded dataset with {channels.shape[0]} samples.")
190
 
 
 
 
191
  # Step 7: Tokenize the data using the tokenizer from input_preprocess
192
  preprocessed_chs = input_preprocess.tokenizer(manual_data=channels)
 
193
 
194
+ # Step 7: Perform inference using the functions from inference.py
195
+ output_emb = inference.lwm_inference(preprocessed_chs, 'channel_emb', model)
196
+ output_raw = inference.create_raw_dataset(preprocessed_chs, device)
197
+
198
+ print(f"Output Embeddings Shape: {output_emb.shape}")
199
+ print(f"Output Raw Shape: {output_raw.shape}")
200
+
201
+ train_data_emb, test_data_emb, train_data_raw, test_data_raw, train_labels, test_labels = identical_train_test_split(output_emb.view(len(output_emb),-1),
202
+ output_raw.view(len(output_raw),-1),
203
+ percentage_idx)
204
+
205
  # Step 8: Perform classification using the Euclidean distance for both raw and embeddings
206
  pred_raw = classify_based_on_distance(train_data_raw, train_labels, test_data_raw)
207
  pred_emb = classify_based_on_distance(train_data_emb, train_labels, test_data_emb)