Sadjad Alikhani committed on
Update app.py
app.py CHANGED
@@ -112,6 +112,31 @@ def plot_confusion_matrix(y_true, y_pred, title):
     plt.savefig(f"{title}.png")
     return Image.open(f"{title}.png")
 
+def identical_train_test_split(output_emb, output_raw, labels, percentage):
+    N = output_emb.shape[0]  # Total number of samples
+
+    # Generate the indices for shuffling and splitting
+    indices = torch.randperm(N)  # Randomly shuffle the indices
+
+    # Calculate the split index from the training fraction
+    split_index = int(N * percentage)
+
+    # Split indices into train and test
+    train_indices = indices[:split_index]  # First `percentage` fraction for training
+    test_indices = indices[split_index:]  # Remainder for testing
+
+    # Apply the same indices to the embeddings, the raw data, and the labels
+    train_emb = output_emb[train_indices]
+    test_emb = output_emb[test_indices]
+
+    train_raw = output_raw[train_indices]
+    test_raw = output_raw[test_indices]
+
+    train_labels = labels[train_indices]
+    test_labels = labels[test_indices]
+
+    return train_emb, test_emb, train_raw, test_raw, train_labels, test_labels
+
 # Store the original working directory when the app starts
 original_dir = os.getcwd()
 
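For reference, a minimal usage sketch of the new helper (the tensor shapes, class count, and 0.8 fraction below are illustrative, not taken from app.py):

import torch

# Dummy data: 100 samples, 32-dim embeddings, 64-dim raw vectors, 4 classes
output_emb = torch.randn(100, 32)
output_raw = torch.randn(100, 64)
labels = torch.randint(0, 4, (100,))

# With percentage=0.8, 80 samples go to training and 20 to testing;
# the same shuffled indices index embeddings, raw data, and labels
train_emb, test_emb, train_raw, test_raw, train_labels, test_labels = \
    identical_train_test_split(output_emb, output_raw, labels, 0.8)

Because the helper draws its permutation from torch.randperm, the split changes between runs unless a seed is fixed with torch.manual_seed.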
@@ -163,13 +188,20 @@ def process_hdf5_file(uploaded_file, percentage_idx):
     labels = np.array(f['labels'])  # Assuming 'labels' dataset in the HDF5 file
     print(f"Loaded dataset with {channels.shape[0]} samples.")
 
-    # Step 6: Split the dataset into training and test sets
-    train_data_raw, test_data_raw, train_labels, test_labels = split_dataset(channels, labels, percentage_idx)
-
     # Step 7: Tokenize the data using the tokenizer from input_preprocess
     preprocessed_chs = input_preprocess.tokenizer(manual_data=channels)
-    train_data_emb, test_data_emb, _, _ = split_dataset(preprocessed_chs, labels, percentage_idx)
 
+    # Perform inference using the functions from inference.py
+    output_emb = inference.lwm_inference(preprocessed_chs, 'channel_emb', model)
+    output_raw = inference.create_raw_dataset(preprocessed_chs, device)
+
+    print(f"Output Embeddings Shape: {output_emb.shape}")
+    print(f"Output Raw Shape: {output_raw.shape}")
+
+    train_data_emb, test_data_emb, train_data_raw, test_data_raw, train_labels, test_labels = identical_train_test_split(
+        output_emb.view(len(output_emb), -1), output_raw.view(len(output_raw), -1),
+        labels, percentage_idx)
+
     # Step 8: Perform classification using the Euclidean distance for both raw and embeddings
     pred_raw = classify_based_on_distance(train_data_raw, train_labels, test_data_raw)
     pred_emb = classify_based_on_distance(train_data_emb, train_labels, test_data_emb)
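The hunk above relies on classify_based_on_distance, which is defined elsewhere in app.py and not shown in this diff. As a point of reference only, a minimal nearest-neighbor classifier consistent with the "Euclidean distance" comment might look like the following; the actual implementation may differ (for example, it could compare against per-class centroids instead):

import torch

def classify_based_on_distance(train_data, train_labels, test_data):
    # Pairwise Euclidean distances: rows are test samples, columns are training samples
    dists = torch.cdist(test_data.float(), train_data.float())
    # Each test sample takes the label of its closest training sample
    nearest = torch.argmin(dists, dim=1)
    return train_labels[nearest]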