Sadjad Alikhani committed on
Update app.py
app.py CHANGED
@@ -112,6 +112,31 @@ def plot_confusion_matrix(y_true, y_pred, title):
     plt.savefig(f"{title}.png")
     return Image.open(f"{title}.png")
 
+def identical_train_test_split(output_emb, output_raw, labels, percentage):
+    N = output_emb.shape[0]  # Total number of samples
+
+    # Generate the indices for shuffling and splitting
+    indices = torch.randperm(N)  # Randomly shuffle the indices
+
+    # Calculate the split index from the training fraction
+    split_index = int(N * percentage)
+
+    # Split indices into train and test
+    train_indices = indices[:split_index]  # First `percentage` fraction for training
+    test_indices = indices[split_index:]  # Remainder for testing
+
+    # Apply the same indices to the embeddings, the raw data, and the labels
+    train_emb = output_emb[train_indices]
+    test_emb = output_emb[test_indices]
+
+    train_raw = output_raw[train_indices]
+    test_raw = output_raw[test_indices]
+
+    train_labels = labels[train_indices]
+    test_labels = labels[test_indices]
+
+    return train_emb, test_emb, train_raw, test_raw, train_labels, test_labels
+
 # Store the original working directory when the app starts
 original_dir = os.getcwd()
 
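For reference, a minimal usage sketch of the new helper (the tensor shapes, class count, and 0.8 fraction below are illustrative, not taken from app.py):

import torch

# Dummy data: 100 samples, 32-dim embeddings, 64-dim raw vectors, 4 classes
output_emb = torch.randn(100, 32)
output_raw = torch.randn(100, 64)
labels = torch.randint(0, 4, (100,))

# With percentage=0.8, 80 samples go to training and 20 to testing;
# the same shuffled indices index embeddings, raw data, and labels
train_emb, test_emb, train_raw, test_raw, train_labels, test_labels = \
    identical_train_test_split(output_emb, output_raw, labels, 0.8)

Because the helper draws its permutation from torch.randperm, the split changes between runs unless a seed is fixed with torch.manual_seed.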
@@ -163,13 +188,20 @@ def process_hdf5_file(uploaded_file, percentage_idx):
     labels = np.array(f['labels'])  # Assuming 'labels' dataset in the HDF5 file
     print(f"Loaded dataset with {channels.shape[0]} samples.")
 
-    # Step 6: Split the dataset into training and test sets
-    train_data_raw, test_data_raw, train_labels, test_labels = split_dataset(channels, labels, percentage_idx)
-
     # Step 7: Tokenize the data using the tokenizer from input_preprocess
     preprocessed_chs = input_preprocess.tokenizer(manual_data=channels)
-    train_data_emb, test_data_emb, _, _ = split_dataset(preprocessed_chs, labels, percentage_idx)
 
+    # Perform inference using the functions from inference.py
+    output_emb = inference.lwm_inference(preprocessed_chs, 'channel_emb', model)
+    output_raw = inference.create_raw_dataset(preprocessed_chs, device)
+
+    print(f"Output Embeddings Shape: {output_emb.shape}")
+    print(f"Output Raw Shape: {output_raw.shape}")
+
+    train_data_emb, test_data_emb, train_data_raw, test_data_raw, train_labels, test_labels = identical_train_test_split(
+        output_emb.view(len(output_emb), -1), output_raw.view(len(output_raw), -1),
+        labels, percentage_idx)
+
     # Step 8: Perform classification using the Euclidean distance for both raw and embeddings
     pred_raw = classify_based_on_distance(train_data_raw, train_labels, test_data_raw)
     pred_emb = classify_based_on_distance(train_data_emb, train_labels, test_data_emb)
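The hunk above relies on classify_based_on_distance, which is defined elsewhere in app.py and not shown in this diff. As a point of reference only, a minimal nearest-neighbor classifier consistent with the "Euclidean distance" comment might look like the following; the actual implementation may differ (for example, it could compare against per-class centroids instead):

import torch

def classify_based_on_distance(train_data, train_labels, test_data):
    # Pairwise Euclidean distances: rows are test samples, columns are training samples
    dists = torch.cdist(test_data.float(), train_data.float())
    # Each test sample takes the label of its closest training sample
    nearest = torch.argmin(dists, dim=1)
    return train_labels[nearest]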