Milad Alshomary committed on
Commit ce8f806 · 1 Parent(s): f310e45
Files changed (2)
  1. cluster_corpus.py +7 -0
  2. utils/clustering_utils.py +31 -0
cluster_corpus.py CHANGED
@@ -72,6 +72,12 @@ def main():
         default=None,
         help="A list of specific eps values to test for DBSCAN. If not provided, a default range is used."
     )
+    parser.add_argument(
+        "--pca_dimensions",
+        type=int,
+        default=None,
+        help="If provided, apply PCA to reduce embeddings to this number of dimensions before clustering."
+    )
 
     args = parser.parse_args()
 
@@ -107,6 +113,7 @@ def main():
         embedding_clm=embedding_col_name,
         eps_values=args.eps_values,
         min_samples=args.min_samples,
+        pca_dimensions=args.pca_dimensions,
         metric=args.metric
     )
 
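For context, a minimal usage sketch of how the new flag is expected to flow from the command line into clustering_author(). Only the argument and parameter names come from this diff; the input file name, the corpus-loading call, and the exact definitions of the other arguments are illustrative assumptions, not code from the repository.

import argparse

import pandas as pd

from utils.clustering_utils import clustering_author

parser = argparse.ArgumentParser()
parser.add_argument("--eps_values", type=float, nargs="+", default=None)   # assumed shape
parser.add_argument("--min_samples", type=int, default=5)                  # assumed default
parser.add_argument("--metric", type=str, default="cosine")                # assumed default
parser.add_argument("--pca_dimensions", type=int, default=None,
                    help="If provided, apply PCA to reduce embeddings to this number of dimensions before clustering.")
args = parser.parse_args()

# Hypothetical corpus loading; the real script's input handling is not shown in this diff.
background_corpus_df = pd.read_json("background_corpus.jsonl", lines=True)

clustered_df = clustering_author(
    background_corpus_df,
    embedding_clm="style_embedding",
    eps_values=args.eps_values,
    min_samples=args.min_samples,
    pca_dimensions=args.pca_dimensions,
    metric=args.metric,
)
print(clustered_df["cluster_label"].value_counts())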
utils/clustering_utils.py CHANGED
@@ -66,6 +66,7 @@ def clustering_author(background_corpus_df: pd.DataFrame,
                       embedding_clm: str = 'style_embedding',
                       eps_values: List[float] = None,
                       min_samples: int = 5,
+                      pca_dimensions: int | None = None,
                       metric: str = 'cosine') -> pd.DataFrame:
     """
     Performs DBSCAN clustering on embeddings in a DataFrame.
@@ -83,6 +84,8 @@ def clustering_author(background_corpus_df: pd.DataFrame,
                         For 'euclidean', scale depends on embedding magnitudes.
         min_samples (int): DBSCAN `min_samples` parameter. Minimum number of
                            samples in a neighborhood for a point to be a core point.
+        pca_dimensions (int | None): If an integer is provided, PCA will be applied to reduce
+                                     embeddings to this number of dimensions before clustering.
         metric (str): The distance metric to use for DBSCAN and silhouette score
                       (e.g., 'cosine', 'euclidean').
 
@@ -139,6 +142,24 @@ def clustering_author(background_corpus_df: pd.DataFrame,
         background_corpus_df['cluster_label'] = final_labels_for_df
         return background_corpus_df
 
+    # --- Optional: Apply PCA for dimensionality reduction ---
+    if pca_dimensions is not None and X.shape[1] > pca_dimensions:
+        from sklearn.decomposition import PCA
+        print(f"Applying PCA to reduce dimensions from {X.shape[1]} to {pca_dimensions}...")
+        pca = PCA(n_components=pca_dimensions, random_state=42)
+        X = pca.fit_transform(X)
+
+        # If a test set is provided, transform its embeddings using the same PCA model
+        if test_corpus_df is not None:
+            test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
+            if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[1] == pca.n_features_in_:
+                print(f"Transforming test set embeddings with the same PCA model...")
+                transformed_test_embeddings = pca.transform(test_embeddings_matrix)
+                # Update the test DataFrame's embedding column with the reduced embeddings
+                test_corpus_df[embedding_clm] = list(transformed_test_embeddings)
+            else:
+                print("Warning: Could not apply PCA to test set due to dimension mismatch or invalid data.")
+
     # For cosine metric, normalize embeddings to unit length.
     # This is standard practice as cosine similarity is equivalent to Euclidean
     # distance on L2-normalized vectors. DBSCAN's 'cosine' metric internally
@@ -148,6 +169,16 @@ def clustering_author(background_corpus_df: pd.DataFrame,
         print("Normalizing embeddings for cosine distance...")
         X = normalize(X, norm='l2', axis=1)
 
+        # Also normalize the test corpus embeddings if they exist
+        if test_corpus_df is not None:
+            print("Normalizing test corpus embeddings for cosine distance...")
+            test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
+            if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[0] > 0:
+                normalized_test_embeddings = normalize(test_embeddings_matrix, norm='l2', axis=1)
+                test_corpus_df[embedding_clm] = list(normalized_test_embeddings)
+            else:
+                print("Warning: Could not normalize test set embeddings due to invalid data.")
+
     if eps_values is None:
         if metric == 'cosine':
             #eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
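Taken together, the clustering_utils.py changes implement a fit-on-background, transform-on-test PCA step followed by L2 normalization before DBSCAN with the cosine metric. Below is a self-contained sketch of that pattern using scikit-learn's public API; the array shapes and the eps/min_samples values are illustrative stand-ins, not values from the repository.

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

rng = np.random.default_rng(0)
X_background = rng.normal(size=(200, 384))  # stand-in for stacked background-corpus embeddings
X_test = rng.normal(size=(20, 384))         # stand-in for test-corpus embeddings

pca_dimensions = 50
if pca_dimensions is not None and X_background.shape[1] > pca_dimensions:
    pca = PCA(n_components=pca_dimensions, random_state=42)
    X_background = pca.fit_transform(X_background)   # fit the projection on the background corpus only
    if X_test.shape[1] == pca.n_features_in_:        # guard against a dimension mismatch
        X_test = pca.transform(X_test)               # reuse the same fitted projection

# L2-normalize so that, for unit vectors u and v, ||u - v||^2 = 2 * (1 - cos(u, v)),
# i.e. cosine similarity and Euclidean distance induce the same neighborhoods.
X_background = normalize(X_background, norm="l2", axis=1)
X_test = normalize(X_test, norm="l2", axis=1)

labels = DBSCAN(eps=0.4, min_samples=5, metric="cosine").fit_predict(X_background)
print(np.unique(labels, return_counts=True))

Fitting PCA on the background corpus alone and only transforming the test embeddings, as the diff does, keeps the test set out of the learned projection and guarantees both sets live in the same reduced space.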