Milad Alshomary committed on
Commit ce8f806 · 1 Parent(s): f310e45
Files changed (2)
  1. cluster_corpus.py +7 -0
  2. utils/clustering_utils.py +31 -0
cluster_corpus.py CHANGED
@@ -72,6 +72,12 @@ def main():
         default=None,
         help="A list of specific eps values to test for DBSCAN. If not provided, a default range is used."
     )
+    parser.add_argument(
+        "--pca_dimensions",
+        type=int,
+        default=None,
+        help="If provided, apply PCA to reduce embeddings to this number of dimensions before clustering."
+    )
 
     args = parser.parse_args()
 
@@ -107,6 +113,7 @@ def main():
         embedding_clm=embedding_col_name,
         eps_values=args.eps_values,
         min_samples=args.min_samples,
+        pca_dimensions=args.pca_dimensions,
         metric=args.metric
     )
 
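For context, a minimal usage sketch of how the new flag is expected to flow from the command line into clustering_author(). Only the argument and parameter names come from this diff; the input file name, the corpus-loading call, and the exact definitions of the other arguments are illustrative assumptions, not code from the repository.

import argparse

import pandas as pd

from utils.clustering_utils import clustering_author

parser = argparse.ArgumentParser()
parser.add_argument("--eps_values", type=float, nargs="+", default=None)   # assumed shape
parser.add_argument("--min_samples", type=int, default=5)                  # assumed default
parser.add_argument("--metric", type=str, default="cosine")                # assumed default
parser.add_argument("--pca_dimensions", type=int, default=None,
                    help="If provided, apply PCA to reduce embeddings to this number of dimensions before clustering.")
args = parser.parse_args()

# Hypothetical corpus loading; the real script's input handling is not shown in this diff.
background_corpus_df = pd.read_json("background_corpus.jsonl", lines=True)

clustered_df = clustering_author(
    background_corpus_df,
    embedding_clm="style_embedding",
    eps_values=args.eps_values,
    min_samples=args.min_samples,
    pca_dimensions=args.pca_dimensions,
    metric=args.metric,
)
print(clustered_df["cluster_label"].value_counts())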
utils/clustering_utils.py CHANGED
@@ -66,6 +66,7 @@ def clustering_author(background_corpus_df: pd.DataFrame,
                       embedding_clm: str = 'style_embedding',
                       eps_values: List[float] = None,
                       min_samples: int = 5,
+                      pca_dimensions: int | None = None,
                       metric: str = 'cosine') -> pd.DataFrame:
     """
     Performs DBSCAN clustering on embeddings in a DataFrame.
@@ -83,6 +84,8 @@ def clustering_author(background_corpus_df: pd.DataFrame,
                         For 'euclidean', scale depends on embedding magnitudes.
         min_samples (int): DBSCAN `min_samples` parameter. Minimum number of
                            samples in a neighborhood for a point to be a core point.
+        pca_dimensions (int | None): If an integer is provided, PCA will be applied to reduce
+                                     embeddings to this number of dimensions before clustering.
         metric (str): The distance metric to use for DBSCAN and silhouette score
                       (e.g., 'cosine', 'euclidean').
 
@@ -139,6 +142,24 @@ def clustering_author(background_corpus_df: pd.DataFrame,
         background_corpus_df['cluster_label'] = final_labels_for_df
         return background_corpus_df
 
+    # --- Optional: Apply PCA for dimensionality reduction ---
+    if pca_dimensions is not None and X.shape[1] > pca_dimensions:
+        from sklearn.decomposition import PCA
+        print(f"Applying PCA to reduce dimensions from {X.shape[1]} to {pca_dimensions}...")
+        pca = PCA(n_components=pca_dimensions, random_state=42)
+        X = pca.fit_transform(X)
+
+        # If a test set is provided, transform its embeddings using the same PCA model
+        if test_corpus_df is not None:
+            test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
+            if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[1] == pca.n_features_in_:
+                print(f"Transforming test set embeddings with the same PCA model...")
+                transformed_test_embeddings = pca.transform(test_embeddings_matrix)
+                # Update the test DataFrame's embedding column with the reduced embeddings
+                test_corpus_df[embedding_clm] = list(transformed_test_embeddings)
+            else:
+                print("Warning: Could not apply PCA to test set due to dimension mismatch or invalid data.")
+
     # For cosine metric, normalize embeddings to unit length.
     # This is standard practice as cosine similarity is equivalent to Euclidean
     # distance on L2-normalized vectors. DBSCAN's 'cosine' metric internally
@@ -148,6 +169,16 @@ def clustering_author(background_corpus_df: pd.DataFrame,
         print("Normalizing embeddings for cosine distance...")
         X = normalize(X, norm='l2', axis=1)
 
+        # Also normalize the test corpus embeddings if they exist
+        if test_corpus_df is not None:
+            print("Normalizing test corpus embeddings for cosine distance...")
+            test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
+            if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[0] > 0:
+                normalized_test_embeddings = normalize(test_embeddings_matrix, norm='l2', axis=1)
+                test_corpus_df[embedding_clm] = list(normalized_test_embeddings)
+            else:
+                print("Warning: Could not normalize test set embeddings due to invalid data.")
+
     if eps_values is None:
         if metric == 'cosine':
             #eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
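Taken together, the clustering_utils.py changes implement a fit-on-background, transform-on-test PCA step followed by L2 normalization before DBSCAN with the cosine metric. Below is a self-contained sketch of that pattern using scikit-learn's public API; the array shapes and the eps/min_samples values are illustrative stand-ins, not values from the repository.

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

rng = np.random.default_rng(0)
X_background = rng.normal(size=(200, 384))  # stand-in for stacked background-corpus embeddings
X_test = rng.normal(size=(20, 384))         # stand-in for test-corpus embeddings

pca_dimensions = 50
if pca_dimensions is not None and X_background.shape[1] > pca_dimensions:
    pca = PCA(n_components=pca_dimensions, random_state=42)
    X_background = pca.fit_transform(X_background)   # fit the projection on the background corpus only
    if X_test.shape[1] == pca.n_features_in_:        # guard against a dimension mismatch
        X_test = pca.transform(X_test)               # reuse the same fitted projection

# L2-normalize so that, for unit vectors u and v, ||u - v||^2 = 2 * (1 - cos(u, v)),
# i.e. cosine similarity and Euclidean distance induce the same neighborhoods.
X_background = normalize(X_background, norm="l2", axis=1)
X_test = normalize(X_test, norm="l2", axis=1)

labels = DBSCAN(eps=0.4, min_samples=5, metric="cosine").fit_predict(X_background)
print(np.unique(labels, return_counts=True))

Fitting PCA on the background corpus alone and only transforming the test embeddings, as the diff does, keeps the test set out of the learned projection and guarantees both sets live in the same reduced space.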