Milad Alshomary
committed on
Commit
·
ce8f806
1
Parent(s):
f310e45
updates
Browse files
- cluster_corpus.py +7 -0
- utils/clustering_utils.py +31 -0
cluster_corpus.py
CHANGED
|
@@ -72,6 +72,12 @@ def main():
|
|
| 72 |
default=None,
|
| 73 |
help="A list of specific eps values to test for DBSCAN. If not provided, a default range is used."
|
| 74 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
args = parser.parse_args()
|
| 77 |
|
|
@@ -107,6 +113,7 @@ def main():
|
|
| 107 |
embedding_clm=embedding_col_name,
|
| 108 |
eps_values=args.eps_values,
|
| 109 |
min_samples=args.min_samples,
|
|
|
|
| 110 |
metric=args.metric
|
| 111 |
)
|
| 112 |
|
|
|
|
| 72 |
default=None,
|
| 73 |
help="A list of specific eps values to test for DBSCAN. If not provided, a default range is used."
|
| 74 |
)
|
| 75 |
+
parser.add_argument(
|
| 76 |
+
"--pca_dimensions",
|
| 77 |
+
type=int,
|
| 78 |
+
default=None,
|
| 79 |
+
help="If provided, apply PCA to reduce embeddings to this number of dimensions before clustering."
|
| 80 |
+
)
|
| 81 |
|
| 82 |
args = parser.parse_args()
|
| 83 |
|
|
|
|
| 113 |
embedding_clm=embedding_col_name,
|
| 114 |
eps_values=args.eps_values,
|
| 115 |
min_samples=args.min_samples,
|
| 116 |
+
pca_dimensions=args.pca_dimensions,
|
| 117 |
metric=args.metric
|
| 118 |
)
|
| 119 |
|
utils/clustering_utils.py
CHANGED
|
@@ -66,6 +66,7 @@ def clustering_author(background_corpus_df: pd.DataFrame,
|
|
| 66 |
embedding_clm: str = 'style_embedding',
|
| 67 |
eps_values: List[float] = None,
|
| 68 |
min_samples: int = 5,
|
|
|
|
| 69 |
metric: str = 'cosine') -> pd.DataFrame:
|
| 70 |
"""
|
| 71 |
Performs DBSCAN clustering on embeddings in a DataFrame.
|
|
@@ -83,6 +84,8 @@ def clustering_author(background_corpus_df: pd.DataFrame,
|
|
| 83 |
For 'euclidean', scale depends on embedding magnitudes.
|
| 84 |
min_samples (int): DBSCAN `min_samples` parameter. Minimum number of
|
| 85 |
samples in a neighborhood for a point to be a core point.
|
|
|
|
|
|
|
| 86 |
metric (str): The distance metric to use for DBSCAN and silhouette score
|
| 87 |
(e.g., 'cosine', 'euclidean').
|
| 88 |
|
|
@@ -139,6 +142,24 @@ def clustering_author(background_corpus_df: pd.DataFrame,
|
|
| 139 |
background_corpus_df['cluster_label'] = final_labels_for_df
|
| 140 |
return background_corpus_df
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
# For cosine metric, normalize embeddings to unit length.
|
| 143 |
# This is standard practice as cosine similarity is equivalent to Euclidean
|
| 144 |
# distance on L2-normalized vectors. DBSCAN's 'cosine' metric internally
|
|
@@ -148,6 +169,16 @@ def clustering_author(background_corpus_df: pd.DataFrame,
|
|
| 148 |
print("Normalizing embeddings for cosine distance...")
|
| 149 |
X = normalize(X, norm='l2', axis=1)
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
if eps_values is None:
|
| 152 |
if metric == 'cosine':
|
| 153 |
#eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
|
|
|
|
| 66 |
embedding_clm: str = 'style_embedding',
|
| 67 |
eps_values: List[float] = None,
|
| 68 |
min_samples: int = 5,
|
| 69 |
+
pca_dimensions: int | None = None,
|
| 70 |
metric: str = 'cosine') -> pd.DataFrame:
|
| 71 |
"""
|
| 72 |
Performs DBSCAN clustering on embeddings in a DataFrame.
|
|
|
|
| 84 |
For 'euclidean', scale depends on embedding magnitudes.
|
| 85 |
min_samples (int): DBSCAN `min_samples` parameter. Minimum number of
|
| 86 |
samples in a neighborhood for a point to be a core point.
|
| 87 |
+
pca_dimensions (int | None): If an integer is provided, PCA will be applied to reduce
|
| 88 |
+
embeddings to this number of dimensions before clustering.
|
| 89 |
metric (str): The distance metric to use for DBSCAN and silhouette score
|
| 90 |
(e.g., 'cosine', 'euclidean').
|
| 91 |
|
|
|
|
| 142 |
background_corpus_df['cluster_label'] = final_labels_for_df
|
| 143 |
return background_corpus_df
|
| 144 |
|
| 145 |
+
# --- Optional: Apply PCA for dimensionality reduction ---
|
| 146 |
+
if pca_dimensions is not None and X.shape[1] > pca_dimensions:
|
| 147 |
+
from sklearn.decomposition import PCA
|
| 148 |
+
print(f"Applying PCA to reduce dimensions from {X.shape[1]} to {pca_dimensions}...")
|
| 149 |
+
pca = PCA(n_components=pca_dimensions, random_state=42)
|
| 150 |
+
X = pca.fit_transform(X)
|
| 151 |
+
|
| 152 |
+
# If a test set is provided, transform its embeddings using the same PCA model
|
| 153 |
+
if test_corpus_df is not None:
|
| 154 |
+
test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
|
| 155 |
+
if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[1] == pca.n_features_in_:
|
| 156 |
+
print(f"Transforming test set embeddings with the same PCA model...")
|
| 157 |
+
transformed_test_embeddings = pca.transform(test_embeddings_matrix)
|
| 158 |
+
# Update the test DataFrame's embedding column with the reduced embeddings
|
| 159 |
+
test_corpus_df[embedding_clm] = list(transformed_test_embeddings)
|
| 160 |
+
else:
|
| 161 |
+
print("Warning: Could not apply PCA to test set due to dimension mismatch or invalid data.")
|
| 162 |
+
|
| 163 |
# For cosine metric, normalize embeddings to unit length.
|
| 164 |
# This is standard practice as cosine similarity is equivalent to Euclidean
|
| 165 |
# distance on L2-normalized vectors. DBSCAN's 'cosine' metric internally
|
|
|
|
| 169 |
print("Normalizing embeddings for cosine distance...")
|
| 170 |
X = normalize(X, norm='l2', axis=1)
|
| 171 |
|
| 172 |
+
# Also normalize the test corpus embeddings if they exist
|
| 173 |
+
if test_corpus_df is not None:
|
| 174 |
+
print("Normalizing test corpus embeddings for cosine distance...")
|
| 175 |
+
test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
|
| 176 |
+
if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[0] > 0:
|
| 177 |
+
normalized_test_embeddings = normalize(test_embeddings_matrix, norm='l2', axis=1)
|
| 178 |
+
test_corpus_df[embedding_clm] = list(normalized_test_embeddings)
|
| 179 |
+
else:
|
| 180 |
+
print("Warning: Could not normalize test set embeddings due to invalid data.")
|
| 181 |
+
|
| 182 |
if eps_values is None:
|
| 183 |
if metric == 'cosine':
|
| 184 |
#eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
|