Milad Alshomary committed
Commit 3269340 · 1 Parent(s): ce8f806

Changes to work with Reddit data

app.py CHANGED
@@ -26,14 +26,14 @@ def load_config(path="config/config.yaml"):
 cfg = load_config()
 
 
-download_file_override(cfg.get('interp_space_url'), cfg.get('interp_space_path'))
-download_file_override(cfg.get('instances_to_explain_url'), cfg.get('instances_to_explain_path'))
-download_file_override(cfg.get('gram2vec_feats_url'), cfg.get('gram2vec_feats_path'))
-download_file_override(cfg.get('embeddings_cache_url'), cfg.get('embeddings_cache_path'))
-download_file_override(cfg.get('zoom_cache_url'), cfg.get('zoom_cache_path'))
-download_file_override(cfg.get('region_cache_url'), cfg.get('region_cache_path'))
-download_file_override(cfg.get('tsne_cache_url'), cfg.get('tsne_cache_path'))
-download_file_override(cfg.get('llm_style_features_cache_url'), cfg.get('llm_style_features_cache_path'))
+# download_file_override(cfg.get('interp_space_url'), cfg.get('interp_space_path'))
+# download_file_override(cfg.get('instances_to_explain_url'), cfg.get('instances_to_explain_path'))
+# download_file_override(cfg.get('gram2vec_feats_url'), cfg.get('gram2vec_feats_path'))
+# download_file_override(cfg.get('embeddings_cache_url'), cfg.get('embeddings_cache_path'))
+# download_file_override(cfg.get('zoom_cache_url'), cfg.get('zoom_cache_path'))
+# download_file_override(cfg.get('region_cache_url'), cfg.get('region_cache_path'))
+# download_file_override(cfg.get('tsne_cache_url'), cfg.get('tsne_cache_path'))
+# download_file_override(cfg.get('llm_style_features_cache_url'), cfg.get('llm_style_features_cache_path'))
 
 from utils.visualizations import *
 from utils.llm_feat_utils import *
@@ -64,8 +64,10 @@ def validate_ground_truth(gt1, gt2, gt3):
 def app(share=False):
 instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
 
-interp = load_interp_space(cfg)
-clustered_authors_df = interp['clustered_authors_df']
+#interp = load_interp_space(cfg)
+#clustered_authors_df = interp['clustered_authors_df']
+clustered_authors_df = pickle.load(open(cfg['background_authors_df_path'], 'rb'))
+
 
 with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
 # ── Big Centered Title ──────────────────────────────────────────
@@ -227,7 +229,6 @@ def app(share=False):
 load_button = gr.Button("Load Task & Generate Embeddings")
 
 # ── HTML outputs for author texts ───────────
-default_outputs = load_instance(0, instances)
 #dont need defaults since they are loaded only on click of the load button
 header = gr.HTML()
 mystery = gr.HTML()
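Both app.py and precompute_caches.py now read the clustered background authors directly from the pickle referenced by the new background_authors_df_path config key instead of calling load_interp_space; note that both call sites rely on pickle being imported in those modules. A minimal sketch of that loading path is below. It assumes load_config is a plain YAML loader and that the pickle holds the pandas DataFrame written by cluster_corpus.py; the inspected column names are illustrative, not confirmed by the diff.

import pickle
import yaml

def load_config(path="config/config.yaml"):
    # Assumed implementation: the repo's load_config reads the YAML config shown further below.
    with open(path) as f:
        return yaml.safe_load(f)

cfg = load_config()

# Load the precomputed background-author clusters (replaces load_interp_space).
with open(cfg['background_authors_df_path'], 'rb') as f:
    clustered_authors_df = pickle.load(f)

# Illustrative inspection; the exact columns depend on cluster_corpus.py's output.
print(clustered_authors_df.columns)
print(clustered_authors_df['cluster_label'].value_counts().head())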
cluster_corpus.py CHANGED
@@ -85,6 +85,7 @@ def main():
 corpus_df = load_corpus(args.corpus_path)
 test_corpus_df = load_corpus(args.test_corpus_path)
 
+#print(corpus_df)
 # 2. Generate style embeddings
 print(f"\nGenerating style embeddings with model: {args.model_name}")
 # The function returns two dataframes, we are only interested in the first one here.
@@ -117,6 +118,9 @@ def main():
 metric=args.metric
 )
 
+# remove authors with cluster label == -1
+clustered_df = clustered_df[clustered_df['cluster_label'] != -1]
+
 # 4. Save the results
 output_dir = os.path.dirname(args.output_path)
 if output_dir:
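The new filtering step drops DBSCAN noise points (label -1) before the clustered corpus is saved. A small self-contained illustration, using the cluster_label column name from the diff and toy data:

import pandas as pd

clustered_df = pd.DataFrame({
    "authorID": ["a1", "a2", "a3", "a4"],
    "cluster_label": [0, 1, -1, 0],   # -1 marks DBSCAN outliers
})

# remove authors with cluster label == -1 (as in the new code)
clustered_df = clustered_df[clustered_df['cluster_label'] != -1]
print(clustered_df)   # a1, a2, a4 remain; the outlier a3 is dropped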
config/config.yaml CHANGED
@@ -1,22 +1,31 @@
 # config.yaml
-instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
-instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
+#instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
+#instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
+instances_to_explain_path: "./datasets/reddit_explanation_sample.json"
+
 interp_space_path: "./datasets/sentence_luar_interp_space_2_35/"
 interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
+
 gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
 gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
 
 embeddings_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/embeddings_cache.zip?download=true"
 embeddings_cache_path: "./datasets/embeddings_cache/"
+
 zoom_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/zoom_cache.zip?download=true"
 zoom_cache_path: "./datasets/zoom_cache/"
+
 region_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/region_cache.zip?download=true"
 region_cache_path: "./datasets/region_cache/"
+
 tsne_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/tsne_cache.pkl?download=true"
 tsne_cache_path: "./datasets/tsne_cache.pkl"
+
 llm_style_features_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/feature_spans_cache.zip?download=true"
 llm_style_features_cache_path: "./datasets/feature_spans_cache/"
 
+background_authors_df_path: "./datasets/reddit_clustered_authors.pkl"
+
 style_feat_clm: "llm_tfidf_weights"
 top_k: 10
 only_llm_feats: false
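With the download_file_override calls commented out in app.py, the new Reddit sample file and the clustered-authors pickle are expected to exist locally (neither has a paired *_url entry). A quick sanity check; the existence loop below is illustrative, not part of the repo:

import os
import yaml

with open("config/config.yaml") as f:
    cfg = yaml.safe_load(f)

# These two entries no longer have a *_url counterpart, so nothing downloads them automatically.
for key in ("instances_to_explain_path", "background_authors_df_path"):
    path = cfg.get(key)
    status = "found" if path and os.path.exists(path) else "MISSING"
    print(f"{key}: {path} [{status}]")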
precompute_caches.py CHANGED
@@ -45,8 +45,9 @@ def precompute_all_caches(
 print(f"Configuration loaded from {config_path}")
 print(f"config : \n{cfg}")
 instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
-interp = load_interp_space(cfg)
-clustered_authors_df = interp['clustered_authors_df']
+# interp = load_interp_space(cfg)
+# clustered_authors_df = interp['clustered_authors_df']
+clustered_authors_df = pickle.load(open(cfg['background_authors_df_path'], 'rb'))
 
 if instances_to_process is None:
 instances_to_process = instance_ids
prepare_data.py ADDED
@@ -0,0 +1,140 @@
+import json
+import argparse
+import csv
+import sys
+import copy
+import os
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import glob
+from sklearn.preprocessing import minmax_scale
+import random
+
+
+import pickle
+
+import json
+import pandas as pd
+
+def sample_ds(input_file, output_file, num_insts=10000, min_num_text_per_inst=0, max_num_text_per_inst=3):
+    """
+    sample_ds('/mnt/swordfish-pool2/nikhil/raw_all/test_queries.jsonl', '/mnt/swordfish-pool2/milad/hiatus-data/reddit_cluster_test.pkl',
+        num_insts=10000,
+        min_num_text_per_inst=3,
+        max_num_text_per_inst=10)
+
+    sample_ds('/mnt/swordfish-pool2/nikhil/raw_all/data.jsonl', '/mnt/swordfish-pool2/milad/hiatus-data/reddit_cluster_training.pkl',
+        num_insts=10000,
+        min_num_text_per_inst=3,
+        max_num_text_per_inst=10)
+    """
+    f = open(input_file)
+    out_list = []
+    for i in range(num_insts):
+        json_obj = json.loads(f.readline())
+        if len(json_obj['syms']) < min_num_text_per_inst:
+            continue
+
+        out_list.append({
+            'fullText': json_obj['syms'][:max_num_text_per_inst],
+            'authorID': json_obj['author_id']
+        })
+    df = pd.DataFrame(out_list)
+    df.to_pickle(output_file)
+
+def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents_per_author=4):
+
+    df = pd.read_pickle(open(input_path, 'rb'))
+    output_objs = []
+
+    for idx, row in df.iterrows():
+
+        # Get the current author's documents
+        query_author_df = df[df.authorID == row['authorID']]
+        # split the author's documents into two: query and correct author
+        author_documents = query_author_df.fullText.tolist()[0]
+
+        if len(author_documents) < num_documents_per_author * 2:
+            continue
+
+        query_documents = author_documents[:num_documents_per_author]
+        correct_documents = author_documents[num_documents_per_author:]
+
+
+        # Sample two *other* authors
+        other_authors_df = df[df.authorID != row['authorID']]
+        other_two_authors = other_authors_df.sample(2, random_state=random_seed)
+
+        output_objs.append({
+            "Q_authorID": str(row["authorID"]),
+            "Q_fullText": query_documents,
+            "a0_authorID": str(other_two_authors.iloc[0]["authorID"]),
+            "a0_fullText": other_two_authors.iloc[0]["fullText"][:num_documents_per_author],
+            "a1_authorID": str(other_two_authors.iloc[1]["authorID"]),
+            "a1_fullText": other_two_authors.iloc[1]["fullText"][:num_documents_per_author],
+            "a2_authorID": str(row["authorID"]) + "_correct",
+            "a2_fullText": correct_documents,
+            "gt_idx": 2
+        })
+        random_seed += 1  # Increment seed to get different authors for the next task
+        if len(output_objs) >= num_instances:
+            break
+
+    return output_objs
+
+
+def get_iarapa_pilot_data(input_path):
+    for data_point in glob.glob(input_path + '*/'):
+        candidates_file = list(glob.glob(data_point + '/data/*_candidates.jsonl'))[0]
+        queries_file = list(glob.glob(data_point + '/data/*_queries.jsonl'))[0]
+        grount_truth_file = list(glob.glob(data_point + '/groundtruth/*_groundtruth.npy'))[0]
+        q_labels_file = glob.glob(data_point + '/groundtruth/*_query-labels.txt')[0]
+        c_labels_file = glob.glob(data_point + '/groundtruth/*_candidate-labels.txt')[0]
+
+        candidates_df = pd.read_json(candidates_file, lines=True)
+        queries_df = pd.read_json(queries_file, lines=True)
+
+        queries_df['authorID'] = queries_df.authorIDs.apply(lambda x: x[0])
+        candidates_df['authorID'] = candidates_df.authorSetIDs.apply(lambda x: x[0])
+
+        queries_df = queries_df.groupby('authorID').agg({'fullText': lambda x: list(x)}).reset_index()
+        candidates_df = candidates_df.groupby('authorID').agg({'fullText': lambda x: list(x)}).reset_index()
+
+        ground_truth_assignment = np.load(open(grount_truth_file, 'rb'))
+        candidate_authors = [a[2:-3] for a in open(c_labels_file).read().split('\n')][:-1]
+        query_authors = [a[2:-3] for a in open(q_labels_file).read().split('\n')][:-1]
+
+        #print(ground_truth_assignment)
+        #print(candidate_authors)
+        #print(query_authors)
+        yield query_authors, candidate_authors, queries_df, candidates_df, ground_truth_assignment
+
+def main():
+    """
+    Main entry point for the script.
+    """
+    parser = argparse.ArgumentParser(description="Prepare Reddit data for author attribution tasks.")
+    parser.add_argument("input_path", type=str, help="Path to the input pandas DataFrame pickle file.")
+    parser.add_argument("output_path", type=str, help="Path to save the output JSON file.")
+    parser.add_argument("--random_seed", type=int, default=123, help="Random seed for sampling.")
+    parser.add_argument("--num_docs", type=int, default=5, help="Number of documents per author for query and correct sets.")
+
+    args = parser.parse_args()
+
+    print(f"Processing data from: {args.input_path}")
+    output_data = get_reddit_data(
+        input_path=args.input_path,
+        random_seed=args.random_seed,
+        num_documents_per_author=args.num_docs
+    )
+
+    print(f"Saving {len(output_data)} tasks to: {args.output_path}")
+    with open(args.output_path, 'w') as f:
+        json.dump(output_data, f, indent=4)
+
+    print("Done.")
+
+if __name__ == "__main__":
+    main()
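For reference, each task emitted by get_reddit_data splits a query author's documents in half: the first half becomes the mystery text and the second half becomes the correct candidate, which is always placed at a2 (so gt_idx is always 2), with two other sampled authors as distractors. main() exposes --random_seed and --num_docs but not num_instances, so the default of 50 tasks applies. The object below is a placeholder illustration, not real data, and the invocation paths are examples only:

# Typical invocation (example paths):
#   python prepare_data.py ./datasets/reddit_cluster_test.pkl ./datasets/reddit_explanation_sample.json --num_docs 4
example_task = {
    "Q_authorID": "12345",
    "Q_fullText": ["query doc 1", "query doc 2", "query doc 3", "query doc 4"],
    "a0_authorID": "67890",            # distractor author 1
    "a0_fullText": ["distractor doc ..."],
    "a1_authorID": "13579",            # distractor author 2
    "a1_fullText": ["distractor doc ..."],
    "a2_authorID": "12345_correct",    # held-out half of the query author's documents
    "a2_fullText": ["held-out doc 1", "held-out doc 2"],
    "gt_idx": 2,                       # the correct candidate is always a2
}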
utils/clustering_utils.py CHANGED
@@ -7,6 +7,7 @@ from sklearn.metrics import silhouette_score
 from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
 from scipy.stats import pearsonr, ConstantInputWarning
 from typing import List, Dict, Any
+from tabulate import tabulate
 
 import json
 
@@ -99,6 +100,7 @@ def clustering_author(background_corpus_df: pd.DataFrame,
 
 embeddings_list = background_corpus_df[embedding_clm].tolist()
 
+
 X_list = []
 original_indices = [] # To map results back to the original DataFrame's indices
 
@@ -148,17 +150,23 @@ def clustering_author(background_corpus_df: pd.DataFrame,
 print(f"Applying PCA to reduce dimensions from {X.shape[1]} to {pca_dimensions}...")
 pca = PCA(n_components=pca_dimensions, random_state=42)
 X = pca.fit_transform(X)
+
+# Update the background_corpus_df with the transformed embeddings
+# This ensures subsequent centroid calculations use the reduced-dimension space.
+background_corpus_df[embedding_clm] = list(X)
 
 # If a test set is provided, transform its embeddings using the same PCA model
 if test_corpus_df is not None:
 test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
-if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[1] == pca.n_features_in_:
+if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[0] > 0 and test_embeddings_matrix.shape[1] == pca.n_features_in_:
 print(f"Transforming test set embeddings with the same PCA model...")
 transformed_test_embeddings = pca.transform(test_embeddings_matrix)
 # Update the test DataFrame's embedding column with the reduced embeddings
+#test_corpus_df.loc[:, embedding_clm] = list(transformed_test_embeddings)
 test_corpus_df[embedding_clm] = list(transformed_test_embeddings)
 else:
-print("Warning: Could not apply PCA to test set due to dimension mismatch or invalid data.")
+print(f"Warning: Could not apply PCA to test set. Test shape: {test_embeddings_matrix.shape}, PCA features: {pca.n_features_in_}")
+
 
 # For cosine metric, normalize embeddings to unit length.
 # This is standard practice as cosine similarity is equivalent to Euclidean
@@ -167,7 +175,10 @@ def clustering_author(background_corpus_df: pd.DataFrame,
 if metric == 'cosine':
 from sklearn.preprocessing import normalize
 print("Normalizing embeddings for cosine distance...")
-X = normalize(X, norm='l2', axis=1)
+X_normalized = normalize(X, norm='l2', axis=1)
+# Update the background_corpus_df with the normalized embeddings
+background_corpus_df[embedding_clm] = list(X_normalized)
+X = X_normalized # Use the normalized data for clustering
 
 # Also normalize the test corpus embeddings if they exist
 if test_corpus_df is not None:
@@ -178,11 +189,11 @@ def clustering_author(background_corpus_df: pd.DataFrame,
 test_corpus_df[embedding_clm] = list(normalized_test_embeddings)
 else:
 print("Warning: Could not normalize test set embeddings due to invalid data.")
-
+
 if eps_values is None:
 if metric == 'cosine':
 #eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
-eps_values = np.arange(0.01, 0.3, 0.01)
+eps_values = np.arange(0.01, 0.2, 0.01)
 else: # 'euclidean' or other
 if X.shape[0] > 1:
 # For Euclidean, eps depends on the scale of the data.
@@ -201,6 +212,7 @@ def clustering_author(background_corpus_df: pd.DataFrame,
 best_score = -1.001
 best_labels = None
 best_eps = None
+results_for_table = []
 
 # This loop now lives in `clustering_author` to have access to the full DataFrame for evaluation.
 for eps in eps_values:
@@ -211,6 +223,8 @@ def clustering_author(background_corpus_df: pd.DataFrame,
 current_labels = db.fit_predict(X)
 
 # --- Evaluation Step 1: Silhouette Score ---
+num_clusters = len(set(current_labels) - {-1})
+num_outliers = np.sum(current_labels == -1)
 score = _calculate_silhouette_score(X, current_labels, metric)
 if score is not None:
 print(f" - Silhouette Score: {score:.4f}")
@@ -236,9 +250,9 @@ def clustering_author(background_corpus_df: pd.DataFrame,
 
 # --- Evaluation Step 3: Distance Preservation on Test Corpus (if provided) ---
 if test_corpus_df is not None:
+test_correlation = None
 # We need the centroids from the current clustering of the background corpus
 centroids = _compute_cluster_centroids(temp_df[temp_df['cluster_label'] != -1], embedding_clm, 'cluster_label')
-
 test_correlation = evaluate_test_set_distance_preservation(test_corpus_df, centroids, embedding_clm)
 if test_correlation is not None:
 print(f" - Test Set Distance Preservation (Pearson r): {test_correlation:.4f}")
@@ -246,7 +260,14 @@ def clustering_author(background_corpus_df: pd.DataFrame,
 print(" - Test Set Distance Preservation (Pearson r): N/A (not enough test data or clusters)")
 
 print('Eps {}, #clusters {}, solihouette {}, Pearson {}'.format(eps, len(set(current_labels) - {-1}), score, test_correlation))
+results_for_table.append([f"{eps:.3f}", f"{score:.4f}" if score is not None else "N/A", f"{test_correlation:.4f}" if test_correlation is not None else "N/A", num_clusters, num_outliers])
 
+# --- Print Final Summary Table ---
+print("\n\n--- Clustering Run Summary ---")
+headers = ["Epsilon (eps)", "Silhouette Score", "Test Dist. Preserv.", "# Clusters", "# Outliers"]
+print(tabulate(results_for_table, headers=headers, tablefmt="grid"))
+print("----------------------------\n")
+
 if best_labels is not None:
 num_found_clusters = len(set(best_labels) - {-1})
 print(f"\n--- Best Clustering Result ---")
@@ -450,6 +471,8 @@ def evaluate_test_set_distance_preservation(
 
 # 2. Project test embeddings into the centroid space and get new distances
 projected_embeddings_matrix = _project_to_centroid_space(test_embeddings_matrix, centroids_map)
+
+
 if projected_embeddings_matrix.ndim != 2 or projected_embeddings_matrix.shape[1] < 2:
 return None # Projection failed or resulted in a space with <2 dimensions
 
utils/ui.py CHANGED
@@ -91,7 +91,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
 if mode == "Predefined HRS Task":
 iid = int(iid.replace('Task ', ''))
 data = instances[iid]
-predicted_author = data['latent_rank'][0]
 ground_truth_author = 100#data['gt_idx']
 mystery_txt = data['Q_fullText']
 c1_txt = data['a0_fullText']
@@ -100,7 +99,7 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
 candidate_texts = [c1_txt, c2_txt, c3_txt]
 
 #create a dataframe of the task authors
-task_authors_df = instance_to_df(instances[iid], predicted_author=predicted_author, ground_truth_author=ground_truth_author)
+task_authors_df = instance_to_df(instances[iid], predicted_author=None, ground_truth_author=ground_truth_author)
 print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
 else:
 header_html = "<h3>Custom Uploaded Task</h3>"
@@ -136,10 +135,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
 task_authors_df['g2v_vector'] = task_authors_g2v
 print(f"Gram2Vec feature generation complete")
 
-if mode != "Predefined HRS Task":
-# Computing predicted author by checking pairwise cosine similarity over luar embeddings
-col_name = f'{model_name.split("/")[-1]}_style_embedding'
-predicted_author = compute_predicted_author(task_authors_df, col_name)
 
 #generating html for the task
 header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)
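The removed branch computed predicted_author for custom tasks via pairwise cosine similarity over the LUAR-style embeddings, per the deleted comment. The sketch below is a rough reconstruction of that idea for context only, not the actual compute_predicted_author implementation; the row layout and single-vector embedding column are assumptions:

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def predict_author_by_cosine(task_authors_df: pd.DataFrame, col_name: str) -> int:
    # Assumes row 0 is the mystery (query) author and rows 1..n are candidates,
    # each storing one embedding vector in `col_name`.
    embs = np.vstack(task_authors_df[col_name].tolist())
    sims = cosine_similarity(embs[0:1], embs[1:])[0]
    return int(np.argmax(sims))  # index of the most similar candidate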
utils/visualizations.py CHANGED
@@ -389,9 +389,9 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
 print(background_authors_embeddings_df.columns)
 print("Generating cluster visualization")
 iid = int(iid)
-interp = load_interp_space(cfg)
+#interp = load_interp_space(cfg)
 # dim2lat = interp['dimension_to_latent']
-style_names = interp['dimension_to_style']
+#style_names = interp['dimension_to_style']
 # bg_emb = np.array(interp['author_embedding'])
 # print(f"bg_emb shape: {bg_emb.shape}")
 #replace with cached embedddings
@@ -544,7 +544,7 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
 return (
 fig,
 # update(choices=display_clusters, value=display_clusters[cluster_label_query]),
-style_names,
+None,
 bg_proj, # Return background points
 bg_ids, # Return background labels
 background_authors_embeddings_df, # Return the DataFrame for zoom handling