Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Sleeping

App Files Files Community

Milad Alshomary committed on Oct 21

Commit

3269340

1 Parent(s): ce8f806

changes to work with reddit data

Browse files

Files changed (8) hide show

app.py +12 -11
cluster_corpus.py +4 -0
config/config.yaml +11 -2
precompute_caches.py +3 -2
prepare_data.py +140 -0
utils/clustering_utils.py +29 -6
utils/ui.py +1 -6
utils/visualizations.py +3 -3

app.py CHANGED Viewed

@@ -26,14 +26,14 @@ def load_config(path="config/config.yaml"):
 cfg = load_config()
-download_file_override(cfg.get('interp_space_url'), cfg.get('interp_space_path'))
-download_file_override(cfg.get('instances_to_explain_url'), cfg.get('instances_to_explain_path'))
-download_file_override(cfg.get('gram2vec_feats_url'), cfg.get('gram2vec_feats_path'))
-download_file_override(cfg.get('embeddings_cache_url'), cfg.get('embeddings_cache_path'))
-download_file_override(cfg.get('zoom_cache_url'), cfg.get('zoom_cache_path'))
-download_file_override(cfg.get('region_cache_url'), cfg.get('region_cache_path'))
-download_file_override(cfg.get('tsne_cache_url'), cfg.get('tsne_cache_path'))
-download_file_override(cfg.get('llm_style_features_cache_url'), cfg.get('llm_style_features_cache_path'))
 from utils.visualizations import *
 from utils.llm_feat_utils import *
@@ -64,8 +64,10 @@ def validate_ground_truth(gt1, gt2, gt3):
 def app(share=False):
     instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
-    interp      = load_interp_space(cfg)
-    clustered_authors_df = interp['clustered_authors_df']
     with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
         # ── Big Centered Title ──────────────────────────────────────────
@@ -227,7 +229,6 @@ def app(share=False):
         load_button = gr.Button("Load Task & Generate Embeddings")
         # ── HTML outputs for author texts ───────────────────────────
-        default_outputs = load_instance(0, instances)
         #dont need defaults since they are loaded only on click of the load button
         header  = gr.HTML()
         mystery = gr.HTML()

 cfg = load_config()
+# download_file_override(cfg.get('interp_space_url'), cfg.get('interp_space_path'))
+# download_file_override(cfg.get('instances_to_explain_url'), cfg.get('instances_to_explain_path'))
+# download_file_override(cfg.get('gram2vec_feats_url'), cfg.get('gram2vec_feats_path'))
+# download_file_override(cfg.get('embeddings_cache_url'), cfg.get('embeddings_cache_path'))
+# download_file_override(cfg.get('zoom_cache_url'), cfg.get('zoom_cache_path'))
+# download_file_override(cfg.get('region_cache_url'), cfg.get('region_cache_path'))
+# download_file_override(cfg.get('tsne_cache_url'), cfg.get('tsne_cache_path'))
+# download_file_override(cfg.get('llm_style_features_cache_url'), cfg.get('llm_style_features_cache_path'))
 from utils.visualizations import *
 from utils.llm_feat_utils import *
 def app(share=False):
     instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
+    #interp      = load_interp_space(cfg)
+    #clustered_authors_df = interp['clustered_authors_df']
+    clustered_authors_df = pickle.load(open(cfg['background_authors_df_path'], 'rb'))
     with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
         # ── Big Centered Title ──────────────────────────────────────────
         load_button = gr.Button("Load Task & Generate Embeddings")
         # ── HTML outputs for author texts ───────────────────────────
         #dont need defaults since they are loaded only on click of the load button
         header  = gr.HTML()
         mystery = gr.HTML()

cluster_corpus.py CHANGED Viewed

@@ -85,6 +85,7 @@ def main():
     corpus_df = load_corpus(args.corpus_path)
     test_corpus_df = load_corpus(args.test_corpus_path)
     # 2. Generate style embeddings
     print(f"\nGenerating style embeddings with model: {args.model_name}")
     # The function returns two dataframes, we are only interested in the first one here.
@@ -117,6 +118,9 @@ def main():
         metric=args.metric
     )
     # 4. Save the results
     output_dir = os.path.dirname(args.output_path)
     if output_dir:

     corpus_df = load_corpus(args.corpus_path)
     test_corpus_df = load_corpus(args.test_corpus_path)
+    #print(corpus_df)
     # 2. Generate style embeddings
     print(f"\nGenerating style embeddings with model: {args.model_name}")
     # The function returns two dataframes, we are only interested in the first one here.
         metric=args.metric
     )
+    # remove authors with cluster label == -1
+    clustered_df = clustered_df[clustered_df['cluster_label'] != -1]
     # 4. Save the results
     output_dir = os.path.dirname(args.output_path)
     if output_dir:

config/config.yaml CHANGED Viewed

@@ -1,22 +1,31 @@
 # config.yaml
-instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
-instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
 interp_space_path:    "./datasets/sentence_luar_interp_space_2_35/"
 interp_space_url:    "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
 gram2vec_feats_path:      "./datasets/gram2vec_feats.csv"
 gram2vec_feats_url:      "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
 embeddings_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/embeddings_cache.zip?download=true"
 embeddings_cache_path: "./datasets/embeddings_cache/"
 zoom_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/zoom_cache.zip?download=true"
 zoom_cache_path: "./datasets/zoom_cache/"
 region_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/region_cache.zip?download=true"
 region_cache_path: "./datasets/region_cache/"
 tsne_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/tsne_cache.pkl?download=true"
 tsne_cache_path: "./datasets/tsne_cache.pkl"
 llm_style_features_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/feature_spans_cache.zip?download=true"
 llm_style_features_cache_path: "./datasets/feature_spans_cache/"
 style_feat_clm:       "llm_tfidf_weights"
 top_k:                10
 only_llm_feats:       false

 # config.yaml
+#instances_to_explain_path: "./datasets/hrs_explanations_luar_clusters_2_35_balanced.json"
+#instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/hrs_explanations_luar_clusters_2_35_balanced.json?download=true"
+instances_to_explain_path: "./datasets/reddit_explanation_sample.json"
 interp_space_path:    "./datasets/sentence_luar_interp_space_2_35/"
 interp_space_url:    "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/sentence_luar_interp_space_2_35.zip?download=true"
 gram2vec_feats_path:      "./datasets/gram2vec_feats.csv"
 gram2vec_feats_url:      "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"
 embeddings_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/embeddings_cache.zip?download=true"
 embeddings_cache_path: "./datasets/embeddings_cache/"
 zoom_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/zoom_cache.zip?download=true"
 zoom_cache_path: "./datasets/zoom_cache/"
 region_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/region_cache.zip?download=true"
 region_cache_path: "./datasets/region_cache/"
 tsne_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/tsne_cache.pkl?download=true"
 tsne_cache_path: "./datasets/tsne_cache.pkl"
 llm_style_features_cache_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/feature_spans_cache.zip?download=true"
 llm_style_features_cache_path: "./datasets/feature_spans_cache/"
+background_authors_df_path: "./datasets/reddit_clustered_authors.pkl"
 style_feat_clm:       "llm_tfidf_weights"
 top_k:                10
 only_llm_feats:       false

precompute_caches.py CHANGED Viewed

@@ -45,8 +45,9 @@ def precompute_all_caches(
     print(f"Configuration loaded from {config_path}")
     print(f"config : \n{cfg}")
     instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
-    interp = load_interp_space(cfg)
-    clustered_authors_df = interp['clustered_authors_df']
     if instances_to_process is None:
         instances_to_process = instance_ids

     print(f"Configuration loaded from {config_path}")
     print(f"config : \n{cfg}")
     instances, instance_ids = get_instances(cfg['instances_to_explain_path'])
+    # interp = load_interp_space(cfg)
+    # clustered_authors_df = interp['clustered_authors_df']
+    clustered_authors_df = pickle.load(open(cfg['background_authors_df_path'], 'rb'))
     if instances_to_process is None:
         instances_to_process = instance_ids

prepare_data.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import json
+import argparse
+import csv
+import sys
+import copy
+import os
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import glob
+from sklearn.preprocessing import minmax_scale
+import random
+import pickle
+import json
+import pandas as pd
def sample_ds(input_file, output_file, num_insts=10000, min_num_text_per_inst=0, max_num_text_per_inst=3):
    """Sample author records from a JSON-lines file into a pickled DataFrame.

    At most the first ``num_insts`` lines of ``input_file`` are examined
    (skipped records still count towards that limit). Every non-blank line
    must be a JSON object with a ``syms`` list and an ``author_id`` value.
    Records with fewer than ``min_num_text_per_inst`` texts are dropped; kept
    records are truncated to their first ``max_num_text_per_inst`` texts.

    The result is a DataFrame with the columns ``fullText`` (list of texts)
    and ``authorID`` that is written to ``output_file`` via
    ``DataFrame.to_pickle``. Nothing is returned.

    Example invocations (paths from the original author's environment):

    sample_ds('/mnt/swordfish-pool2/nikhil/raw_all/test_queries.jsonl', '/mnt/swordfish-pool2/milad/hiatus-data/reddit_cluster_test.pkl',
          num_insts=10000,
          min_num_text_per_inst=3,
          max_num_text_per_inst=10)
    sample_ds('/mnt/swordfish-pool2/nikhil/raw_all/data.jsonl', '/mnt/swordfish-pool2/milad/hiatus-data/reddit_cluster_training.pkl',
          num_insts=10000,
          min_num_text_per_inst=3,
          max_num_text_per_inst=10)
    """
    out_list = []
    # The context manager guarantees the input handle is released, and
    # iterating the file stops cleanly at EOF when it holds fewer than
    # `num_insts` lines (readline() would hand '' to json.loads and crash).
    with open(input_file, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            if line_no >= num_insts:
                break
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            json_obj = json.loads(line)
            if len(json_obj['syms']) < min_num_text_per_inst:
                continue
            out_list.append({
                'fullText': json_obj['syms'][:max_num_text_per_inst],
                'authorID': json_obj['author_id']
            })
    # Naming the columns keeps the schema stable even when nothing was kept.
    df = pd.DataFrame(out_list, columns=['fullText', 'authorID'])
    df.to_pickle(output_file)
def get_reddit_data(input_path, random_seed=123, num_instances=50, num_documents_per_author=4):
    """Build author-attribution tasks from a pickled author DataFrame.

    The pickle at ``input_path`` must hold a DataFrame with the columns
    ``authorID`` and ``fullText`` (a list of documents per row). Rows are
    visited in order; an author with fewer than
    ``2 * num_documents_per_author`` documents is skipped as a query author
    (but may still be drawn as a distractor). For every remaining author one
    task dict is produced:

    * ``Q_*``  - the first ``num_documents_per_author`` documents (the query),
    * ``a0_*`` / ``a1_*`` - two distractor rows sampled from the other
      authors, truncated to ``num_documents_per_author`` documents,
    * ``a2_*`` - the query author's remaining documents, with the author ID
      suffixed by ``"_correct"``,
    * ``gt_idx`` - always ``2``, the index of the correct candidate.

    Sampling is seeded with ``random_seed``, which is incremented after every
    emitted task so that successive tasks draw different distractors while
    the whole run stays reproducible.

    Returns a list of at most ``num_instances`` task dicts (empty when
    ``num_instances`` is not positive).

    Raises ``ValueError`` when a task has to be built but fewer than two rows
    of other authors are available to sample distractors from.
    """
    if num_instances <= 0:
        # The size limit below is only checked after appending, so a
        # non-positive limit has to be handled before the loop.
        return []
    # Handing the path to pandas lets it open *and close* the file itself.
    df = pd.read_pickle(input_path)
    output_objs = []
    for _, row in df.iterrows():
        author_id = row['authorID']
        # Documents of the first row that belongs to the current author
        author_documents = df[df.authorID == author_id].fullText.tolist()[0]
        if len(author_documents) < num_documents_per_author * 2:
            continue
        # split the author's documents into two: query and correct author
        query_documents = author_documents[:num_documents_per_author]
        correct_documents = author_documents[num_documents_per_author:]
        # Sample two *other* authors
        other_authors_df = df[df.authorID != author_id]
        if len(other_authors_df) < 2:
            raise ValueError(
                "Need at least two rows from other authors to sample distractors, "
                f"found {len(other_authors_df)}."
            )
        other_two_authors = other_authors_df.sample(2, random_state=random_seed)
        first_other = other_two_authors.iloc[0]
        second_other = other_two_authors.iloc[1]
        output_objs.append({
            "Q_authorID":  str(author_id),
            "Q_fullText": query_documents,
            "a0_authorID":  str(first_other["authorID"]),
            "a0_fullText": first_other["fullText"][:num_documents_per_author],
            "a1_authorID":  str(second_other["authorID"]),
            "a1_fullText": second_other["fullText"][:num_documents_per_author],
            "a2_authorID": str(author_id) + "_correct",
            "a2_fullText": correct_documents,
            "gt_idx": 2
        })
        random_seed += 1 # Increment seed to get different authors for the next task
        if len(output_objs) >= num_instances:
            break
    return output_objs
def get_iarapa_pilot_data(input_path):
    """Yield the query/candidate data of every pilot data point under a path.

    Every directory matching ``input_path + '*/'`` is treated as one data
    point (so ``input_path`` normally ends with a path separator) and must
    contain:

    * ``data/*_candidates.jsonl`` and ``data/*_queries.jsonl`` (JSON lines),
    * ``groundtruth/*_groundtruth.npy``,
    * ``groundtruth/*_query-labels.txt`` and
      ``groundtruth/*_candidate-labels.txt``.

    Data points are visited in sorted path order so that the output is
    reproducible. For each one a 5-tuple is yielded:
    ``(query_authors, candidate_authors, queries_df, candidates_df,
    ground_truth_assignment)``, where the two DataFrames have one row per
    ``authorID`` with that author's ``fullText`` values collected in a list.

    Raises ``IndexError`` when one of the expected files is missing from a
    data point directory.
    """
    def _read_author_labels(path):
        # Each line carries 2 leading and 3 trailing wrapper characters
        # around the author ID - presumably a tuple repr such as "('id',)";
        # TODO confirm against the data files. The final split element (the
        # text after the last newline) is dropped.
        with open(path) as fh:
            entries = fh.read().split('\n')
        return [entry[2:-3] for entry in entries][:-1]

    for data_point in sorted(glob.glob(input_path + '*/')):
        candidates_file = glob.glob(os.path.join(data_point, 'data', '*_candidates.jsonl'))[0]
        queries_file = glob.glob(os.path.join(data_point, 'data', '*_queries.jsonl'))[0]
        ground_truth_file = glob.glob(os.path.join(data_point, 'groundtruth', '*_groundtruth.npy'))[0]
        q_labels_file = glob.glob(os.path.join(data_point, 'groundtruth', '*_query-labels.txt'))[0]
        c_labels_file = glob.glob(os.path.join(data_point, 'groundtruth', '*_candidate-labels.txt'))[0]

        candidates_df = pd.read_json(candidates_file, lines=True)
        queries_df = pd.read_json(queries_file, lines=True)

        # Only the first ID of each ID list is used as the author key.
        queries_df['authorID'] = queries_df.authorIDs.apply(lambda x: x[0])
        candidates_df['authorID'] = candidates_df.authorSetIDs.apply(lambda x: x[0])

        # Collapse to one row per author holding the list of that author's texts.
        queries_df = queries_df.groupby('authorID').agg({'fullText': lambda x: list(x)}).reset_index()
        candidates_df = candidates_df.groupby('authorID').agg({'fullText': lambda x: list(x)}).reset_index()

        # Loading by path lets numpy open and close the file itself.
        ground_truth_assignment = np.load(ground_truth_file)
        candidate_authors = _read_author_labels(c_labels_file)
        query_authors = _read_author_labels(q_labels_file)

        yield query_authors, candidate_authors, queries_df, candidates_df, ground_truth_assignment
def main(argv=None):
    """Command-line entry point: build Reddit attribution tasks and save them as JSON.

    Parses the input pickle path, the output JSON path and the sampling
    options, delegates task construction to ``get_reddit_data`` and writes
    the resulting list of task dicts to the output path with 4-space
    indentation.

    ``argv`` is the list of command-line arguments to parse; ``None`` (the
    default) makes argparse fall back to ``sys.argv[1:]``, so running the
    script from the shell behaves as before.
    """
    parser = argparse.ArgumentParser(description="Prepare Reddit data for author attribution tasks.")
    parser.add_argument("input_path", type=str, help="Path to the input pandas DataFrame pickle file.")
    parser.add_argument("output_path", type=str, help="Path to save the output JSON file.")
    parser.add_argument("--random_seed", type=int, default=123, help="Random seed for sampling.")
    parser.add_argument("--num_docs", type=int, default=5, help="Number of documents per author for query and correct sets.")
    # Default mirrors get_reddit_data's own default, so existing invocations
    # keep producing the same number of tasks.
    parser.add_argument("--num_instances", type=int, default=50, help="Maximum number of tasks to generate.")
    args = parser.parse_args(argv)

    print(f"Processing data from: {args.input_path}")
    output_data = get_reddit_data(
        input_path=args.input_path,
        random_seed=args.random_seed,
        num_instances=args.num_instances,
        num_documents_per_author=args.num_docs
    )

    print(f"Saving {len(output_data)} tasks to: {args.output_path}")
    with open(args.output_path, 'w') as f:
        json.dump(output_data, f, indent=4)
    print("Done.")


if __name__ == "__main__":
    main()

utils/clustering_utils.py CHANGED Viewed

@@ -7,6 +7,7 @@ from sklearn.metrics import silhouette_score
 from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
 from scipy.stats import pearsonr, ConstantInputWarning
 from typing import List, Dict, Any
 import json
@@ -99,6 +100,7 @@ def clustering_author(background_corpus_df: pd.DataFrame,
     embeddings_list = background_corpus_df[embedding_clm].tolist()
     X_list = []
     original_indices = [] # To map results back to the original DataFrame's indices
@@ -148,17 +150,23 @@ def clustering_author(background_corpus_df: pd.DataFrame,
         print(f"Applying PCA to reduce dimensions from {X.shape[1]} to {pca_dimensions}...")
         pca = PCA(n_components=pca_dimensions, random_state=42)
         X = pca.fit_transform(X)
         # If a test set is provided, transform its embeddings using the same PCA model
         if test_corpus_df is not None:
             test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
-            if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[1] == pca.n_features_in_:
                 print(f"Transforming test set embeddings with the same PCA model...")
                 transformed_test_embeddings = pca.transform(test_embeddings_matrix)
                 # Update the test DataFrame's embedding column with the reduced embeddings
                 test_corpus_df[embedding_clm] = list(transformed_test_embeddings)
             else:
-                print("Warning: Could not apply PCA to test set due to dimension mismatch or invalid data.")
     # For cosine metric, normalize embeddings to unit length.
     # This is standard practice as cosine similarity is equivalent to Euclidean
@@ -167,7 +175,10 @@ def clustering_author(background_corpus_df: pd.DataFrame,
     if metric == 'cosine':
         from sklearn.preprocessing import normalize
         print("Normalizing embeddings for cosine distance...")
-        X = normalize(X, norm='l2', axis=1)
         # Also normalize the test corpus embeddings if they exist
         if test_corpus_df is not None:
@@ -178,11 +189,11 @@ def clustering_author(background_corpus_df: pd.DataFrame,
                 test_corpus_df[embedding_clm] = list(normalized_test_embeddings)
             else:
                 print("Warning: Could not normalize test set embeddings due to invalid data.")
     if eps_values is None:
         if metric == 'cosine':
             #eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
-            eps_values = np.arange(0.01, 0.3, 0.01)
         else: # 'euclidean' or other
             if X.shape[0] > 1:
                 # For Euclidean, eps depends on the scale of the data.
@@ -201,6 +212,7 @@ def clustering_author(background_corpus_df: pd.DataFrame,
     best_score = -1.001
     best_labels = None
     best_eps = None
     # This loop now lives in `clustering_author` to have access to the full DataFrame for evaluation.
     for eps in eps_values:
@@ -211,6 +223,8 @@ def clustering_author(background_corpus_df: pd.DataFrame,
         current_labels = db.fit_predict(X)
         # --- Evaluation Step 1: Silhouette Score ---
         score = _calculate_silhouette_score(X, current_labels, metric)
         if score is not None:
             print(f"  - Silhouette Score: {score:.4f}")
@@ -236,9 +250,9 @@ def clustering_author(background_corpus_df: pd.DataFrame,
         # --- Evaluation Step 3: Distance Preservation on Test Corpus (if provided) ---
         if test_corpus_df is not None:
             # We need the centroids from the current clustering of the background corpus
             centroids = _compute_cluster_centroids(temp_df[temp_df['cluster_label'] != -1], embedding_clm, 'cluster_label')
             test_correlation = evaluate_test_set_distance_preservation(test_corpus_df, centroids, embedding_clm)
             if test_correlation is not None:
                 print(f"  - Test Set Distance Preservation (Pearson r): {test_correlation:.4f}")
@@ -246,7 +260,14 @@ def clustering_author(background_corpus_df: pd.DataFrame,
                 print("  - Test Set Distance Preservation (Pearson r): N/A (not enough test data or clusters)")
         print('Eps {}, #clusters {}, solihouette {}, Pearson {}'.format(eps, len(set(current_labels) - {-1}), score, test_correlation))
     if best_labels is not None:
         num_found_clusters = len(set(best_labels) - {-1})
         print(f"\n--- Best Clustering Result ---")
@@ -450,6 +471,8 @@ def evaluate_test_set_distance_preservation(
     # 2. Project test embeddings into the centroid space and get new distances
     projected_embeddings_matrix = _project_to_centroid_space(test_embeddings_matrix, centroids_map)
     if projected_embeddings_matrix.ndim != 2 or projected_embeddings_matrix.shape[1] < 2:
         return None # Projection failed or resulted in a space with <2 dimensions

 from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
 from scipy.stats import pearsonr, ConstantInputWarning
 from typing import List, Dict, Any
+from tabulate import tabulate
 import json
     embeddings_list = background_corpus_df[embedding_clm].tolist()
     X_list = []
     original_indices = [] # To map results back to the original DataFrame's indices
         print(f"Applying PCA to reduce dimensions from {X.shape[1]} to {pca_dimensions}...")
         pca = PCA(n_components=pca_dimensions, random_state=42)
         X = pca.fit_transform(X)
+        # Update the background_corpus_df with the transformed embeddings
+        # This ensures subsequent centroid calculations use the reduced-dimension space.
+        background_corpus_df[embedding_clm] = list(X)
         # If a test set is provided, transform its embeddings using the same PCA model
         if test_corpus_df is not None:
             test_embeddings_matrix = _safe_embeddings_to_matrix(test_corpus_df[embedding_clm])
+            if test_embeddings_matrix.ndim == 2 and test_embeddings_matrix.shape[0] > 0 and test_embeddings_matrix.shape[1] == pca.n_features_in_:
                 print(f"Transforming test set embeddings with the same PCA model...")
                 transformed_test_embeddings = pca.transform(test_embeddings_matrix)
                 # Update the test DataFrame's embedding column with the reduced embeddings
+                #test_corpus_df.loc[:, embedding_clm] = list(transformed_test_embeddings)
                 test_corpus_df[embedding_clm] = list(transformed_test_embeddings)
             else:
+                print(f"Warning: Could not apply PCA to test set. Test shape: {test_embeddings_matrix.shape}, PCA features: {pca.n_features_in_}")
     # For cosine metric, normalize embeddings to unit length.
     # This is standard practice as cosine similarity is equivalent to Euclidean
     if metric == 'cosine':
         from sklearn.preprocessing import normalize
         print("Normalizing embeddings for cosine distance...")
+        X_normalized = normalize(X, norm='l2', axis=1)
+        # Update the background_corpus_df with the normalized embeddings
+        background_corpus_df[embedding_clm] = list(X_normalized)
+        X = X_normalized # Use the normalized data for clustering
         # Also normalize the test corpus embeddings if they exist
         if test_corpus_df is not None:
                 test_corpus_df[embedding_clm] = list(normalized_test_embeddings)
             else:
                 print("Warning: Could not normalize test set embeddings due to invalid data.")
     if eps_values is None:
         if metric == 'cosine':
             #eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+            eps_values = np.arange(0.01, 0.2, 0.01)
         else: # 'euclidean' or other
             if X.shape[0] > 1:
                 # For Euclidean, eps depends on the scale of the data.
     best_score = -1.001
     best_labels = None
     best_eps = None
+    results_for_table = []
     # This loop now lives in `clustering_author` to have access to the full DataFrame for evaluation.
     for eps in eps_values:
         current_labels = db.fit_predict(X)
         # --- Evaluation Step 1: Silhouette Score ---
+        num_clusters = len(set(current_labels) - {-1})
+        num_outliers = np.sum(current_labels == -1)
         score = _calculate_silhouette_score(X, current_labels, metric)
         if score is not None:
             print(f"  - Silhouette Score: {score:.4f}")
         # --- Evaluation Step 3: Distance Preservation on Test Corpus (if provided) ---
         if test_corpus_df is not None:
+            test_correlation = None
             # We need the centroids from the current clustering of the background corpus
             centroids = _compute_cluster_centroids(temp_df[temp_df['cluster_label'] != -1], embedding_clm, 'cluster_label')
             test_correlation = evaluate_test_set_distance_preservation(test_corpus_df, centroids, embedding_clm)
             if test_correlation is not None:
                 print(f"  - Test Set Distance Preservation (Pearson r): {test_correlation:.4f}")
                 print("  - Test Set Distance Preservation (Pearson r): N/A (not enough test data or clusters)")
         print('Eps {}, #clusters {}, solihouette {}, Pearson {}'.format(eps, len(set(current_labels) - {-1}), score, test_correlation))
+        results_for_table.append([f"{eps:.3f}", f"{score:.4f}" if score is not None else "N/A", f"{test_correlation:.4f}" if test_correlation is not None else "N/A", num_clusters, num_outliers])
+    # --- Print Final Summary Table ---
+    print("\n\n--- Clustering Run Summary ---")
+    headers = ["Epsilon (eps)", "Silhouette Score", "Test Dist. Preserv.", "# Clusters", "# Outliers"]
+    print(tabulate(results_for_table, headers=headers, tablefmt="grid"))
+    print("----------------------------\n")
     if best_labels is not None:
         num_found_clusters = len(set(best_labels) - {-1})
         print(f"\n--- Best Clustering Result ---")
     # 2. Project test embeddings into the centroid space and get new distances
     projected_embeddings_matrix = _project_to_centroid_space(test_embeddings_matrix, centroids_map)
     if projected_embeddings_matrix.ndim != 2 or projected_embeddings_matrix.shape[1] < 2:
         return None # Projection failed or resulted in a space with <2 dimensions

utils/ui.py CHANGED Viewed

@@ -91,7 +91,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
     if mode == "Predefined HRS Task":
         iid = int(iid.replace('Task ', ''))
         data = instances[iid]
-        predicted_author = data['latent_rank'][0]
         ground_truth_author = 100#data['gt_idx']
         mystery_txt = data['Q_fullText']
         c1_txt = data['a0_fullText']
@@ -100,7 +99,7 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
         candidate_texts = [c1_txt, c2_txt, c3_txt]
         #create a dataframe of the task authors
-        task_authors_df  = instance_to_df(instances[iid], predicted_author=predicted_author, ground_truth_author=ground_truth_author)
         print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
     else:
         header_html = "<h3>Custom Uploaded Task</h3>"
@@ -136,10 +135,6 @@ def update_task_display(mode, iid, instances, background_df, mystery_file, cand1
     task_authors_df['g2v_vector'] = task_authors_g2v
     print(f"Gram2Vec feature generation complete")
-    if mode != "Predefined HRS Task":
-        # Computing predicted author by checking pairwise cosine similarity over luar embeddings
-        col_name = f'{model_name.split("/")[-1]}_style_embedding'
-        predicted_author = compute_predicted_author(task_authors_df, col_name)
     #generating html for the task
     header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)

     if mode == "Predefined HRS Task":
         iid = int(iid.replace('Task ', ''))
         data = instances[iid]
         ground_truth_author = 100#data['gt_idx']
         mystery_txt = data['Q_fullText']
         c1_txt = data['a0_fullText']
         candidate_texts = [c1_txt, c2_txt, c3_txt]
         #create a dataframe of the task authors
+        task_authors_df  = instance_to_df(instances[iid], predicted_author=None, ground_truth_author=ground_truth_author)
         print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
     else:
         header_html = "<h3>Custom Uploaded Task</h3>"
     task_authors_df['g2v_vector'] = task_authors_g2v
     print(f"Gram2Vec feature generation complete")
     #generating html for the task
     header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)

utils/visualizations.py CHANGED Viewed

@@ -389,9 +389,9 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
     print(background_authors_embeddings_df.columns)
     print("Generating cluster visualization")
     iid = int(iid)
-    interp      = load_interp_space(cfg)
     # dim2lat     = interp['dimension_to_latent']
-    style_names = interp['dimension_to_style']
     # bg_emb      = np.array(interp['author_embedding'])
     # print(f"bg_emb shape: {bg_emb.shape}")
     #replace with cached embedddings
@@ -544,7 +544,7 @@ def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_inp
     return (
       fig,
     #   update(choices=display_clusters, value=display_clusters[cluster_label_query]),
-      style_names,
       bg_proj,  # Return background points
       bg_ids,    # Return background labels
       background_authors_embeddings_df,  # Return the DataFrame for zoom handling

     print(background_authors_embeddings_df.columns)
     print("Generating cluster visualization")
     iid = int(iid)
+    #interp      = load_interp_space(cfg)
     # dim2lat     = interp['dimension_to_latent']
+    #style_names = interp['dimension_to_style']
     # bg_emb      = np.array(interp['author_embedding'])
     # print(f"bg_emb shape: {bg_emb.shape}")
     #replace with cached embedddings
     return (
       fig,
     #   update(choices=display_clusters, value=display_clusters[cluster_label_query]),
+      None,
       bg_proj,  # Return background points
       bg_ids,    # Return background labels
       background_authors_embeddings_df,  # Return the DataFrame for zoom handling