Spaces:

ExplainabiliyForAATeam
/

explainability-tool-for-aa

Sleeping

App Files Files Community

Milad Alshomary committed 29 days ago

Commit

ff6e56b

1 Parent(s): 912be5c

updates

Browse files

Files changed (2) hide show

utils/interp_space_utils.py +12 -23
utils/visualizations.py +33 -31

utils/interp_space_utils.py CHANGED Viewed

@@ -579,12 +579,11 @@ def compute_clusters_style_representation_3(
     max_num_authors=10,
     max_authors_for_span_extraction=4,
     top_k: int = 10,
-    return_only_feats= False,
-    predicted_author: int = None
     ):
     print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
-    print(f"Predicted author: {predicted_author}")
     # STEP 1: Identify features on max_num_authors's max_num_documents_per_author number of documents
     background_corpus_df['fullText'] = background_corpus_df['fullText'].map(lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
     background_corpus_df_feat_id = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
@@ -610,9 +609,6 @@ def compute_clusters_style_representation_3(
     # Filter-in only task authors that are part of the current selection
     task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
-    # Define mystery and predicted author names
-    mystery_author = 'Mystery author'
-    predicted_author_name = f'Candidate Author {predicted_author + 1}' if predicted_author is not None else None
     # Compute feature importance based on Mystery + Predicted author vs. other candidates
     feature_importance = {f : 0 for f in features}
@@ -620,12 +616,7 @@ def compute_clusters_style_representation_3(
         if author in task_author_names.intersection(set(cluster_ids)):
             for feature, spans in feature_map.items():
                 if spans:
-                    # Add span count if Mystery or Predicted author, subtract if other candidate
-                    if author == mystery_author or (predicted_author is not None and author == predicted_author_name):
-                        feature_importance[feature] += len(spans)
-                    else:
-                        # Other candidates - subtract their span counts
-                        feature_importance[feature] -= len(spans)
         else:
             # Background authors - subtract their span counts
             for feature, spans in feature_map.items():
@@ -750,7 +741,7 @@ def compute_clusters_g2v_representation(
     author_ids: List[Any],
     other_author_ids: List[Any],
     features_clm_name: str,
-    top_n: int = 10,
     max_candidates_for_span_sorting: int = 50,
     predicted_author: int = None
 ) -> List[tuple]:  # Changed return type to List[tuple] to include scores
@@ -789,11 +780,11 @@ def compute_clusters_g2v_representation(
     feature_scores.sort(key=lambda x: x[1], reverse=True)
     # 6) Extract top candidates for span-based sorting
-    candidate_features = feature_scores[:max_candidates_for_span_sorting]
     # 7) Extract spans for task authors to sort by frequency
     task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
-    task_authors_in_selection = [aid for aid in author_ids if aid in task_author_names]
     if not task_authors_in_selection:
         # If no task authors in selection, just return the z-score sorted features
@@ -801,15 +792,14 @@ def compute_clusters_g2v_representation(
         return feature_scores[:top_n]
     # Get task author data
-    task_authors_df = background_corpus_df[background_corpus_df['authorID'].isin(task_authors_in_selection)]
-    # Define mystery and predicted author names
-    mystery_author = 'Mystery author'
-    predicted_author_name = f'Candidate Author {predicted_author + 1}' if predicted_author is not None else None
     # Count spans for each feature: +1 for Mystery/Predicted, -1 for other candidates
     feature_span_scores = {}
-    for feat_shorthand, z_score in candidate_features:
         span_score = 0
         for _, author_row in task_authors_df.iterrows():
@@ -822,9 +812,8 @@ def compute_clusters_g2v_representation(
                 # find_feature_spans expects shorthand format like "pos_unigrams:ADJ"
                 spans = find_feature_spans(author_text, feat_shorthand)
                 span_count = len(spans)
                 # Add span count if Mystery or Predicted author, subtract if other candidate
-                if author_name == mystery_author or (predicted_author is not None and author_name == predicted_author_name):
                     span_score += span_count
                 else:
                     # Other candidates - subtract their span counts
@@ -843,7 +832,7 @@ def compute_clusters_g2v_representation(
     print(f"[INFO] Top 5 gram2vec features by span score: {[(f, feature_span_scores.get(f, 0), z) for f, z in sorted_by_spans[:5]]}")
-    return sorted_by_spans[:top_n]
 # Noticed the following function isn't actually referenced anywhere.
 # def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):

     max_num_authors=10,
     max_authors_for_span_extraction=4,
     top_k: int = 10,
+    predicted_author = None,
+    return_only_feats= False
     ):
     print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
     # STEP 1: Identify features on max_num_authors's max_num_documents_per_author number of documents
     background_corpus_df['fullText'] = background_corpus_df['fullText'].map(lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
     background_corpus_df_feat_id = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
     # Filter-in only task authors that are part of the current selection
     task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
     # Compute feature importance based on Mystery + Predicted author vs. other candidates
     feature_importance = {f : 0 for f in features}
         if author in task_author_names.intersection(set(cluster_ids)):
             for feature, spans in feature_map.items():
                 if spans:
+                    feature_importance[feature] += len(spans)
         else:
             # Background authors - subtract their span counts
             for feature, spans in feature_map.items():
     author_ids: List[Any],
     other_author_ids: List[Any],
     features_clm_name: str,
+    top_n: int = 15,
     max_candidates_for_span_sorting: int = 50,
     predicted_author: int = None
 ) -> List[tuple]:  # Changed return type to List[tuple] to include scores
     feature_scores.sort(key=lambda x: x[1], reverse=True)
     # 6) Extract top candidates for span-based sorting
+    candidate_features = feature_scores[:top_n]
     # 7) Extract spans for task authors to sort by frequency
     task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
+    task_authors_in_selection = task_author_names.intersection(set(author_ids))
     if not task_authors_in_selection:
         # If no task authors in selection, just return the z-score sorted features
         return feature_scores[:top_n]
     # Get task author data
+    task_authors_df = background_corpus_df[background_corpus_df['authorID'].isin(task_author_names)]
+    print('len of task_authors_df ', len(task_authors_df))
+    print('zoomed in authors {}'.format(task_authors_in_selection))
     # Count spans for each feature: +1 for Mystery/Predicted, -1 for other candidates
     feature_span_scores = {}
+    for feat_shorthand, _ in candidate_features:
         span_score = 0
         for _, author_row in task_authors_df.iterrows():
                 # find_feature_spans expects shorthand format like "pos_unigrams:ADJ"
                 spans = find_feature_spans(author_text, feat_shorthand)
                 span_count = len(spans)
                 # Add span count if Mystery or Predicted author, subtract if other candidate
+                if author_name in task_authors_in_selection:
                     span_score += span_count
                 else:
                     # Other candidates - subtract their span counts
     print(f"[INFO] Top 5 gram2vec features by span score: {[(f, feature_span_scores.get(f, 0), z) for f, z in sorted_by_spans[:5]]}")
+    return sorted_by_spans
 # Noticed the following function isn't actually referenced anywhere.
 # def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):

utils/visualizations.py CHANGED Viewed

@@ -289,7 +289,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
         author_ids=visible_authors,
         other_author_ids=[],
         features_clm_name='g2v_vector',
-        top_n=50,
         predicted_author=predicted_author
     )
@@ -297,35 +297,36 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
     # Keep only features that have detected spans in at least 2 of the
     # task authors' texts (Mystery + Candidates 1-3)
     # Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
-    task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
-    task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
-    if task_only_df.empty:
-        task_only_df = task_authors_df
-    def _to_text(x):
-        return '\n\n'.join(x) if isinstance(x, list) else x
-    task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
-    print(f"len task_texts: {len(task_texts)}")
-    filtered_g2v_feats = []
-    for feat in g2v_feats:
-        try:
-            # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
-            occurrences = 0
-            for txt in task_texts:
-                spans = find_feature_spans(txt, feat[0])
-                if spans:
-                    occurrences += 1
-            if occurrences >= 2:
-                filtered_g2v_feats.append(feat)
-            else:
-                print(f"[INFO] Dropping G2V feature with <2 task-author spans: {feat}")
-        except Exception as e:
-            print(f"[WARN] Error while checking spans for {feat}: {e}")
-    # After filtering by spans, keep top-N by score
-    filtered_g2v_feats = filtered_g2v_feats[:10]
     # Convert to human readable for display
     HR_g2v_list = []
@@ -333,7 +334,8 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
         HR_g2v = get_fullform(feat[0])
         # print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
         if HR_g2v is None:
-            print(f"Skipping Gram2Vec feature without human readable form: {feat}")
         else:
             HR_g2v_list.append((HR_g2v, feat[1])) #get the score

         author_ids=visible_authors,
         other_author_ids=[],
         features_clm_name='g2v_vector',
+        top_n=15,
         predicted_author=predicted_author
     )
     # Keep only features that have detected spans in at least 2 of the
     # task authors' texts (Mystery + Candidates 1-3)
     # Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
+    # task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
+    # task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
+    # if task_only_df.empty:
+    #     task_only_df = task_authors_df
+    # def _to_text(x):
+    #     return '\n\n'.join(x) if isinstance(x, list) else x
+    # task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
+    # print(f"len task_texts: {len(task_texts)}")
+    # filtered_g2v_feats = []
+    # for feat in g2v_feats:
+    #     try:
+    #         # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
+    #         occurrences = 0
+    #         for txt in task_texts:
+    #             spans = find_feature_spans(txt, feat[0])
+    #             if spans:
+    #                 occurrences += 1
+    #         if occurrences >= 2:
+    #             filtered_g2v_feats.append(feat)
+    #         else:
+    #             print(f"[INFO] Dropping G2V feature with <2 task-author spans: {feat}")
+    #     except Exception as e:
+    #         print(f"[WARN] Error while checking spans for {feat}: {e}")
+    # # After filtering by spans, keep top-N by score
+    # filtered_g2v_feats = filtered_g2v_feats[:10]
+    filtered_g2v_feats = g2v_feats
     # Convert to human readable for display
     HR_g2v_list = []
         HR_g2v = get_fullform(feat[0])
         # print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
         if HR_g2v is None:
+            #print(f"Skipping Gram2Vec feature without human readable form: {feat}")
+            HR_g2v_list.append((feat[0], feat[1])) #get the score
         else:
             HR_g2v_list.append((HR_g2v, feat[1])) #get the score