Commit
·
07c4d0f
1
Parent(s):
88507e8
updated to use mystery + predicted
Browse files — utils/interp_space_utils.py (+133 lines, −3 lines)
utils/interp_space_utils.py
CHANGED
|
@@ -31,6 +31,14 @@ os.makedirs(os.path.dirname(REGION_CACHE), exist_ok=True)
|
|
| 31 |
# Bump this whenever the cached data format changes, so stale cache entries are invalidated
|
| 32 |
CACHE_VERSION = 1
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
class style_analysis_schema(BaseModel):
    """Structured schema for a style analysis result.

    NOTE(review): field semantics inferred from names and usage context —
    confirm against the prompt/caller that populates this model.
    """

    # Names of the style features identified.
    features: list[str]
    # Nested mapping: presumably feature name -> {document/author key ->
    # list of text spans exhibiting that feature} — verify against caller.
    spans: dict[str, dict[str, list[str]]]
|
|
@@ -59,8 +67,8 @@ def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd
|
|
| 59 |
print(f"Number of authors after concatenation: {len(clustered_authors_df)}")
|
| 60 |
|
| 61 |
# Gather the input texts (preserves list-of-strings if any)
|
| 62 |
-
#
|
| 63 |
-
author_texts = ['\n\n'.join(x) for x in clustered_authors_df.fullText.tolist()]
|
| 64 |
|
| 65 |
print(f"Number of author_texts: {len(author_texts)}")
|
| 66 |
|
|
@@ -686,7 +694,11 @@ def compute_clusters_g2v_representation(
|
|
| 686 |
|
| 687 |
# Keep only features that have a positive contrastive score
|
| 688 |
top_g2v_feats = sorted(
|
| 689 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
key=lambda x: -x[1] # Sort by contrastive score
|
| 691 |
)
|
| 692 |
|
|
@@ -776,6 +788,124 @@ def compute_clusters_g2v_representation(
|
|
| 776 |
|
| 777 |
return filtered_features[:top_n] # Return tuples with z-scores
|
| 778 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 779 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
| 780 |
|
| 781 |
styles_df = pd.read_csv(styles_df_path)[[feat_clm, "documentID"]]
|
|
|
|
| 31 |
# Bump this whenever there is a change etc...
|
| 32 |
CACHE_VERSION = 1
|
| 33 |
|
| 34 |
+
# Features to exclude from Gram2Vec outputs.
# Any feature whose name starts with one of these prefixes is dropped
# (e.g. raw token counts, which track document length rather than style).
EXCLUDED_G2V_FEATURE_PREFIXES = [
    'num_tokens'
]

# Exact feature names to exclude. Use a set literal (not set([...])) for the
# idiomatic, allocation-free construction.
# NOTE(review): 'num_tokens:num_tokens' is already covered by the
# 'num_tokens' prefix above; kept for explicitness.
EXCLUDED_G2V_FEATURES = {
    'num_tokens:num_tokens',
}
|
| 41 |
+
|
| 42 |
class style_analysis_schema(BaseModel):
|
| 43 |
features: list[str]
|
| 44 |
spans: dict[str, dict[str, list[str]]]
|
|
|
|
| 67 |
print(f"Number of authors after concatenation: {len(clustered_authors_df)}")
|
| 68 |
|
| 69 |
# Gather the input texts (preserves list-of-strings if any)
|
| 70 |
+
# If an entry is a list of strings, join; otherwise use the string as-is
|
| 71 |
+
author_texts = [('\n\n'.join(x) if isinstance(x, list) else x) for x in clustered_authors_df.fullText.tolist()]
|
| 72 |
|
| 73 |
print(f"Number of author_texts: {len(author_texts)}")
|
| 74 |
|
|
|
|
| 694 |
|
| 695 |
# Keep only features that have a positive contrastive score
|
| 696 |
top_g2v_feats = sorted(
|
| 697 |
+
[
|
| 698 |
+
(feat, val, z_score)
|
| 699 |
+
for feat, val, z_score in zip(all_g2v_feats, final_g2v_feats_values, z_scores)
|
| 700 |
+
if val > 0 and feat not in EXCLUDED_G2V_FEATURES and not any(feat.startswith(p) for p in EXCLUDED_G2V_FEATURE_PREFIXES)
|
| 701 |
+
],
|
| 702 |
key=lambda x: -x[1] # Sort by contrastive score
|
| 703 |
)
|
| 704 |
|
|
|
|
| 788 |
|
| 789 |
return filtered_features[:top_n] # Return tuples with z-scores
|
| 790 |
|
| 791 |
+
def compute_task_only_g2v_similarity(
    background_corpus_df: pd.DataFrame,
    visible_author_ids: List[Any],
    features_clm_name: str = 'g2v_vector',
    top_n: int = 10,
    require_spans: bool = True
) -> List[tuple]:
    """
    Compute top Gram2Vec features that are shared between the Mystery author and the
    predicted Candidate author, ignoring background authors and contrast.

    Selection is limited to task authors within the zoom (i.e., present in
    `visible_author_ids`). A feature is kept if:
    - it has a positive value (> 0) for both Mystery and Predicted Candidate,
    - and (optionally) at least one detected span exists in both authors' texts.

    Scoring strategy prioritizes features strong in both authors:
    score = |m| + |p| - |m - p|, which equals 2 * min(m, p) for positive values.

    Parameters
    ----------
    background_corpus_df : pd.DataFrame
        Must carry 'authorID' and a `features_clm_name` column holding a
        dict of feature -> value per author; may also carry a boolean
        'predicted' column and 'fullText' (str or list of str).
    visible_author_ids : List[Any]
        Author IDs currently visible in the zoomed view.
    features_clm_name : str
        Column name of the per-author Gram2Vec feature dict.
    top_n : int
        Maximum number of features to return.
    require_spans : bool
        When True and the span locator is importable, keep only features with
        at least one detected span in BOTH authors' texts.

    Returns
    -------
    List of (feature_name, score) tuples sorted by score desc, limited to top_n.
    Empty list when either the Mystery or the predicted author cannot be resolved.
    """
    task_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}

    # Filter to task authors inside the current zoom.
    is_visible = background_corpus_df['authorID'].isin(visible_author_ids)
    is_task = background_corpus_df['authorID'].isin(task_names)
    visible_task_df = background_corpus_df[is_visible & is_task]

    if visible_task_df.empty:
        return []

    # Identify the Mystery author row, preferring the visible set but falling
    # back to anywhere in the corpus if Mystery is not currently visible.
    mystery_rows = visible_task_df[visible_task_df['authorID'] == 'Mystery author']
    if mystery_rows.empty:
        mystery_rows = background_corpus_df[background_corpus_df['authorID'] == 'Mystery author']
        if mystery_rows.empty:
            return []
    mystery_row = mystery_rows.iloc[0]

    # Identify the predicted candidate via the 'predicted' flag, preferring the
    # visible set, then anywhere in the corpus (restricted to task authors).
    predicted_row = None
    if 'predicted' in visible_task_df.columns:
        pred_candidates = visible_task_df[visible_task_df['predicted'] == True]
        if not pred_candidates.empty:
            predicted_row = pred_candidates.iloc[0]

    if predicted_row is None and 'predicted' in background_corpus_df.columns:
        pred_any = background_corpus_df[background_corpus_df['predicted'] == True]
        # Prefer one that is also a task author.
        if not pred_any.empty:
            pred_any = pred_any[pred_any['authorID'].isin(task_names)]
        if not pred_any.empty:
            predicted_row = pred_any.iloc[0]

    # If still not found, we cannot build a (mystery, predicted) pair.
    if predicted_row is None:
        return []

    mystery_vec = mystery_row.get(features_clm_name, {})
    predicted_vec = predicted_row.get(features_clm_name, {})

    if not isinstance(mystery_vec, dict) or not isinstance(predicted_vec, dict):
        return []

    # Prepare texts for optional span gating.
    def _norm_txt(x):
        # Entries may be a list of document strings; join them for span search.
        if isinstance(x, list):
            return '\n\n'.join(x)
        return str(x)

    mystery_text = _norm_txt(mystery_row.get('fullText', ''))
    predicted_text = _norm_txt(predicted_row.get('fullText', ''))

    # The span locator is optional; degrade gracefully when unavailable.
    try:
        from gram2vec.feature_locator import find_feature_spans as _find_feature_spans
    except Exception:
        _find_feature_spans = None

    shared_features = []
    # Iterate over the union of feature keys (both authors share the same
    # feature space in practice).
    for feature_name in set(mystery_vec) | set(predicted_vec):
        # Exclude globally unwanted features.
        if feature_name in EXCLUDED_G2V_FEATURES or any(feature_name.startswith(p) for p in EXCLUDED_G2V_FEATURE_PREFIXES):
            continue
        m_val = float(mystery_vec.get(feature_name, 0.0))
        p_val = float(predicted_vec.get(feature_name, 0.0))

        # BUGFIX: enforce the documented contract — the feature must have a
        # positive value for BOTH authors. Previously zero/negative features
        # could leak into the ranking.
        if m_val <= 0 or p_val <= 0:
            continue

        # Optional span gate: require at least one span in both texts.
        spans_m = spans_p = None
        if require_spans and _find_feature_spans is not None:
            try:
                spans_m = _find_feature_spans(mystery_text, feature_name) or []
                spans_p = _find_feature_spans(predicted_text, feature_name) or []
                if len(spans_m) == 0 or len(spans_p) == 0:
                    continue
            except Exception:
                # On span-locator errors, skip gating and proceed.
                spans_m = spans_m if spans_m is not None else []
                spans_p = spans_p if spans_p is not None else []

        # Similarity metric: |m| + |p| - |m - p| (== 2*min(m, p) when both > 0).
        score = abs(m_val) + abs(p_val) - abs(m_val - p_val)
        shared_features.append((
            feature_name, score, m_val, p_val,
            len(spans_m) if spans_m is not None else -1,
            len(spans_p) if spans_p is not None else -1,
        ))

    # Rank by score desc and return top_n.
    shared_features.sort(key=lambda x: x[1], reverse=True)
    top = shared_features[:top_n]

    # Debug print of top-N with values and span counts for presence sanity-check.
    try:
        print("[DEBUG] Task-only G2V top features (feature, mystery_val, predicted_val, score | spans_mystery, spans_predicted):")
        for feat_name, sc, m_val, p_val, c_m, c_p in top:
            print(f"  {feat_name} | mystery={m_val:.4f}, predicted={p_val:.4f}, S={sc:.4f} | spans=({c_m}, {c_p})")
    except Exception:
        pass

    return [(f, s) for (f, s, _, _, _, _) in top]
|
| 908 |
+
|
| 909 |
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
| 910 |
|
| 911 |
styles_df = pd.read_csv(styles_df_path)[[feat_clm, "documentID"]]
|