Milad Alshomary
committed on
Commit
·
ff6e56b
1
Parent(s):
912be5c
updates
Browse files
- utils/interp_space_utils.py +12 -23
- utils/visualizations.py +33 -31
utils/interp_space_utils.py
CHANGED
|
@@ -579,12 +579,11 @@ def compute_clusters_style_representation_3(
|
|
| 579 |
max_num_authors=10,
|
| 580 |
max_authors_for_span_extraction=4,
|
| 581 |
top_k: int = 10,
|
| 582 |
-
|
| 583 |
-
|
| 584 |
):
|
| 585 |
|
| 586 |
print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
|
| 587 |
-
print(f"Predicted author: {predicted_author}")
|
| 588 |
# STEP 1: Identify features on max_num_authors's max_num_documents_per_author number of documents
|
| 589 |
background_corpus_df['fullText'] = background_corpus_df['fullText'].map(lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
|
| 590 |
background_corpus_df_feat_id = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
|
|
@@ -610,9 +609,6 @@ def compute_clusters_style_representation_3(
|
|
| 610 |
# Filter-in only task authors that are part of the current selection
|
| 611 |
task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
|
| 612 |
|
| 613 |
-
# Define mystery and predicted author names
|
| 614 |
-
mystery_author = 'Mystery author'
|
| 615 |
-
predicted_author_name = f'Candidate Author {predicted_author + 1}' if predicted_author is not None else None
|
| 616 |
|
| 617 |
# Compute feature importance based on Mystery + Predicted author vs. other candidates
|
| 618 |
feature_importance = {f : 0 for f in features}
|
|
@@ -620,12 +616,7 @@ def compute_clusters_style_representation_3(
|
|
| 620 |
if author in task_author_names.intersection(set(cluster_ids)):
|
| 621 |
for feature, spans in feature_map.items():
|
| 622 |
if spans:
|
| 623 |
-
|
| 624 |
-
if author == mystery_author or (predicted_author is not None and author == predicted_author_name):
|
| 625 |
-
feature_importance[feature] += len(spans)
|
| 626 |
-
else:
|
| 627 |
-
# Other candidates - subtract their span counts
|
| 628 |
-
feature_importance[feature] -= len(spans)
|
| 629 |
else:
|
| 630 |
# Background authors - subtract their span counts
|
| 631 |
for feature, spans in feature_map.items():
|
|
@@ -750,7 +741,7 @@ def compute_clusters_g2v_representation(
|
|
| 750 |
author_ids: List[Any],
|
| 751 |
other_author_ids: List[Any],
|
| 752 |
features_clm_name: str,
|
| 753 |
-
top_n: int =
|
| 754 |
max_candidates_for_span_sorting: int = 50,
|
| 755 |
predicted_author: int = None
|
| 756 |
) -> List[tuple]: # Changed return type to List[tuple] to include scores
|
|
@@ -789,11 +780,11 @@ def compute_clusters_g2v_representation(
|
|
| 789 |
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
| 790 |
|
| 791 |
# 6) Extract top candidates for span-based sorting
|
| 792 |
-
candidate_features = feature_scores[:
|
| 793 |
|
| 794 |
# 7) Extract spans for task authors to sort by frequency
|
| 795 |
task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
|
| 796 |
-
task_authors_in_selection =
|
| 797 |
|
| 798 |
if not task_authors_in_selection:
|
| 799 |
# If no task authors in selection, just return the z-score sorted features
|
|
@@ -801,15 +792,14 @@ def compute_clusters_g2v_representation(
|
|
| 801 |
return feature_scores[:top_n]
|
| 802 |
|
| 803 |
# Get task author data
|
| 804 |
-
task_authors_df = background_corpus_df[background_corpus_df['authorID'].isin(
|
| 805 |
|
| 806 |
-
# Define mystery and predicted author names
|
| 807 |
-
mystery_author = 'Mystery author'
|
| 808 |
-
predicted_author_name = f'Candidate Author {predicted_author + 1}' if predicted_author is not None else None
|
| 809 |
|
|
|
|
|
|
|
| 810 |
# Count spans for each feature: +1 for Mystery/Predicted, -1 for other candidates
|
| 811 |
feature_span_scores = {}
|
| 812 |
-
for feat_shorthand,
|
| 813 |
span_score = 0
|
| 814 |
|
| 815 |
for _, author_row in task_authors_df.iterrows():
|
|
@@ -822,9 +812,8 @@ def compute_clusters_g2v_representation(
|
|
| 822 |
# find_feature_spans expects shorthand format like "pos_unigrams:ADJ"
|
| 823 |
spans = find_feature_spans(author_text, feat_shorthand)
|
| 824 |
span_count = len(spans)
|
| 825 |
-
|
| 826 |
# Add span count if Mystery or Predicted author, subtract if other candidate
|
| 827 |
-
if author_name
|
| 828 |
span_score += span_count
|
| 829 |
else:
|
| 830 |
# Other candidates - subtract their span counts
|
|
@@ -843,7 +832,7 @@ def compute_clusters_g2v_representation(
|
|
| 843 |
|
| 844 |
print(f"[INFO] Top 5 gram2vec features by span score: {[(f, feature_span_scores.get(f, 0), z) for f, z in sorted_by_spans[:5]]}")
|
| 845 |
|
| 846 |
-
return sorted_by_spans
|
| 847 |
|
| 848 |
# Noticed the following function isnt actually referenced anywhere.
|
| 849 |
# def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
|
|
|
| 579 |
max_num_authors=10,
|
| 580 |
max_authors_for_span_extraction=4,
|
| 581 |
top_k: int = 10,
|
| 582 |
+
predicted_author = None,
|
| 583 |
+
return_only_feats= False
|
| 584 |
):
|
| 585 |
|
| 586 |
print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
|
|
|
|
| 587 |
# STEP 1: Identify features on max_num_authors's max_num_documents_per_author number of documents
|
| 588 |
background_corpus_df['fullText'] = background_corpus_df['fullText'].map(lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
|
| 589 |
background_corpus_df_feat_id = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
|
|
|
|
| 609 |
# Filter-in only task authors that are part of the current selection
|
| 610 |
task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
|
| 611 |
|
|
|
|
|
|
|
|
|
|
| 612 |
|
| 613 |
# Compute feature importance based on Mystery + Predicted author vs. other candidates
|
| 614 |
feature_importance = {f : 0 for f in features}
|
|
|
|
| 616 |
if author in task_author_names.intersection(set(cluster_ids)):
|
| 617 |
for feature, spans in feature_map.items():
|
| 618 |
if spans:
|
| 619 |
+
feature_importance[feature] += len(spans)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
else:
|
| 621 |
# Background authors - subtract their span counts
|
| 622 |
for feature, spans in feature_map.items():
|
|
|
|
| 741 |
author_ids: List[Any],
|
| 742 |
other_author_ids: List[Any],
|
| 743 |
features_clm_name: str,
|
| 744 |
+
top_n: int = 15,
|
| 745 |
max_candidates_for_span_sorting: int = 50,
|
| 746 |
predicted_author: int = None
|
| 747 |
) -> List[tuple]: # Changed return type to List[tuple] to include scores
|
|
|
|
| 780 |
feature_scores.sort(key=lambda x: x[1], reverse=True)
|
| 781 |
|
| 782 |
# 6) Extract top candidates for span-based sorting
|
| 783 |
+
candidate_features = feature_scores[:top_n]
|
| 784 |
|
| 785 |
# 7) Extract spans for task authors to sort by frequency
|
| 786 |
task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
|
| 787 |
+
task_authors_in_selection = task_author_names.intersection(set(author_ids))
|
| 788 |
|
| 789 |
if not task_authors_in_selection:
|
| 790 |
# If no task authors in selection, just return the z-score sorted features
|
|
|
|
| 792 |
return feature_scores[:top_n]
|
| 793 |
|
| 794 |
# Get task author data
|
| 795 |
+
task_authors_df = background_corpus_df[background_corpus_df['authorID'].isin(task_author_names)]
|
| 796 |
|
|
|
|
|
|
|
|
|
|
| 797 |
|
| 798 |
+
print('len of task_authors_df ', len(task_authors_df))
|
| 799 |
+
print('zoomed in authors {}'.format(task_authors_in_selection))
|
| 800 |
# Count spans for each feature: +1 for Mystery/Predicted, -1 for other candidates
|
| 801 |
feature_span_scores = {}
|
| 802 |
+
for feat_shorthand, _ in candidate_features:
|
| 803 |
span_score = 0
|
| 804 |
|
| 805 |
for _, author_row in task_authors_df.iterrows():
|
|
|
|
| 812 |
# find_feature_spans expects shorthand format like "pos_unigrams:ADJ"
|
| 813 |
spans = find_feature_spans(author_text, feat_shorthand)
|
| 814 |
span_count = len(spans)
|
|
|
|
| 815 |
# Add span count if Mystery or Predicted author, subtract if other candidate
|
| 816 |
+
if author_name in task_authors_in_selection:
|
| 817 |
span_score += span_count
|
| 818 |
else:
|
| 819 |
# Other candidates - subtract their span counts
|
|
|
|
| 832 |
|
| 833 |
print(f"[INFO] Top 5 gram2vec features by span score: {[(f, feature_span_scores.get(f, 0), z) for f, z in sorted_by_spans[:5]]}")
|
| 834 |
|
| 835 |
+
return sorted_by_spans
|
| 836 |
|
| 837 |
# Noticed the following function isnt actually referenced anywhere.
|
| 838 |
# def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
|
utils/visualizations.py
CHANGED
|
@@ -289,7 +289,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
| 289 |
author_ids=visible_authors,
|
| 290 |
other_author_ids=[],
|
| 291 |
features_clm_name='g2v_vector',
|
| 292 |
-
top_n=
|
| 293 |
predicted_author=predicted_author
|
| 294 |
)
|
| 295 |
|
|
@@ -297,35 +297,36 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
| 297 |
# Keep only features that have detected spans in at least 2 of the
|
| 298 |
# task authors' texts (Mystery + Candidates 1-3)
|
| 299 |
# Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
|
| 300 |
-
task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
|
| 301 |
-
task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
|
| 302 |
-
if task_only_df.empty:
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
def _to_text(x):
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
|
| 309 |
-
|
| 310 |
-
print(f"len task_texts: {len(task_texts)}")
|
| 311 |
-
filtered_g2v_feats = []
|
| 312 |
-
for feat in g2v_feats:
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
# After filtering by spans, keep top-N by score
|
| 328 |
-
filtered_g2v_feats = filtered_g2v_feats[:10]
|
|
|
|
| 329 |
|
| 330 |
# Convert to human readable for display
|
| 331 |
HR_g2v_list = []
|
|
@@ -333,7 +334,8 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
|
|
| 333 |
HR_g2v = get_fullform(feat[0])
|
| 334 |
# print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
|
| 335 |
if HR_g2v is None:
|
| 336 |
-
print(f"Skipping Gram2Vec feature without human readable form: {feat}")
|
|
|
|
| 337 |
else:
|
| 338 |
HR_g2v_list.append((HR_g2v, feat[1])) #get the score
|
| 339 |
|
|
|
|
| 289 |
author_ids=visible_authors,
|
| 290 |
other_author_ids=[],
|
| 291 |
features_clm_name='g2v_vector',
|
| 292 |
+
top_n=15,
|
| 293 |
predicted_author=predicted_author
|
| 294 |
)
|
| 295 |
|
|
|
|
| 297 |
# Keep only features that have detected spans in at least 2 of the
|
| 298 |
# task authors' texts (Mystery + Candidates 1-3)
|
| 299 |
# Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
|
| 300 |
+
# task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
|
| 301 |
+
# task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
|
| 302 |
+
# if task_only_df.empty:
|
| 303 |
+
# task_only_df = task_authors_df
|
| 304 |
+
|
| 305 |
+
# def _to_text(x):
|
| 306 |
+
# return '\n\n'.join(x) if isinstance(x, list) else x
|
| 307 |
+
|
| 308 |
+
# task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
|
| 309 |
+
|
| 310 |
+
# print(f"len task_texts: {len(task_texts)}")
|
| 311 |
+
# filtered_g2v_feats = []
|
| 312 |
+
# for feat in g2v_feats:
|
| 313 |
+
# try:
|
| 314 |
+
# # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
|
| 315 |
+
# occurrences = 0
|
| 316 |
+
# for txt in task_texts:
|
| 317 |
+
# spans = find_feature_spans(txt, feat[0])
|
| 318 |
+
# if spans:
|
| 319 |
+
# occurrences += 1
|
| 320 |
+
# if occurrences >= 2:
|
| 321 |
+
# filtered_g2v_feats.append(feat)
|
| 322 |
+
# else:
|
| 323 |
+
# print(f"[INFO] Dropping G2V feature with <2 task-author spans: {feat}")
|
| 324 |
+
# except Exception as e:
|
| 325 |
+
# print(f"[WARN] Error while checking spans for {feat}: {e}")
|
| 326 |
+
|
| 327 |
+
# # After filtering by spans, keep top-N by score
|
| 328 |
+
# filtered_g2v_feats = filtered_g2v_feats[:10]
|
| 329 |
+
filtered_g2v_feats = g2v_feats
|
| 330 |
|
| 331 |
# Convert to human readable for display
|
| 332 |
HR_g2v_list = []
|
|
|
|
| 334 |
HR_g2v = get_fullform(feat[0])
|
| 335 |
# print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
|
| 336 |
if HR_g2v is None:
|
| 337 |
+
#print(f"Skipping Gram2Vec feature without human readable form: {feat}")
|
| 338 |
+
HR_g2v_list.append((feat[0], feat[1])) #get the score
|
| 339 |
else:
|
| 340 |
HR_g2v_list.append((HR_g2v, feat[1])) #get the score
|
| 341 |
|