Milad Alshomary committed on
Commit
ff6e56b
·
1 Parent(s): 912be5c
Files changed (2) hide show
  1. utils/interp_space_utils.py +12 -23
  2. utils/visualizations.py +33 -31
utils/interp_space_utils.py CHANGED
@@ -579,12 +579,11 @@ def compute_clusters_style_representation_3(
579
  max_num_authors=10,
580
  max_authors_for_span_extraction=4,
581
  top_k: int = 10,
582
- return_only_feats= False,
583
- predicted_author: int = None
584
  ):
585
 
586
  print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
587
- print(f"Predicted author: {predicted_author}")
588
  # STEP 1: Identify features on max_num_authors's max_num_documents_per_author number of documents
589
  background_corpus_df['fullText'] = background_corpus_df['fullText'].map(lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
590
  background_corpus_df_feat_id = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
@@ -610,9 +609,6 @@ def compute_clusters_style_representation_3(
610
  # Filter-in only task authors that are part of the current selection
611
  task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
612
 
613
- # Define mystery and predicted author names
614
- mystery_author = 'Mystery author'
615
- predicted_author_name = f'Candidate Author {predicted_author + 1}' if predicted_author is not None else None
616
 
617
  # Compute feature importance based on Mystery + Predicted author vs. other candidates
618
  feature_importance = {f : 0 for f in features}
@@ -620,12 +616,7 @@ def compute_clusters_style_representation_3(
620
  if author in task_author_names.intersection(set(cluster_ids)):
621
  for feature, spans in feature_map.items():
622
  if spans:
623
- # Add span count if Mystery or Predicted author, subtract if other candidate
624
- if author == mystery_author or (predicted_author is not None and author == predicted_author_name):
625
- feature_importance[feature] += len(spans)
626
- else:
627
- # Other candidates - subtract their span counts
628
- feature_importance[feature] -= len(spans)
629
  else:
630
  # Background authors - subtract their span counts
631
  for feature, spans in feature_map.items():
@@ -750,7 +741,7 @@ def compute_clusters_g2v_representation(
750
  author_ids: List[Any],
751
  other_author_ids: List[Any],
752
  features_clm_name: str,
753
- top_n: int = 10,
754
  max_candidates_for_span_sorting: int = 50,
755
  predicted_author: int = None
756
  ) -> List[tuple]: # Changed return type to List[tuple] to include scores
@@ -789,11 +780,11 @@ def compute_clusters_g2v_representation(
789
  feature_scores.sort(key=lambda x: x[1], reverse=True)
790
 
791
  # 6) Extract top candidates for span-based sorting
792
- candidate_features = feature_scores[:max_candidates_for_span_sorting]
793
 
794
  # 7) Extract spans for task authors to sort by frequency
795
  task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
796
- task_authors_in_selection = [aid for aid in author_ids if aid in task_author_names]
797
 
798
  if not task_authors_in_selection:
799
  # If no task authors in selection, just return the z-score sorted features
@@ -801,15 +792,14 @@ def compute_clusters_g2v_representation(
801
  return feature_scores[:top_n]
802
 
803
  # Get task author data
804
- task_authors_df = background_corpus_df[background_corpus_df['authorID'].isin(task_authors_in_selection)]
805
 
806
- # Define mystery and predicted author names
807
- mystery_author = 'Mystery author'
808
- predicted_author_name = f'Candidate Author {predicted_author + 1}' if predicted_author is not None else None
809
 
 
 
810
  # Count spans for each feature: +1 for Mystery/Predicted, -1 for other candidates
811
  feature_span_scores = {}
812
- for feat_shorthand, z_score in candidate_features:
813
  span_score = 0
814
 
815
  for _, author_row in task_authors_df.iterrows():
@@ -822,9 +812,8 @@ def compute_clusters_g2v_representation(
822
  # find_feature_spans expects shorthand format like "pos_unigrams:ADJ"
823
  spans = find_feature_spans(author_text, feat_shorthand)
824
  span_count = len(spans)
825
-
826
  # Add span count if Mystery or Predicted author, subtract if other candidate
827
- if author_name == mystery_author or (predicted_author is not None and author_name == predicted_author_name):
828
  span_score += span_count
829
  else:
830
  # Other candidates - subtract their span counts
@@ -843,7 +832,7 @@ def compute_clusters_g2v_representation(
843
 
844
  print(f"[INFO] Top 5 gram2vec features by span score: {[(f, feature_span_scores.get(f, 0), z) for f, z in sorted_by_spans[:5]]}")
845
 
846
- return sorted_by_spans[:top_n]
847
 
848
  # Noticed the following function isn't actually referenced anywhere.
849
  # def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
 
579
  max_num_authors=10,
580
  max_authors_for_span_extraction=4,
581
  top_k: int = 10,
582
+ predicted_author = None,
583
+ return_only_feats= False
584
  ):
585
 
586
  print(f"Computing style representation for visible clusters: {len(cluster_ids)}")
 
587
  # STEP 1: Identify features on max_num_authors's max_num_documents_per_author number of documents
588
  background_corpus_df['fullText'] = background_corpus_df['fullText'].map(lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
589
  background_corpus_df_feat_id = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
 
609
  # Filter-in only task authors that are part of the current selection
610
  task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
611
 
 
 
 
612
 
613
  # Compute feature importance based on Mystery + Predicted author vs. other candidates
614
  feature_importance = {f : 0 for f in features}
 
616
  if author in task_author_names.intersection(set(cluster_ids)):
617
  for feature, spans in feature_map.items():
618
  if spans:
619
+ feature_importance[feature] += len(spans)
 
 
 
 
 
620
  else:
621
  # Background authors - subtract their span counts
622
  for feature, spans in feature_map.items():
 
741
  author_ids: List[Any],
742
  other_author_ids: List[Any],
743
  features_clm_name: str,
744
+ top_n: int = 15,
745
  max_candidates_for_span_sorting: int = 50,
746
  predicted_author: int = None
747
  ) -> List[tuple]: # Changed return type to List[tuple] to include scores
 
780
  feature_scores.sort(key=lambda x: x[1], reverse=True)
781
 
782
  # 6) Extract top candidates for span-based sorting
783
+ candidate_features = feature_scores[:top_n]
784
 
785
  # 7) Extract spans for task authors to sort by frequency
786
  task_author_names = {'Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3'}
787
+ task_authors_in_selection = task_author_names.intersection(set(author_ids))
788
 
789
  if not task_authors_in_selection:
790
  # If no task authors in selection, just return the z-score sorted features
 
792
  return feature_scores[:top_n]
793
 
794
  # Get task author data
795
+ task_authors_df = background_corpus_df[background_corpus_df['authorID'].isin(task_author_names)]
796
 
 
 
 
797
 
798
+ print('len of task_authors_df ', len(task_authors_df))
799
+ print('zoomed in authors {}'.format(task_authors_in_selection))
800
  # Count spans for each feature: +1 for Mystery/Predicted, -1 for other candidates
801
  feature_span_scores = {}
802
+ for feat_shorthand, _ in candidate_features:
803
  span_score = 0
804
 
805
  for _, author_row in task_authors_df.iterrows():
 
812
  # find_feature_spans expects shorthand format like "pos_unigrams:ADJ"
813
  spans = find_feature_spans(author_text, feat_shorthand)
814
  span_count = len(spans)
 
815
  # Add span count if Mystery or Predicted author, subtract if other candidate
816
+ if author_name in task_authors_in_selection:
817
  span_score += span_count
818
  else:
819
  # Other candidates - subtract their span counts
 
832
 
833
  print(f"[INFO] Top 5 gram2vec features by span score: {[(f, feature_span_scores.get(f, 0), z) for f, z in sorted_by_spans[:5]]}")
834
 
835
+ return sorted_by_spans
836
 
837
  # Noticed the following function isn't actually referenced anywhere.
838
  # def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
utils/visualizations.py CHANGED
@@ -289,7 +289,7 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
289
  author_ids=visible_authors,
290
  other_author_ids=[],
291
  features_clm_name='g2v_vector',
292
- top_n=50,
293
  predicted_author=predicted_author
294
  )
295
 
@@ -297,35 +297,36 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
297
  # Keep only features that have detected spans in at least 2 of the
298
  # task authors' texts (Mystery + Candidates 1-3)
299
  # Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
300
- task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
301
- task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
302
- if task_only_df.empty:
303
- task_only_df = task_authors_df
304
-
305
- def _to_text(x):
306
- return '\n\n'.join(x) if isinstance(x, list) else x
307
-
308
- task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
309
-
310
- print(f"len task_texts: {len(task_texts)}")
311
- filtered_g2v_feats = []
312
- for feat in g2v_feats:
313
- try:
314
- # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
315
- occurrences = 0
316
- for txt in task_texts:
317
- spans = find_feature_spans(txt, feat[0])
318
- if spans:
319
- occurrences += 1
320
- if occurrences >= 2:
321
- filtered_g2v_feats.append(feat)
322
- else:
323
- print(f"[INFO] Dropping G2V feature with <2 task-author spans: {feat}")
324
- except Exception as e:
325
- print(f"[WARN] Error while checking spans for {feat}: {e}")
326
-
327
- # After filtering by spans, keep top-N by score
328
- filtered_g2v_feats = filtered_g2v_feats[:10]
 
329
 
330
  # Convert to human readable for display
331
  HR_g2v_list = []
@@ -333,7 +334,8 @@ def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors
333
  HR_g2v = get_fullform(feat[0])
334
  # print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
335
  if HR_g2v is None:
336
- print(f"Skipping Gram2Vec feature without human readable form: {feat}")
 
337
  else:
338
  HR_g2v_list.append((HR_g2v, feat[1])) #get the score
339
 
 
289
  author_ids=visible_authors,
290
  other_author_ids=[],
291
  features_clm_name='g2v_vector',
292
+ top_n=15,
293
  predicted_author=predicted_author
294
  )
295
 
 
297
  # Keep only features that have detected spans in at least 2 of the
298
  # task authors' texts (Mystery + Candidates 1-3)
299
  # Use only the task authors (Mystery + Candidates 1-3), not the zoom-visible set
300
+ # task_author_ids = {"Mystery author", "Candidate Author 1", "Candidate Author 2", "Candidate Author 3"}
301
+ # task_only_df = task_authors_df[task_authors_df['authorID'].isin(task_author_ids)]
302
+ # if task_only_df.empty:
303
+ # task_only_df = task_authors_df
304
+
305
+ # def _to_text(x):
306
+ # return '\n\n'.join(x) if isinstance(x, list) else x
307
+
308
+ # task_texts = [_to_text(x) for x in task_only_df['fullText'].tolist()]
309
+
310
+ # print(f"len task_texts: {len(task_texts)}")
311
+ # filtered_g2v_feats = []
312
+ # for feat in g2v_feats:
313
+ # try:
314
+ # # `feat` is shorthand already (e.g., 'pos_bigrams:NOUN PROPN')
315
+ # occurrences = 0
316
+ # for txt in task_texts:
317
+ # spans = find_feature_spans(txt, feat[0])
318
+ # if spans:
319
+ # occurrences += 1
320
+ # if occurrences >= 2:
321
+ # filtered_g2v_feats.append(feat)
322
+ # else:
323
+ # print(f"[INFO] Dropping G2V feature with <2 task-author spans: {feat}")
324
+ # except Exception as e:
325
+ # print(f"[WARN] Error while checking spans for {feat}: {e}")
326
+
327
+ # # After filtering by spans, keep top-N by score
328
+ # filtered_g2v_feats = filtered_g2v_feats[:10]
329
+ filtered_g2v_feats = g2v_feats
330
 
331
  # Convert to human readable for display
332
  HR_g2v_list = []
 
334
  HR_g2v = get_fullform(feat[0])
335
  # print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
336
  if HR_g2v is None:
337
+ #print(f"Skipping Gram2Vec feature without human readable form: {feat}")
338
+ HR_g2v_list.append((feat[0], feat[1])) #get the score
339
  else:
340
  HR_g2v_list.append((HR_g2v, feat[1])) #get the score
341