Commit bd7d9f9 · 1 Parent(s): 8db24a7
Anisha Bhatnagar committed

fixed caching issues in LLM feature identification

utils/interp_space_utils.py CHANGED
@@ -22,7 +22,9 @@ import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity

 CACHE_DIR = "datasets/embeddings_cache"
+ZOOM_CACHE = "datasets/zoom_cache/features_cache.json"
 os.makedirs(CACHE_DIR, exist_ok=True)
+os.makedirs(os.path.dirname(ZOOM_CACHE), exist_ok=True)
 # Bump this whenever there is a change etc...
 CACHE_VERSION = 1

@@ -418,7 +420,34 @@ def compute_clusters_style_representation_2(

     return parsed_response

-def identify_style_features(author_texts: list[str], max_num_feats: int = 5) -> list[str]:
+def generate_cache_key(author_names: List[str], max_num_feats: int) -> str:
+    """Generate a unique cache key based on author names and max features"""
+    # Sort author names to ensure consistent key regardless of order
+    sorted_authors = sorted(author_names)
+    key_data = {
+        "authors": sorted_authors,
+        "max_num_feats": max_num_feats
+    }
+    key_string = json.dumps(key_data, sort_keys=True)
+    return hashlib.md5(key_string.encode()).hexdigest()
+
+def identify_style_features(author_texts: list[str], author_names: list[str], max_num_feats: int = 5) -> list[str]:
+    cache_key = None
+    if author_names:
+        cache_key = generate_cache_key(author_names, max_num_feats)
+
+    if os.path.exists(ZOOM_CACHE):
+        with open(ZOOM_CACHE, 'r') as f:
+            cache = json.load(f)
+    else:
+        cache = {}
+
+    if cache_key in cache:
+        print(f"\nCache hit! Using cached features for authors: {author_names}")
+        return cache[cache_key]["features"]
+    else:
+        print(f"Cache miss. Computing features for authors: {author_names}")
+
     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
     prompt = f"""Identify {max_num_feats} writing style features that are commonly found across the following texts. Do not extract spans. Just return the feature names as a list.
     Author Texts:
@@ -442,7 +471,18 @@ def identify_style_features(author_texts: list[str], max_num_feats: int = 5) ->
         )
         return json.loads(response.choices[0].message.content)

-    return retry_call(_make_call, FeatureIdentificationSchema).features
+    features = retry_call(_make_call, FeatureIdentificationSchema).features
+
+    print(f"Adding to zoom cache")
+    if cache_key and author_names:
+        cache[cache_key] = {
+            "features": features
+        }
+        # save_cache(cache)
+        with open(ZOOM_CACHE, 'w') as f:
+            json.dump(cache, f, indent=2)
+
+        print(f"Cached features for authors: {author_names}")

 def retry_call(call_fn, schema_class, max_attempts=3, wait_sec=2):
     for attempt in range(max_attempts):
@@ -494,7 +534,7 @@ def compute_clusters_style_representation_3(
     author_names = background_corpus_df_feat_id[cluster_label_clm_name].tolist()[:max_num_authors]
     print(f"Number of authors: {len(background_corpus_df_feat_id)}")
     print(author_names)
-    features = identify_style_features(author_texts, max_num_feats=max_num_feats)
+    features = identify_style_features(author_texts, author_names, max_num_feats=max_num_feats)

     # STEP 2: Prepare author pool for span extraction
     span_df = background_corpus_df.iloc[:4]
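
For orientation, here is a condensed, standalone sketch of the caching pattern these hunks introduce: build a deterministic MD5 key from the sorted author names plus max_num_feats, load a JSON cache file before calling the LLM, and rewrite the whole file after a miss. make_cache_key, cached_features, CACHE_PATH, and the compute_fn stub are illustrative names for this sketch, not identifiers from the repository.

    import hashlib
    import json
    import os

    CACHE_PATH = "features_cache.json"  # plays the role of ZOOM_CACHE in the diff above

    def make_cache_key(author_names: list[str], max_num_feats: int) -> str:
        # Sorting makes the key independent of author order, as in generate_cache_key
        payload = json.dumps(
            {"authors": sorted(author_names), "max_num_feats": max_num_feats},
            sort_keys=True,
        )
        return hashlib.md5(payload.encode()).hexdigest()

    def cached_features(author_names: list[str], max_num_feats: int, compute_fn) -> list[str]:
        # Load the existing cache, if any (read side of identify_style_features)
        cache = {}
        if os.path.exists(CACHE_PATH):
            with open(CACHE_PATH, "r") as f:
                cache = json.load(f)

        key = make_cache_key(author_names, max_num_feats)
        if key in cache:
            return cache[key]["features"]   # cache hit: skip the LLM call

        features = compute_fn()             # cache miss: compute_fn stands in for the OpenAI call
        cache[key] = {"features": features}
        with open(CACHE_PATH, "w") as f:    # rewrite the whole file rather than appending
            json.dump(cache, f, indent=2)
        return features

    if __name__ == "__main__":
        # Stubbed computation so the sketch runs without an API key
        feats = cached_features(["author_a", "author_b"], 5,
                                lambda: ["short sentences", "formal tone"])
        print(feats)

Because the key is json.dumps(..., sort_keys=True) over sorted names, the same set of authors always maps to the same cache entry regardless of the order in which they are passed.
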
utils/llm_feat_utils.py CHANGED
@@ -125,7 +125,7 @@ def generate_feature_spans_cached(client, text: str, features: list[str], role:
         result[feat] = spans

     # 5) write back the combined cache
-    with open(cache_path, "a") as f:
+    with open(cache_path, "w") as f:
         json.dump(cache, f, indent=2)
     return result

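
The one-character mode change matters because json.dump in append mode writes a second JSON document after the existing one, so the next json.load of the cache fails with "Extra data"; opening with "w" truncates the file and rewrites the merged cache as a single document. A minimal, self-contained illustration (the temporary file and keys are made up for the example, not repository data):

    import json
    import os
    import tempfile

    # Illustration only: why appending to a JSON cache file breaks it, and why "w" does not.
    path = os.path.join(tempfile.mkdtemp(), "cache.json")

    with open(path, "w") as f:
        json.dump({"feat_a": ["span 1"]}, f)
    with open(path, "a") as f:              # old behaviour: a second document is appended
        json.dump({"feat_b": ["span 2"]}, f)

    try:
        with open(path) as f:
            json.load(f)                    # file now holds "{...}{...}"
    except json.JSONDecodeError as err:
        print("append-mode cache is unreadable:", err)

    with open(path, "w") as f:              # fixed behaviour: rewrite the merged cache in full
        json.dump({"feat_a": ["span 1"], "feat_b": ["span 2"]}, f, indent=2)
    with open(path) as f:
        print(json.load(f))                 # loads cleanly
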