taufeeque committed on
Commit
63b5bc1
•
1 Parent(s): b2a4148

Update code

Files changed (7)
  1. .gitignore +3 -0
  2. Code_Browser.py +180 -140
  3. README.md +2 -2
  4. code_search_utils.py +201 -97
  5. pages/Concept_Code.py +5 -17
  6. utils.py +187 -232
  7. webapp_utils.py +21 -9
.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ __pycache__/
2
+ hgf_webapp/
3
+ .vscode/
Code_Browser.py CHANGED
@@ -1,15 +1,38 @@
1
  """Web App for the Codebook Features project."""
2
 
 
3
  import glob
4
  import os
5
 
6
  import streamlit as st
7
 
8
  import code_search_utils
 
9
  import webapp_utils
10
 
11
- DEPLOY_MODE = True
12
13
 
14
  webapp_utils.load_widget_state()
15
 
@@ -20,14 +43,17 @@ st.set_page_config(
20
 
21
  st.title("Codebook Features")
22
23
  pretty_model_names = {
24
  "TinyStories-1Layer-21M#100ksteps_vcb_mlp": "TinyStories-1L-21M-MLP",
25
- "TinyStories-1Layer-21M_ccb_attn_preproj": "TinyStories-1L-21M-Attn",
26
- "TinyStories-33M_ccb_attn_preproj": "TinyStories-4L-33M-Attn",
 
27
  }
28
  orig_model_name = {v: k for k, v in pretty_model_names.items()}
29
 
30
- base_cache_dir = "cache/"
31
  dirs = glob.glob(base_cache_dir + "models/*/")
32
  model_name_options = [d.split("/")[-2].split("_")[:-2] for d in dirs]
33
  model_name_options = ["_".join(m) for m in model_name_options]
@@ -41,25 +67,23 @@ p_model_name = st.selectbox(
41
  key=webapp_utils.persist("model_name"),
42
  )
43
  model_name = orig_model_name.get(p_model_name, p_model_name)
44
- model = model_name.split("_")[0].split("#")[0]
45
- ccb = model_name.split("_")[1]
46
- ccb = "_ccb" if ccb == "ccb" else ""
47
- cb_at = "_".join(model_name.split("_")[2:])
48
- seq_len = 512 if "tinystories" in model_name.lower() else 1024
49
- st.session_state["seq_len"] = seq_len
50
 
51
  codes_cache_path = base_cache_dir + f"models/{model_name}_*"
52
  dirs = glob.glob(codes_cache_path)
53
  dirs.sort(key=os.path.getmtime)
54
 
55
  # session states
56
- is_attn = "attn" in cb_at
57
  codes_cache_path = dirs[-1] + "/"
58
 
59
- model_info = code_search_utils.parse_model_info(codes_cache_path)
60
  num_codes = model_info.num_codes
61
  num_layers = model_info.n_layers
62
  num_heads = model_info.n_heads
63
  dataset_cache_path = base_cache_dir + f"datasets/{model_info.dataset_name}/"
64
 
65
  (
@@ -70,9 +94,12 @@ dataset_cache_path = base_cache_dir + f"datasets/{model_info.dataset_name}/"
70
  act_count_ft_tkns,
71
  metrics,
72
  ) = webapp_utils.load_code_search_cache(codes_cache_path, dataset_cache_path)
 
73
  metric_keys = ["eval_loss", "eval_accuracy", "eval_dead_code_fraction"]
74
  metrics = {k: v for k, v in metrics.items() if k.split("/")[0] in metric_keys}
75
 
 
 
76
  st.session_state["model_name_id"] = model_name
77
  st.session_state["cb_acts"] = cb_acts
78
  st.session_state["tokens_text"] = tokens_text
@@ -80,11 +107,13 @@ st.session_state["tokens_str"] = tokens_str
80
  st.session_state["act_count_ft_tkns"] = act_count_ft_tkns
81
 
82
  st.session_state["num_codes"] = num_codes
83
- st.session_state["ccb"] = ccb
84
  st.session_state["cb_at"] = cb_at
85
  st.session_state["is_attn"] = is_attn
 
86
 
87
- if not DEPLOY_MODE:
 
88
  st.markdown("## Metrics")
89
  # hide metrics by default
90
  if st.checkbox("Show Model Metrics"):
@@ -93,7 +122,7 @@ if not DEPLOY_MODE:
93
  st.markdown("## Demo Codes")
94
  demo_codes_desc = (
95
  "This section contains codes that we've found to be interpretable along "
96
- "with a description of the feature we think they are capturing."
97
  "Click on the πŸ” search button for a code to see the tokens that code activates on."
98
  )
99
  st.write(demo_codes_desc)
@@ -144,7 +173,7 @@ if st.checkbox("Show Demo Codes"):
144
  continue
145
  if skip:
146
  continue
147
- code_info = code_search_utils.get_code_info_pr_from_str(code_txt, code_regex)
148
  comp_info = f"layer{code_info.layer}_{f'head{code_info.head}' if code_info.head is not None else ''}"
149
  button_key = (
150
  f"demo_search_code{code_info.code}_layer{code_info.layer}_desc-{code_info.description}"
@@ -167,150 +196,160 @@ if st.checkbox("Show Demo Codes"):
167
  cols[-1].write(code_desc)
168
  skip = True
169
 
 
170
 
171
  st.markdown("## Code Search")
172
-
173
- regex_pattern = st.text_input(
174
- "Enter a regex pattern",
175
- help="Wrap code token in the first group. E.g. New (York)",
176
- key="regex_pattern",
 
 
177
  )
178
- # topk = st.slider("Top K", 1, 20, 10)
179
- prec_col, sort_col = st.columns(2)
180
- prec_threshold = prec_col.slider(
181
- "Precision Threshold",
182
- 0.0,
183
- 1.0,
184
- 0.9,
185
- help="Shows codes with precision on the regex pattern above the threshold.",
186
- )
187
- sort_by_options = ["Precision", "Recall", "Num Acts"]
188
- sort_by_name = sort_col.radio(
189
- "Sort By",
190
- sort_by_options,
191
- index=0,
192
- horizontal=True,
193
- help="Sorts the codes by the selected metric.",
194
- )
195
- sort_by = sort_by_options.index(sort_by_name)
196
-
197
-
198
- @st.cache_data(ttl=3600)
199
- def get_codebook_wise_codes_for_regex(regex_pattern, prec_threshold, ccb, model_name):
200
- """Get codebook wise codes for a given regex pattern."""
201
- assert model_name is not None # required for loading from correct cache data
202
- return code_search_utils.get_codes_from_pattern(
203
- regex_pattern,
204
- tokens_text,
205
- token_byte_pos,
206
- cb_acts,
207
- act_count_ft_tkns,
208
- ccb=ccb,
209
- topk=8,
210
- prec_threshold=prec_threshold,
211
- )
212
 
213
-
214
- if regex_pattern:
215
- codebook_wise_codes, re_token_matches = get_codebook_wise_codes_for_regex(
216
- regex_pattern,
217
- prec_threshold,
218
- ccb,
219
- model_name,
220
  )
221
- st.markdown(
222
- f"Found <span style='color:green;'>{re_token_matches}</span> matches",
223
- unsafe_allow_html=True,
 
224
  )
225
- num_search_cols = 7 if is_attn else 6
226
- non_deploy_offset = 0
227
- if not DEPLOY_MODE:
228
- non_deploy_offset = 1
229
- num_search_cols += non_deploy_offset
230
-
231
- cols = st.columns(num_search_cols)
232
-
233
- # st.markdown(button_height_style, unsafe_allow_html=True)
234
-
235
- cols[0].markdown("Search", help="Button to see token activations for the code.")
236
- cols[1].write("Layer")
237
- if is_attn:
238
- cols[2].write("Head")
239
- cols[-4 - non_deploy_offset].write("Code")
240
- cols[-3 - non_deploy_offset].write("Precision")
241
- cols[-2 - non_deploy_offset].write("Recall")
242
- cols[-1 - non_deploy_offset].markdown(
243
- "Num Acts",
244
- help="Number of tokens that the code activates on in the acts dataset.",
245
  )
246
- if not DEPLOY_MODE:
247
- cols[-1].markdown(
248
- "Save to Demos",
249
- help="Button to save the code to demos along with the regex pattern.",
250
- )
251
- all_codes = codebook_wise_codes.items()
252
- all_codes = [
253
- (cb_name, code_pr_info)
254
- for cb_name, code_pr_infos in all_codes
255
- for code_pr_info in code_pr_infos
256
- ]
257
- all_codes = sorted(all_codes, key=lambda x: x[1][1 + sort_by], reverse=True)
258
- for cb_name, (code, prec, rec, code_acts) in all_codes:
259
- layer_head = cb_name.split("_")
260
- layer = layer_head[0][5:]
261
- head = layer_head[1][4:] if len(layer_head) > 1 else None
262
- button_key = f"search_code{code}_layer{layer}" + (
263
- f"head{head}" if head is not None else ""
264
  )
265
- cols = st.columns(num_search_cols)
266
- extra_args = {
267
- "prec": prec,
268
- "recall": rec,
269
- "num_acts": code_acts,
270
- "regex": regex_pattern,
271
- }
272
- button_clicked = cols[0].button("🔍", key=button_key)
273
- if button_clicked:
274
- webapp_utils.set_ct_acts(code, layer, head, extra_args, is_attn)
275
- cols[1].write(layer)
276
- if is_attn:
277
- cols[2].write(head)
278
- cols[-4 - non_deploy_offset].write(code)
279
- cols[-3 - non_deploy_offset].write(f"{prec*100:.2f}%")
280
- cols[-2 - non_deploy_offset].write(f"{rec*100:.2f}%")
281
- cols[-1 - non_deploy_offset].write(str(code_acts))
282
- if not DEPLOY_MODE:
283
- webapp_utils.add_save_code_button(
284
- demo_file_path,
285
- num_acts=code_acts,
286
- save_regex=True,
287
- prec=prec,
288
- recall=rec,
289
- button_st_container=cols[-1],
290
- button_key_suffix=f"_code{code}_layer{layer}_head{head}",
291
- )
292
 
293
- if len(all_codes) == 0:
294
  st.markdown(
295
- f"""
296
- <div style="font-size: 1.0rem; color: red;">
297
- No codes found for pattern {regex_pattern} at precision threshold: {prec_threshold}
298
- </div>
299
- """,
300
  unsafe_allow_html=True,
301
  )
302
 
 
303
 
304
  st.markdown("## Code Token Activations")
305
 
306
- filter_codes = st.checkbox("Show filters", key="filter_codes")
307
  act_range, layer_code_acts = None, None
308
  if filter_codes:
309
  act_range = st.slider(
310
- "Num Acts",
311
  0,
312
  10_000,
313
- (100, 10_000),
314
  key="ct_act_range",
315
  help="Filter codes by the number of tokens they activate on.",
316
  )
@@ -361,6 +400,7 @@ acts, acts_count = webapp_utils.get_code_acts(
361
  head,
362
  ctx_size,
363
  num_examples,
 
364
  )
365
 
366
  st.write(
@@ -368,7 +408,7 @@ st.write(
368
  f"Activates on {acts_count[0]} tokens on the acts dataset",
369
  )
370
 
371
- if not DEPLOY_MODE:
372
  webapp_utils.add_save_code_button(
373
  demo_file_path,
374
  acts_count[0],
 
1
  """Web App for the Codebook Features project."""
2
 
3
+ import argparse
4
  import glob
5
  import os
6
 
7
  import streamlit as st
8
 
9
  import code_search_utils
10
+ import utils
11
  import webapp_utils
12
 
13
+ # --- Parse command line arguments ---
14
 
15
+ parser = argparse.ArgumentParser()
16
+ parser.add_argument(
17
+ "--deploy",
18
+ default=True,
19
+ # Coerce CLI strings like "False" or "0" to a real bool; argparse would
+ # otherwise treat any non-empty string value as truthy.
+ type=lambda x: str(x).lower() not in ("false", "0", "no"),
+ help="Deploy mode.",
20
+ )
21
+ parser.add_argument(
22
+ "--cache_dir",
23
+ type=str,
24
+ default="cache/",
25
+ help="Path to directory containing cache for codebook models.",
26
+ )
27
+ try:
28
+ args = parser.parse_args()
29
+ except SystemExit as e:
30
+ # This exception will be raised if --help or invalid command line arguments
31
+ # are used. Currently streamlit prevents the program from exiting normally
32
+ # so we have to do a hard exit.
33
+ os._exit(e.code if isinstance(e.code, int) else 1)
34
+
35
+ deploy = args.deploy
36
 
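Usage note (assumed invocation): Streamlit consumes its own command-line flags, so script arguments like the ones above are passed after a `--` separator, e.g. `streamlit run Code_Browser.py -- --deploy False --cache_dir cache/`.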
37
  webapp_utils.load_widget_state()
38
 
 
43
 
44
  st.title("Codebook Features")
45
 
46
+ # --- Load model info and cache ---
47
+
48
  pretty_model_names = {
49
  "TinyStories-1Layer-21M#100ksteps_vcb_mlp": "TinyStories-1L-21M-MLP",
50
+ "TinyStories-1Layer-21M_ccb_attn_preproj": "TinyStories 1 Layer Attention Codebook",
51
+ "TinyStories-33M_ccb_attn_preproj": "TinyStories 4 Layer Attention Codebook",
52
+ "TinyStories-1Layer-21M_vcb_mlp": "TinyStories 1 Layer MLP Codebook",
53
  }
54
  orig_model_name = {v: k for k, v in pretty_model_names.items()}
55
 
56
+ base_cache_dir = args.cache_dir
57
  dirs = glob.glob(base_cache_dir + "models/*/")
58
  model_name_options = [d.split("/")[-2].split("_")[:-2] for d in dirs]
59
  model_name_options = ["_".join(m) for m in model_name_options]
 
67
  key=webapp_utils.persist("model_name"),
68
  )
69
  model_name = orig_model_name.get(p_model_name, p_model_name)
70
+ is_fsm = "FSM" in p_model_name
71
 
72
  codes_cache_path = base_cache_dir + f"models/{model_name}_*"
73
  dirs = glob.glob(codes_cache_path)
74
  dirs.sort(key=os.path.getmtime)
75
 
76
  # session states
 
77
  codes_cache_path = dirs[-1] + "/"
78
 
79
+ model_info = utils.ModelInfoForWebapp.load(codes_cache_path)
80
  num_codes = model_info.num_codes
81
  num_layers = model_info.n_layers
82
  num_heads = model_info.n_heads
83
+ cb_at = model_info.cb_at
84
+ gcb = model_info.gcb
85
+ gcb = "_gcb" if gcb else ""
86
+ is_attn = "attn" in cb_at
87
  dataset_cache_path = base_cache_dir + f"datasets/{model_info.dataset_name}/"
88
 
89
  (
 
94
  act_count_ft_tkns,
95
  metrics,
96
  ) = webapp_utils.load_code_search_cache(codes_cache_path, dataset_cache_path)
97
+ seq_len = len(tokens_str[0])
98
  metric_keys = ["eval_loss", "eval_accuracy", "eval_dead_code_fraction"]
99
  metrics = {k: v for k, v in metrics.items() if k.split("/")[0] in metric_keys}
100
 
101
+ # --- Set the session states ---
102
+
103
  st.session_state["model_name_id"] = model_name
104
  st.session_state["cb_acts"] = cb_acts
105
  st.session_state["tokens_text"] = tokens_text
 
107
  st.session_state["act_count_ft_tkns"] = act_count_ft_tkns
108
 
109
  st.session_state["num_codes"] = num_codes
110
+ st.session_state["gcb"] = gcb
111
  st.session_state["cb_at"] = cb_at
112
  st.session_state["is_attn"] = is_attn
113
+ st.session_state["seq_len"] = seq_len
114
 
115
+
116
+ if not deploy:
117
  st.markdown("## Metrics")
118
  # hide metrics by default
119
  if st.checkbox("Show Model Metrics"):
 
122
  st.markdown("## Demo Codes")
123
  demo_codes_desc = (
124
  "This section contains codes that we've found to be interpretable along "
125
+ "with a description of the feature we think they are capturing. "
126
  "Click on the πŸ” search button for a code to see the tokens that code activates on."
127
  )
128
  st.write(demo_codes_desc)
 
173
  continue
174
  if skip:
175
  continue
176
+ code_info = utils.CodeInfo.from_str(code_txt, regex=code_regex)
177
  comp_info = f"layer{code_info.layer}_{f'head{code_info.head}' if code_info.head is not None else ''}"
178
  button_key = (
179
  f"demo_search_code{code_info.code}_layer{code_info.layer}_desc-{code_info.description}"
 
196
  cols[-1].write(code_desc)
197
  skip = True
198
 
199
+ # --- Code Search ---
200
 
201
  st.markdown("## Code Search")
202
+ code_search_desc = (
203
+ "If you want to find whether the codebooks model has captured a relevant features from the data,"
204
+ " you can specify a regex pattern for your feature and find whether any code activating on the regex pattern"
205
+ " exists. The first group in the regex pattern is the token that the code activates on. If the group contains"
206
+ " multiple tokens, we search for codes that will activate on the first token in the group followed by the"
207
+ " subsequent tokens in the group. For example, the search term 'New (York)' will try to find codes that"
208
+ " activate on the bigram feature 'New York' at the York token."
209
  )
 
210
 
211
+ if st.checkbox("Search with Regex"):
212
+ st.write(code_search_desc)
213
+ regex_pattern = st.text_input(
214
+ "Enter a regex pattern",
215
+ help="Wrap code token in the first group. E.g. New (York)",
216
+ key="regex_pattern",
 
217
  )
218
+ # topk = st.slider("Top K", 1, 20, 10)
219
+ prec_col, sort_col = st.columns(2)
220
+ prec_threshold = prec_col.slider(
221
+ "Precision Threshold",
222
+ 0.0,
223
+ 1.0,
224
+ 0.9,
225
+ help="Shows codes with precision on the regex pattern above the threshold.",
226
  )
227
+ sort_by_options = ["Precision", "Recall", "Num Acts"]
228
+ sort_by_name = sort_col.radio(
229
+ "Sort By",
230
+ sort_by_options,
231
+ index=0,
232
+ horizontal=True,
233
+ help="Sorts the codes by the selected metric.",
234
  )
235
+ sort_by = sort_by_options.index(sort_by_name)
236
+
237
+ @st.cache_data(ttl=3600)
238
+ def get_codebook_wise_codes_for_regex(
239
+ regex_pattern, prec_threshold, gcb, model_name
240
+ ):
241
+ """Get codebook wise codes for a given regex pattern."""
242
+ assert model_name is not None # required for loading from correct cache data
243
+ return code_search_utils.get_codes_from_pattern(
244
+ regex_pattern,
245
+ tokens_text,
246
+ token_byte_pos,
247
+ cb_acts,
248
+ act_count_ft_tkns,
249
+ gcb=gcb,
250
+ topk=8,
251
+ prec_threshold=prec_threshold,
 
252
  )
253
 
254
+ if regex_pattern:
255
+ codebook_wise_codes, re_token_matches = get_codebook_wise_codes_for_regex(
256
+ regex_pattern,
257
+ prec_threshold,
258
+ gcb,
259
+ model_name,
260
+ )
261
  st.markdown(
262
+ f"Found <span style='color:green;'>{re_token_matches}</span> matches",
263
  unsafe_allow_html=True,
264
  )
265
+ num_search_cols = 7 if is_attn else 6
266
+ non_deploy_offset = 0
267
+ if not deploy:
268
+ non_deploy_offset = 1
269
+ num_search_cols += non_deploy_offset
270
+
271
+ cols = st.columns(num_search_cols)
272
+
273
+ cols[0].markdown("Search", help="Button to see token activations for the code.")
274
+ cols[1].write("Layer")
275
+ if is_attn:
276
+ cols[2].write("Head")
277
+ cols[-4 - non_deploy_offset].write("Code")
278
+ cols[-3 - non_deploy_offset].write("Precision")
279
+ cols[-2 - non_deploy_offset].write("Recall")
280
+ cols[-1 - non_deploy_offset].markdown(
281
+ "Num Acts",
282
+ help="Number of tokens that the code activates on in the acts dataset.",
283
+ )
284
+ if not deploy:
285
+ cols[-1].markdown(
286
+ "Save to Demos",
287
+ help="Button to save the code to demos along with the regex pattern.",
288
+ )
289
+ all_codes = codebook_wise_codes.items()
290
+ all_codes = [
291
+ (cb_name, code_pr_info)
292
+ for cb_name, code_pr_infos in all_codes
293
+ for code_pr_info in code_pr_infos
294
+ ]
295
+ all_codes = sorted(all_codes, key=lambda x: x[1][1 + sort_by], reverse=True)
296
+ for cb_name, (code, prec, rec, code_acts) in all_codes:
297
+ layer_head = cb_name.split("_")
298
+ layer = layer_head[0][5:]
299
+ head = layer_head[1][4:] if len(layer_head) > 1 else None
300
+ button_key = f"search_code{code}_layer{layer}" + (
301
+ f"head{head}" if head is not None else ""
302
+ )
303
+ cols = st.columns(num_search_cols)
304
+ extra_args = {
305
+ "prec": prec,
306
+ "recall": rec,
307
+ "num_acts": code_acts,
308
+ "regex": regex_pattern,
309
+ }
310
+ button_clicked = cols[0].button("🔍", key=button_key)
311
+ if button_clicked:
312
+ webapp_utils.set_ct_acts(code, layer, head, extra_args, is_attn)
313
+ cols[1].write(layer)
314
+ if is_attn:
315
+ cols[2].write(head)
316
+ cols[-4 - non_deploy_offset].write(code)
317
+ cols[-3 - non_deploy_offset].write(f"{prec*100:.2f}%")
318
+ cols[-2 - non_deploy_offset].write(f"{rec*100:.2f}%")
319
+ cols[-1 - non_deploy_offset].write(str(code_acts))
320
+ if not deploy:
321
+ webapp_utils.add_save_code_button(
322
+ demo_file_path,
323
+ num_acts=code_acts,
324
+ save_regex=True,
325
+ prec=prec,
326
+ recall=rec,
327
+ button_st_container=cols[-1],
328
+ button_key_suffix=f"_code{code}_layer{layer}_head{head}",
329
+ )
330
+
331
+ if len(all_codes) == 0:
332
+ st.markdown(
333
+ f"""
334
+ <div style="font-size: 1.0rem; color: red;">
335
+ No codes found for pattern {regex_pattern} at precision threshold: {prec_threshold}
336
+ </div>
337
+ """,
338
+ unsafe_allow_html=True,
339
+ )
340
 
341
+ # --- Display Code Token Activations ---
342
 
343
  st.markdown("## Code Token Activations")
344
 
345
+ filter_codes = st.checkbox("Show filters", key="filter_codes", value=True)
346
  act_range, layer_code_acts = None, None
347
  if filter_codes:
348
  act_range = st.slider(
349
+ "Minimum number of activations",
350
  0,
351
  10_000,
352
+ 100,
353
  key="ct_act_range",
354
  help="Filter codes by the number of tokens they activate on.",
355
  )
 
400
  head,
401
  ctx_size,
402
  num_examples,
403
+ is_fsm=is_fsm,
404
  )
405
 
406
  st.write(
 
408
  f"Activates on {acts_count[0]} tokens on the acts dataset",
409
  )
410
 
411
+ if not deploy:
412
  webapp_utils.add_save_code_button(
413
  demo_file_path,
414
  acts_count[0],
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Codebook Features
3
- emoji: 👀
4
  colorFrom: gray
5
- colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.25.0
8
  app_file: Code_Browser.py
 
1
  ---
2
  title: Codebook Features
3
+ emoji: 📚
4
  colorFrom: gray
5
+ colorTo: blue
6
  sdk: streamlit
7
  sdk_version: 1.25.0
8
  app_file: Code_Browser.py
code_search_utils.py CHANGED
@@ -2,15 +2,11 @@
2
 
3
  import pickle
4
  import re
5
- from dataclasses import dataclass
6
- from typing import Optional
7
 
8
  import numpy as np
9
  import torch
10
  from tqdm import tqdm
11
 
12
- import utils
13
-
14
 
15
  def load_dataset_cache(cache_base_path):
16
  """Load cache files required for dataset from `cache_base_path`."""
@@ -31,28 +27,73 @@ def load_code_search_cache(cache_base_path):
31
  return cb_acts, act_count_ft_tkns, metrics
32
 
33
 
34
- def search_re(re_pattern, tokens_text):
35
- """Get list of (example_id, token_pos) where re_pattern matches in tokens_text."""
36
- # TODO: ensure that parantheses are not escaped
37
  if re_pattern.find("(") == -1:
38
  re_pattern = f"({re_pattern})"
39
- return [
40
  (i, finditer.span(1)[0])
41
  for i, text in enumerate(tokens_text)
42
  for finditer in re.finditer(re_pattern, text)
43
  if finditer.span(1)[0] != finditer.span(1)[1]
44
  ]
 
 
 
45
 
46
 
47
  def byte_id_to_token_pos_id(example_byte_id, token_byte_pos):
48
- """Get (example_id, token_pos_id) for given (example_id, byte_id)."""
49
  example_id, byte_id = example_byte_id
50
  index = np.searchsorted(token_byte_pos[example_id], byte_id, side="right")
51
  return (example_id, index)
52
 
53
 
54
- def get_code_pr(token_pos_ids, codebook_acts, cb_act_counts=None):
55
- """Get codes, prec, recall for given token_pos_ids and codebook_acts."""
56
  codes = np.array(
57
  [
58
  codebook_acts[example_id][token_pos_id]
@@ -76,46 +117,64 @@ def get_code_pr(token_pos_ids, codebook_acts, cb_act_counts=None):
76
  return codes, prec, recall, code_acts
77
 
78
 
79
- def get_neuron_pr(
80
- token_pos_ids, recall, neuron_acts_by_ex, neuron_sorted_acts, topk=10
81
  ):
82
- """Get codes, prec, recall for given token_pos_ids and codebook_acts."""
83
- # check if neuron_acts_by_ex is a torch tensor
84
  if isinstance(neuron_acts_by_ex, torch.Tensor):
85
- re_neuron_acts = torch.stack(
86
  [
87
  neuron_acts_by_ex[example_id, token_pos_id]
88
  for example_id, token_pos_id in token_pos_ids
89
  ],
90
  dim=-1,
91
  ) # (layers, 2, dim_size, matches)
92
- re_neuron_acts = torch.sort(re_neuron_acts, dim=-1).values
93
  else:
94
- re_neuron_acts = np.stack(
95
  [
96
  neuron_acts_by_ex[example_id, token_pos_id]
97
  for example_id, token_pos_id in token_pos_ids
98
  ],
99
  axis=-1,
100
  ) # (layers, 2, dim_size, matches)
101
- re_neuron_acts.sort(axis=-1)
102
- re_neuron_acts = torch.from_numpy(re_neuron_acts)
103
- # re_neuron_acts = re_neuron_acts[:, :, :, -int(recall * re_neuron_acts.shape[-1]) :]
104
- print("Examples for recall", recall, ":", int(recall * re_neuron_acts.shape[-1]))
105
- act_thresh = re_neuron_acts[:, :, :, -int(recall * re_neuron_acts.shape[-1])]
106
- # binary search act_thresh in neuron_sorted_acts
107
  assert neuron_sorted_acts.shape[:-1] == act_thresh.shape
108
  prec_den = torch.searchsorted(neuron_sorted_acts, act_thresh.unsqueeze(-1))
109
  prec_den = prec_den.squeeze(-1)
110
  prec_den = neuron_sorted_acts.shape[-1] - prec_den
111
- prec = int(recall * re_neuron_acts.shape[-1]) / prec_den
112
  assert (
113
- prec.shape == re_neuron_acts.shape[:-1]
114
- ), f"{prec.shape} != {re_neuron_acts.shape[:-1]}"
115
 
116
  best_neuron_idx = np.unravel_index(prec.argmax(), prec.shape)
117
  best_prec = prec[best_neuron_idx]
118
- print("max prec:", best_prec)
119
  best_neuron_act_thresh = act_thresh[best_neuron_idx].item()
120
  best_neuron_acts = neuron_acts_by_ex[
121
  :, :, best_neuron_idx[0], best_neuron_idx[1], best_neuron_idx[2]
@@ -126,20 +185,20 @@ def get_neuron_pr(
126
  return best_prec, best_neuron_acts, best_neuron_idx
127
 
128
 
129
- def convert_to_adv_name(name, cb_at, ccb=""):
130
- """Convert layer0_head0 to layer0_attn_preproj_ccb0."""
131
- if ccb:
132
  layer, head = name.split("_")
133
- return layer + f"_{cb_at}_ccb" + head[4:]
134
  else:
135
  return layer + "_" + cb_at
136
 
137
 
138
- def convert_to_base_name(name, ccb=""):
139
- """Convert layer0_attn_preproj_ccb0 to layer0_head0."""
140
  split_name = name.split("_")
141
  layer, head = split_name[0], split_name[-1][3:]
142
- if "ccb" in name:
143
  return layer + "_head" + head
144
  else:
145
  return layer
@@ -156,7 +215,7 @@ def get_layer_head_from_base_name(name):
156
 
157
 
158
  def get_layer_head_from_adv_name(name):
159
- """Convert layer0_attn_preproj_ccb0 to 0, 0."""
160
  base_name = convert_to_base_name(name)
161
  layer, head = get_layer_head_from_base_name(base_name)
162
  return layer, head
@@ -168,12 +227,39 @@ def get_codes_from_pattern(
168
  token_byte_pos,
169
  cb_acts,
170
  act_count_ft_tkns,
171
- ccb="",
172
  topk=5,
173
  prec_threshold=0.5,
 
174
  ):
175
- """Fetch codes from a given regex pattern."""
176
- byte_ids = search_re(re_pattern, tokens_text)
177
  token_pos_ids = [
178
  byte_id_to_token_pos_id(ex_byte_id, token_byte_pos) for ex_byte_id in byte_ids
179
  ]
@@ -181,8 +267,8 @@ def get_codes_from_pattern(
181
  re_token_matches = len(token_pos_ids)
182
  codebook_wise_codes = {}
183
  for cb_name, cb in tqdm(cb_acts.items()):
184
- base_cb_name = convert_to_base_name(cb_name, ccb=ccb)
185
- codes, prec, recall, code_acts = get_code_pr(
186
  token_pos_ids,
187
  cb,
188
  cb_act_counts=act_count_ft_tkns[base_cb_name],
@@ -203,15 +289,49 @@ def get_neurons_from_pattern(
203
  neuron_acts_by_ex,
204
  neuron_sorted_acts,
205
  recall_threshold,
 
206
  ):
207
- """Fetch the best neuron (with act thresh given by recall) from a given regex pattern."""
208
- byte_ids = search_re(re_pattern, tokens_text)
209
  token_pos_ids = [
210
  byte_id_to_token_pos_id(ex_byte_id, token_byte_pos) for ex_byte_id in byte_ids
211
  ]
212
  token_pos_ids = np.unique(token_pos_ids, axis=0)
213
  re_token_matches = len(token_pos_ids)
214
- best_prec, best_neuron_acts, best_neuron_idx = get_neuron_pr(
215
  token_pos_ids,
216
  recall_threshold,
217
  neuron_acts_by_ex,
@@ -226,74 +346,58 @@ def compare_codes_with_neurons(
226
  token_byte_pos,
227
  neuron_acts_by_ex,
228
  neuron_sorted_acts,
 
229
  ):
230
- """Compare codes with neurons."""
231
  assert isinstance(neuron_acts_by_ex, np.ndarray)
232
  (
233
- all_best_prec,
234
  all_best_neuron_acts,
235
  all_best_neuron_idxs,
236
  all_re_token_matches,
237
  ) = zip(
238
  *[
239
  get_neurons_from_pattern(
240
- code_info.re_pattern,
241
  tokens_text,
242
  token_byte_pos,
243
  neuron_acts_by_ex,
244
  neuron_sorted_acts,
245
  code_info.recall,
 
246
  )
247
- for code_info in tqdm(range(len(best_codes_info)))
248
  ],
249
  strict=True,
250
  )
251
- code_best_precs = np.array(
252
- [code_info.prec for code_info in range(len(best_codes_info))]
253
- )
254
- codes_better_than_neurons = code_best_precs > np.array(all_best_prec)
255
- return codes_better_than_neurons.mean()
256
-
257
-
258
- def get_code_info_pr_from_str(code_txt, regex):
259
- """Extract code info fields from string."""
260
- code_txt = code_txt.strip()
261
- code_txt = code_txt.split(", ")
262
- code_txt = dict(txt.split(": ") for txt in code_txt)
263
- return utils.CodeInfo(**code_txt)
264
-
265
-
266
- @dataclass
267
- class ModelInfoForWebapp:
268
- """Model info for webapp."""
269
-
270
- model_name: str
271
- pretrained_path: str
272
- dataset_name: str
273
- num_codes: int
274
- cb_at: str
275
- ccb: str
276
- n_layers: int
277
- n_heads: Optional[int] = None
278
- seed: int = 42
279
- max_samples: int = 2000
280
-
281
- def __post_init__(self):
282
- """Convert to correct types."""
283
- self.num_codes = int(self.num_codes)
284
- self.n_layers = int(self.n_layers)
285
- if self.n_heads == "None":
286
- self.n_heads = None
287
- elif self.n_heads is not None:
288
- self.n_heads = int(self.n_heads)
289
- self.seed = int(self.seed)
290
- self.max_samples = int(self.max_samples)
291
-
292
-
293
- def parse_model_info(path):
294
- """Parse model info from path."""
295
- with open(path + "info.txt", "r") as f:
296
- lines = f.readlines()
297
- lines = dict(line.strip().split(": ") for line in lines)
298
- return ModelInfoForWebapp(**lines)
299
- return ModelInfoForWebapp(**lines)
 
2
 
3
  import pickle
4
  import re
 
 
5
 
6
  import numpy as np
7
  import torch
8
  from tqdm import tqdm
9
 
 
 
10
 
11
  def load_dataset_cache(cache_base_path):
12
  """Load cache files required for dataset from `cache_base_path`."""
 
27
  return cb_acts, act_count_ft_tkns, metrics
28
 
29
 
30
+ def search_re(re_pattern, tokens_text, at_odd_even=-1):
31
+ """Get list of (example_id, token_pos) where re_pattern matches in tokens_text.
32
+
33
+ Args:
34
+ re_pattern: regex pattern to search for.
35
+ tokens_text: list of example texts.
36
+ at_odd_even: to limit matches to odd or even positions only.
37
+ -1 (default): to not limit matches.
38
+ 0: to limit matches to even positions only.
39
+ 1: to limit matches to odd positions only.
40
+ This is useful for the TokFSM dataset when searching for states
41
+ since the first token of each state is always at an even position.
42
+ """
43
+ # TODO: ensure that parentheses are not escaped
44
+ assert at_odd_even in [-1, 0, 1], f"Invalid at_odd_even: {at_odd_even}"
45
  if re_pattern.find("(") == -1:
46
  re_pattern = f"({re_pattern})"
47
+ res = [
48
  (i, finditer.span(1)[0])
49
  for i, text in enumerate(tokens_text)
50
  for finditer in re.finditer(re_pattern, text)
51
  if finditer.span(1)[0] != finditer.span(1)[1]
52
  ]
53
+ if at_odd_even != -1:
54
+ res = [r for r in res if r[1] % 2 == at_odd_even]
55
+ return res
56
 
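A minimal sketch of how `search_re` behaves on made-up text (offsets and the even/odd filter follow the implementation above):

texts = ["the cat sat on the mat"]
# Wrap the token of interest in the first group; matches report (example_id, group start offset).
search_re(r"the (cat)", texts)         # [(0, 4)]
# 'a' occurs at offsets 5, 9, and 20; at_odd_even=0 keeps only even offsets.
search_re(r"a", texts, at_odd_even=0)  # [(0, 20)]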
57
 
58
  def byte_id_to_token_pos_id(example_byte_id, token_byte_pos):
59
+ """Convert byte position (or character position in a text) to its token position.
60
+
61
+ Used to convert the searched regex span to its token position.
62
+
63
+ Args:
64
+ example_byte_id: tuple of (example_id, byte_id) where byte_id is a
65
+ character's position in the text.
66
+ token_byte_pos: numpy array of shape (num_examples, seq_len) where
67
+ `token_byte_pos[example_id][token_pos]` is the byte position of
68
+ the token at `token_pos` in the example with `example_id`.
69
+
70
+ Returns:
71
+ (example_id, token_pos_id) tuple.
72
+ """
73
  example_id, byte_id = example_byte_id
74
  index = np.searchsorted(token_byte_pos[example_id], byte_id, side="right")
75
  return (example_id, index)
76
 
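A small worked example of the byte-to-token conversion (arrays made up; assumes `token_byte_pos` stores each token's end offset within the text):

import numpy as np

token_byte_pos = np.array([[3, 7, 11, 15]])      # token end offsets for example 0
# A regex match starting at byte 8 of example 0 falls inside token 2.
byte_id_to_token_pos_id((0, 8), token_byte_pos)  # (0, 2)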
77
 
78
+ def get_code_precision_and_recall(token_pos_ids, codebook_acts, cb_act_counts=None):
79
+ """Search for the codes that activate on the given `token_pos_ids`.
80
+
81
+ Args:
82
+ token_pos_ids: list of (example_id, token_pos_id) tuples.
83
+ codebook_acts: numpy array of activations of a codebook on a dataset with
84
+ shape (num_examples, seq_len, k_codebook).
85
+ cb_act_counts: numpy array of shape (num_codes,) where `cb_act_counts[code]`
86
+ is the number of times the code `code` is activated in the dataset.
87
+
88
+ Returns:
89
+ codes: numpy array of code ids sorted by their precision on the given `token_pos_ids`.
90
+ prec: numpy array where `prec[i]` is the precision of the code
91
+ `codes[i]` for the given `token_pos_ids`.
92
+ recall: numpy array where `recall[i]` is the recall of the code
93
+ `codes[i]` for the given `token_pos_ids`.
94
+ code_acts: numpy array where `code_acts[i]` is the number of times
95
+ the code `codes[i]` is activated in the dataset.
96
+ """
97
  codes = np.array(
98
  [
99
  codebook_acts[example_id][token_pos_id]
 
117
  return codes, prec, recall, code_acts
118
 
119
 
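For intuition, precision and recall here follow the usual definitions implied by the docstring above (numbers hypothetical): if a code fires on 50 tokens in the dataset and 40 of those are among 80 regex-matched tokens, then

prec = 40 / 50    # 0.8: fraction of the code's activations that land on the pattern
recall = 40 / 80  # 0.5: fraction of the pattern's tokens that the code covers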
120
+ def get_neuron_precision_and_recall(
121
+ token_pos_ids, recall, neuron_acts_by_ex, neuron_sorted_acts
122
  ):
123
+ """Get the neurons with the highest precision and recall for the given `token_pos_ids`.
124
+
125
+ Args:
126
+ token_pos_ids: list of token (example_id, token_pos_id) tuples from a dataset over which
127
+ the neurons with the highest precision and recall are to be found.
128
+ recall: recall threshold for the neurons (this determines their activation threshold).
129
+ neuron_acts_by_ex: numpy array of activations of all the attention and mlp output neurons
130
+ on a dataset with shape (num_examples, seq_len, num_layers, 2, dim_size).
131
+ The dimension of size 2 is there because we consider neurons from both attention and mlp.
132
+ neuron_sorted_acts: numpy array of sorted activations of all the attention and mlp output neurons
133
+ on a dataset with shape (num_layers, 2, dim_size, num_examples * seq_len).
134
+ This should be obtained using the `neuron_acts_by_ex` array by rearranging the first two
135
+ dimensions to the last dimensions and then sorting the last dimension.
136
+
137
+ Returns:
138
+ best_prec: highest precision amongst all the neurons for the given `token_pos_ids`.
139
+ best_neuron_acts: number of activations of the best neuron for the given `token_pos_ids`
140
+ based on the threshold determined by the `recall` argument.
141
+ best_neuron_idx: tuple of (layer, is_mlp, neuron_id) where `layer` is the layer number,
142
+ `is_mlp` is 0 if the neuron is from attention and 1 if the neuron is from mlp,
143
+ and `neuron_id` is the neuron's index in the layer.
144
+ """
145
  if isinstance(neuron_acts_by_ex, torch.Tensor):
146
+ neuron_acts_on_pattern = torch.stack(
147
  [
148
  neuron_acts_by_ex[example_id, token_pos_id]
149
  for example_id, token_pos_id in token_pos_ids
150
  ],
151
  dim=-1,
152
  ) # (layers, 2, dim_size, matches)
153
+ neuron_acts_on_pattern = torch.sort(neuron_acts_on_pattern, dim=-1).values
154
  else:
155
+ neuron_acts_on_pattern = np.stack(
156
  [
157
  neuron_acts_by_ex[example_id, token_pos_id]
158
  for example_id, token_pos_id in token_pos_ids
159
  ],
160
  axis=-1,
161
  ) # (layers, 2, dim_size, matches)
162
+ neuron_acts_on_pattern.sort(axis=-1)
163
+ neuron_acts_on_pattern = torch.from_numpy(neuron_acts_on_pattern)
164
+ act_thresh = neuron_acts_on_pattern[
165
+ :, :, :, -int(recall * neuron_acts_on_pattern.shape[-1])
166
+ ]
 
167
  assert neuron_sorted_acts.shape[:-1] == act_thresh.shape
168
  prec_den = torch.searchsorted(neuron_sorted_acts, act_thresh.unsqueeze(-1))
169
  prec_den = prec_den.squeeze(-1)
170
  prec_den = neuron_sorted_acts.shape[-1] - prec_den
171
+ prec = int(recall * neuron_acts_on_pattern.shape[-1]) / prec_den
172
  assert (
173
+ prec.shape == neuron_acts_on_pattern.shape[:-1]
174
+ ), f"{prec.shape} != {neuron_acts_on_pattern.shape[:-1]}"
175
 
176
  best_neuron_idx = np.unravel_index(prec.argmax(), prec.shape)
177
  best_prec = prec[best_neuron_idx]
 
178
  best_neuron_act_thresh = act_thresh[best_neuron_idx].item()
179
  best_neuron_acts = neuron_acts_by_ex[
180
  :, :, best_neuron_idx[0], best_neuron_idx[1], best_neuron_idx[2]
 
185
  return best_prec, best_neuron_acts, best_neuron_idx
186
 
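A hedged sketch of how `neuron_sorted_acts` might be precomputed from `neuron_acts_by_ex`, following the docstring above (move the example/position dimensions to the end, then sort; tensor sizes are dummies):

import torch

acts = torch.rand(4, 16, 2, 2, 8)   # (num_examples, seq_len, num_layers, 2, dim_size)
flat = acts.flatten(0, 1)           # (num_examples * seq_len, num_layers, 2, dim_size)
neuron_sorted_acts = flat.permute(1, 2, 3, 0).sort(dim=-1).values  # (num_layers, 2, dim_size, N)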
187
 
188
+ def convert_to_adv_name(name, cb_at, gcb=""):
189
+ """Convert layer0_head0 to layer0_attn_preproj_gcb0."""
190
+ if gcb:
191
  layer, head = name.split("_")
192
+ return layer + f"_{cb_at}_gcb" + head[4:]
193
  else:
194
  return layer + "_" + cb_at
195
 
196
 
197
+ def convert_to_base_name(name, gcb=""):
198
+ """Convert layer0_attn_preproj_gcb0 to layer0_head0."""
199
  split_name = name.split("_")
200
  layer, head = split_name[0], split_name[-1][3:]
201
+ if "gcb" in name:
202
  return layer + "_head" + head
203
  else:
204
  return layer
 
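An illustrative round trip of the two naming helpers (argument values assumed):

convert_to_adv_name("layer0_head3", cb_at="attn_preproj", gcb="_gcb")  # 'layer0_attn_preproj_gcb3'
convert_to_base_name("layer0_attn_preproj_gcb3", gcb="_gcb")           # 'layer0_head3'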
215
 
216
 
217
  def get_layer_head_from_adv_name(name):
218
+ """Convert layer0_attn_preproj_gcb0 to 0, 0."""
219
  base_name = convert_to_base_name(name)
220
  layer, head = get_layer_head_from_base_name(base_name)
221
  return layer, head
 
227
  token_byte_pos,
228
  cb_acts,
229
  act_count_ft_tkns,
230
+ gcb="",
231
  topk=5,
232
  prec_threshold=0.5,
233
+ at_odd_even=-1,
234
  ):
235
+ """Fetch codes that activate on a given regex pattern.
236
+
237
+ Retrieves at most `topk` codes that activate with precision above `prec_threshold`.
238
+
239
+ Args:
240
+ re_pattern: regex pattern to search for.
241
+ tokens_text: list of example texts of a dataset.
242
+ token_byte_pos: numpy array of shape (num_examples, seq_len) where
243
+ `token_byte_pos[example_id][token_pos]` is the byte position of
244
+ the token at `token_pos` in the example with `example_id`.
245
+ cb_acts: dict of codebook activations.
246
+ act_count_ft_tkns: dict mapping each codebook to its per-code counts of token activations on the dataset.
247
+ gcb: "_gcb" for grouped codebooks and "" for non-grouped codebooks.
248
+ topk: maximum number of codes to return per codebook.
249
+ prec_threshold: minimum precision required for a code to be returned.
250
+ at_odd_even: to limit matches to odd or even positions only.
251
+ -1 (default): to not limit matches.
252
+ 0: to limit matches to even positions only.
253
+ 1: to limit matches to odd positions only.
254
+ This is useful for the TokFSM dataset when searching for states
255
+ since the first token of each state is always at an even position.
256
+
257
+ Returns:
258
+ codebook_wise_codes: dict of codebook name to list of
259
+ (code, prec, recall, code_acts) tuples.
260
+ re_token_matches: number of tokens that match the regex pattern.
261
+ """
262
+ byte_ids = search_re(re_pattern, tokens_text, at_odd_even=at_odd_even)
263
  token_pos_ids = [
264
  byte_id_to_token_pos_id(ex_byte_id, token_byte_pos) for ex_byte_id in byte_ids
265
  ]
 
267
  re_token_matches = len(token_pos_ids)
268
  codebook_wise_codes = {}
269
  for cb_name, cb in tqdm(cb_acts.items()):
270
+ base_cb_name = convert_to_base_name(cb_name, gcb=gcb)
271
+ codes, prec, recall, code_acts = get_code_precision_and_recall(
272
  token_pos_ids,
273
  cb,
274
  cb_act_counts=act_count_ft_tkns[base_cb_name],
 
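A hedged usage sketch of `get_codes_from_pattern`, reusing the cache objects loaded in Code_Browser.py above:

codebook_wise_codes, n_matches = get_codes_from_pattern(
    r"New (York)", tokens_text, token_byte_pos, cb_acts, act_count_ft_tkns,
    gcb=gcb, topk=8, prec_threshold=0.9,
)
for cb_name, code_list in codebook_wise_codes.items():
    for code, prec, recall, code_acts in code_list:
        print(cb_name, code, f"{prec:.2f}", f"{recall:.2f}", code_acts)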
289
  neuron_acts_by_ex,
290
  neuron_sorted_acts,
291
  recall_threshold,
292
+ at_odd_even=-1,
293
  ):
294
+ """Fetch the highest precision neurons that activate on a given regex pattern.
295
+
296
+ The activation threshold for the neurons is determined by the `recall_threshold`.
297
+
298
+ Args:
299
+ re_pattern: regex pattern to search for.
300
+ tokens_text: list of example texts of a dataset.
301
+ token_byte_pos: numpy array of shape (num_examples, seq_len) where
302
+ `token_byte_pos[example_id][token_pos]` is the byte position of
303
+ the token at `token_pos` in the example with `example_id`.
304
+ neuron_acts_by_ex: numpy array of activations of all the attention and mlp output neurons
305
+ on a dataset with shape (num_examples, seq_len, num_layers, 2, dim_size).
306
+ The dimension of size 2 is there because we consider neurons from both attention and mlp.
307
+ neuron_sorted_acts: numpy array of sorted activations of all the attention and mlp output neurons
308
+ on a dataset with shape (num_layers, 2, dim_size, num_examples * seq_len).
309
+ This should be obtained using the `neuron_acts_by_ex` array by rearranging the first two
310
+ dimensions to the last dimensions and then sorting the last dimension.
311
+ recall_threshold: recall threshold for the neurons (this determines their activation threshold).
312
+ at_odd_even: to limit matches to odd or even positions only.
313
+ -1 (default): to not limit matches.
314
+ 0: to limit matches to even positions only.
315
+ 1: to limit matches to odd positions only.
316
+ This is useful for the TokFSM dataset when searching for states
317
+ since the first token of each state is always at an even position.
318
+
319
+ Returns:
320
+ best_prec: highest precision amongst all the neurons for the given `token_pos_ids`.
321
+ best_neuron_acts: number of activations of the best neuron for the given `token_pos_ids`
322
+ based on the threshold determined by the `recall_threshold` argument.
323
+ best_neuron_idx: tuple of (layer, is_mlp, neuron_id) where `layer` is the layer number,
324
+ `is_mlp` is 0 if the neuron is from attention and 1 if the neuron is from mlp,
325
+ and `neuron_id` is the neuron's index in the layer.
326
+ re_token_matches: number of tokens that match the regex pattern.
327
+ """
328
+ byte_ids = search_re(re_pattern, tokens_text, at_odd_even=at_odd_even)
329
  token_pos_ids = [
330
  byte_id_to_token_pos_id(ex_byte_id, token_byte_pos) for ex_byte_id in byte_ids
331
  ]
332
  token_pos_ids = np.unique(token_pos_ids, axis=0)
333
  re_token_matches = len(token_pos_ids)
334
+ best_prec, best_neuron_acts, best_neuron_idx = get_neuron_precision_and_recall(
335
  token_pos_ids,
336
  recall_threshold,
337
  neuron_acts_by_ex,
 
346
  token_byte_pos,
347
  neuron_acts_by_ex,
348
  neuron_sorted_acts,
349
+ at_odd_even=-1,
350
  ):
351
+ """Compare codes with the highest precision neurons on the regex pattern of the code.
352
+
353
+ Args:
354
+ best_codes_info: list of CodeInfo objects.
355
+ tokens_text: list of example texts of a dataset.
356
+ token_byte_pos: numpy array of shape (num_examples, seq_len) where
357
+ `token_byte_pos[example_id][token_pos]` is the byte position of
358
+ the token at `token_pos` in the example with `example_id`.
359
+ neuron_acts_by_ex: numpy array of activations of all the attention and mlp output neurons
360
+ on a dataset with shape (num_examples, seq_len, num_layers, 2, dim_size).
361
+ The dimension of size 2 is there because we consider neurons from both attention and mlp.
362
+ neuron_sorted_acts: numpy array of sorted activations of all the attention and mlp output neurons
363
+ on a dataset with shape (num_layers, 2, dim_size, num_examples * seq_len).
364
+ This should be obtained using the `neuron_acts_by_ex` array by rearranging the first two
365
+ dimensions to the last dimensions and then sorting the last dimension.
366
+ at_odd_even: to limit matches to odd or even positions only.
367
+ -1 (default): to not limit matches.
368
+ 0: to limit matches to even positions only.
369
+ 1: to limit matches to odd positions only.
370
+ This is useful for the TokFSM dataset when searching for states
371
+ since the first token of each state is always at an even position.
372
+
373
+ Returns:
374
+ codes_better_than_neurons: fraction of codes that have higher precision than the highest
375
+ precision neuron on the regex pattern of the code.
376
+ code_best_precs: array of the precision of each code in `best_codes_info`.
377
+ neuron_best_prec: array of the highest neuron precision on each code's regex pattern.
378
+ """
379
  assert isinstance(neuron_acts_by_ex, np.ndarray)
380
  (
381
+ neuron_best_prec,
382
  all_best_neuron_acts,
383
  all_best_neuron_idxs,
384
  all_re_token_matches,
385
  ) = zip(
386
  *[
387
  get_neurons_from_pattern(
388
+ code_info.regex,
389
  tokens_text,
390
  token_byte_pos,
391
  neuron_acts_by_ex,
392
  neuron_sorted_acts,
393
  code_info.recall,
394
+ at_odd_even=at_odd_even,
395
  )
396
+ for code_info in tqdm(best_codes_info)
397
  ],
398
  strict=True,
399
  )
400
+ neuron_best_prec = np.array(neuron_best_prec)
401
+ code_best_precs = np.array([code_info.prec for code_info in best_codes_info])
402
+ codes_better_than_neurons = code_best_precs > neuron_best_prec
403
+ return codes_better_than_neurons.mean(), code_best_precs, neuron_best_prec
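A hedged usage sketch of `compare_codes_with_neurons` (inputs assumed to be prepared as documented above):

frac_better, code_precs, neuron_precs = compare_codes_with_neurons(
    best_codes_info, tokens_text, token_byte_pos,
    neuron_acts_by_ex, neuron_sorted_acts,
)
print(f"{frac_better:.1%} of codes beat the best neuron on their own regex pattern")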
pages/Concept_Code.py CHANGED
@@ -21,7 +21,7 @@ tokens_text = st.session_state["tokens_text"]
21
  tokens_str = st.session_state["tokens_str"]
22
  cb_acts = st.session_state["cb_acts"]
23
  act_count_ft_tkns = st.session_state["act_count_ft_tkns"]
24
- ccb = st.session_state["ccb"]
25
 
26
 
27
  def get_example_concept_codes(example_id):
@@ -29,8 +29,8 @@ def get_example_concept_codes(example_id):
29
  token_pos_ids = [(example_id, i) for i in range(seq_len)]
30
  all_codes = []
31
  for cb_name, cb in cb_acts.items():
32
- base_cb_name = code_search_utils.convert_to_base_name(cb_name, ccb=ccb)
33
- codes, prec, rec, code_acts = code_search_utils.get_code_pr(
34
  token_pos_ids,
35
  cb,
36
  act_count_ft_tkns[base_cb_name],
@@ -112,7 +112,6 @@ concept_code_description = (
112
  )
113
  st.write(concept_code_description)
114
 
115
- # ex_col, p_col, r_col, trunc_col, sort_col = st.columns([1, 2, 2, 1, 1])
116
  ex_col, r_col, trunc_col, sort_col = st.columns([1, 1, 1, 1])
117
  example_id = ex_col.number_input(
118
  "Example ID",
@@ -121,14 +120,6 @@ example_id = ex_col.number_input(
121
  0,
122
  key="example_id",
123
  )
124
- # prec_threshold = p_col.slider(
125
- # "Precision Threshold",
126
- # 0.0,
127
- # 1.0,
128
- # 0.02,
129
- # key="prec",
130
- # help="Precision Threshold controls the specificity of the codes for the given example.",
131
- # )
132
  recall_threshold = r_col.slider(
133
  "Recall Threshold",
134
  0.0,
@@ -138,13 +129,13 @@ recall_threshold = r_col.slider(
138
  help="Recall Threshold is the minimum fraction of tokens in the example that the code must activate on.",
139
  )
140
  example_truncation = trunc_col.number_input(
141
- "Max Output Chars", 0, 10240, 1024, key="max_chars"
142
  )
143
  sort_by_options = ["Precision", "Recall", "Num Acts"]
144
  sort_by_name = sort_col.radio(
145
  "Sort By",
146
  sort_by_options,
147
- index=0,
148
  horizontal=True,
149
  help="Sorts the codes by the selected metric.",
150
  )
@@ -158,9 +149,6 @@ button = st.button(
158
  args=(example_id,),
159
  help="Find an example which has codes above the recall threshold.",
160
  )
161
- # if button:
162
- # find_next_example(st.session_state["example_id"])
163
-
164
 
165
  st.markdown("### Example Text")
166
  trunc_suffix = "..." if example_truncation < len(tokens_text[example_id]) else ""
 
21
  tokens_str = st.session_state["tokens_str"]
22
  cb_acts = st.session_state["cb_acts"]
23
  act_count_ft_tkns = st.session_state["act_count_ft_tkns"]
24
+ gcb = st.session_state["gcb"]
25
 
26
 
27
  def get_example_concept_codes(example_id):
 
29
  token_pos_ids = [(example_id, i) for i in range(seq_len)]
30
  all_codes = []
31
  for cb_name, cb in cb_acts.items():
32
+ base_cb_name = code_search_utils.convert_to_base_name(cb_name, gcb=gcb)
33
+ codes, prec, rec, code_acts = code_search_utils.get_code_precision_and_recall(
34
  token_pos_ids,
35
  cb,
36
  act_count_ft_tkns[base_cb_name],
 
112
  )
113
  st.write(concept_code_description)
114
 
 
115
  ex_col, r_col, trunc_col, sort_col = st.columns([1, 1, 1, 1])
116
  example_id = ex_col.number_input(
117
  "Example ID",
 
120
  0,
121
  key="example_id",
122
  )
123
  recall_threshold = r_col.slider(
124
  "Recall Threshold",
125
  0.0,
 
129
  help="Recall Threshold is the minimum fraction of tokens in the example that the code must activate on.",
130
  )
131
  example_truncation = trunc_col.number_input(
132
+ "Max Output Chars", 0, 102400, 1024, key="max_chars"
133
  )
134
  sort_by_options = ["Precision", "Recall", "Num Acts"]
135
  sort_by_name = sort_col.radio(
136
  "Sort By",
137
  sort_by_options,
138
+ index=1,
139
  horizontal=True,
140
  help="Sorts the codes by the selected metric.",
141
  )
 
149
  args=(example_id,),
150
  help="Find an example which has codes above the recall threshold.",
151
  )
152
 
153
  st.markdown("### Example Text")
154
  trunc_suffix = "..." if example_truncation < len(tokens_text[example_id]) else ""
utils.py CHANGED
@@ -1,4 +1,6 @@
1
  """Util functions for codebook features."""
 
 
2
  import re
3
  import typing
4
  from dataclasses import dataclass
@@ -57,11 +59,6 @@ class CodeInfo:
57
  if self.regex is not None:
58
  assert self.prec is not None and self.recall is not None
59
 
60
- def check_patch_info(self):
61
- """Check if the patch info is present."""
62
- # TODO: pos can be none for patching
63
- assert self.pos is not None and self.code_pos is not None
64
-
65
  def __repr__(self):
66
  """Return the string representation."""
67
  repr = f"CodeInfo(code={self.code}, layer={self.layer}, head={self.head}, cb_at={self.cb_at}"
@@ -76,6 +73,57 @@ class CodeInfo:
76
  repr += ")"
77
  return repr
78
79
 
80
  def logits_to_pred(logits, tokenizer, k=5):
81
  """Convert logits to top-k predictions."""
@@ -88,53 +136,6 @@ def logits_to_pred(logits, tokenizer, k=5):
88
  return [(topk_preds[i], probs[:, -1, i].item()) for i in range(len(topk_preds))]
89
 
90
 
91
- def patch_codebook_ids(
92
- corrupted_codebook_ids, hook, pos, cache, cache_pos=None, code_idx=None
93
- ):
94
- """Patch codebook ids with cached ids."""
95
- if cache_pos is None:
96
- cache_pos = pos
97
- if code_idx is None:
98
- corrupted_codebook_ids[:, pos] = cache[hook.name][:, cache_pos]
99
- else:
100
- for code_id in range(32):
101
- if code_id in code_idx:
102
- corrupted_codebook_ids[:, pos, code_id] = cache[hook.name][
103
- :, cache_pos, code_id
104
- ]
105
- else:
106
- corrupted_codebook_ids[:, pos, code_id] = -1
107
-
108
- return corrupted_codebook_ids
109
-
110
-
111
- def logits_to_ave_logit_diff(logits, answer_tokens, per_prompt=False):
112
- """Calculate the average logit difference between the answer and the other token."""
113
- # Only the final logits are relevant for the answer
114
- final_logits = logits[:, -1, :]
115
- answer_logits = final_logits.gather(dim=-1, index=answer_tokens)
116
- answer_logit_diff = answer_logits[:, 0] - answer_logits[:, 1]
117
- if per_prompt:
118
- return answer_logit_diff
119
- else:
120
- return answer_logit_diff.mean()
121
-
122
-
123
- def normalize_patched_logit_diff(
124
- patched_logit_diff,
125
- base_average_logit_diff,
126
- corrupted_average_logit_diff,
127
- ):
128
- """Normalize the patched logit difference."""
129
- # Subtract corrupted logit diff to measure the improvement,
130
- # divide by the total improvement from clean to corrupted to normalise
131
- # 0 means zero change, negative means actively made worse,
132
- # 1 means totally recovered clean performance, >1 means actively *improved* on clean performance
133
- return (patched_logit_diff - corrupted_average_logit_diff) / (
134
- base_average_logit_diff - corrupted_average_logit_diff
135
- )
136
-
137
-
138
  def features_to_tokens(cb_key, cb_acts, num_codes, code=None):
139
  """Return the set of token ids each codebook feature activates on."""
140
  codebook_ids = cb_acts[cb_key]
@@ -154,7 +155,6 @@ def features_to_tokens(cb_key, cb_acts, num_codes, code=None):
154
 
155
  def color_str(s: str, html: bool, color: Optional[str] = None):
156
  """Color the string for html or terminal."""
157
-
158
  if html:
159
  color = "DeepSkyBlue" if color is None else color
160
  return f"<span style='color:{color}'>{s}</span>"
@@ -163,7 +163,7 @@ def color_str(s: str, html: bool, color: Optional[str] = None):
163
  return colored(s, color)
164
 
165
 
166
- def color_tokens_automata(tokens, color_idx, html=False):
167
  """Separate states with a dash and color red the tokens in color_idx."""
168
  ret_string = ""
169
  itr_over_color_idx = 0
@@ -224,31 +224,48 @@ def prepare_example_print(
224
  return example_output
225
 
226
 
227
- def tkn_print(
228
- ll,
229
  tokens,
230
- separate_states,
231
  n=3,
232
  max_examples=100,
233
  randomize=False,
234
  html=False,
235
  return_example_list=False,
236
  ):
237
- """Format and prints the tokens in ll."""
238
  if randomize:
239
  raise NotImplementedError("Randomize not yet implemented.")
240
- indices = range(len(ll))
241
  print_output = [] if return_example_list else ""
242
- curr_ex = ll[0][0]
243
  total_examples = 0
244
  tokens_to_color = []
245
- color_fn = color_tokens_automata if separate_states else partial(color_tokens, n=n)
246
  for idx in indices:
247
  if total_examples > max_examples:
248
  break
249
- i, j = ll[idx]
250
 
251
  if i != curr_ex and curr_ex >= 0:
 
252
  curr_ex_output = prepare_example_print(
253
  curr_ex,
254
  tokens[curr_ex],
@@ -275,17 +292,16 @@ def tkn_print(
275
  print_output.append((curr_ex_output, len(tokens_to_color)))
276
  else:
277
  print_output += curr_ex_output
278
- asterisk_str = "********************************************"
279
- print_output += color_str(asterisk_str, html, "green")
280
  total_examples += 1
281
 
282
  return print_output
283
 
284
 
285
- def print_ft_tkns(
286
  ft_tkns,
287
  tokens,
288
- separate_states=False,
289
  n=3,
290
  start=0,
291
  stop=1000,
@@ -301,17 +317,17 @@ def print_ft_tkns(
301
  num_tokens = len(tokens) * len(tokens[0])
302
  codes, token_act_freqs, token_acts = [], [], []
303
  for i in indices:
304
- tkns = ft_tkns[i]
305
- freq = (len(tkns), 100 * len(tkns) / num_tokens)
306
  if freq_filter is not None and freq[1] > freq_filter:
307
  continue
308
  codes.append(i)
309
  token_act_freqs.append(freq)
310
- if len(tkns) > 0:
311
- tkn_acts = tkn_print(
312
- tkns,
313
  tokens,
314
- separate_states,
315
  n=n,
316
  max_examples=max_examples,
317
  randomize=randomize,
@@ -340,149 +356,59 @@ def patch_in_codes(run_cb_ids, hook, pos, code, code_pos=None):
340
  return run_cb_ids
341
 
342
 
343
- def get_cb_layer_name(cb_at, layer_idx, head_idx=None):
344
  """Get the layer name used to store hooks/cache."""
345
- if head_idx is None:
346
- return f"blocks.{layer_idx}.{cb_at}.codebook_layer.hook_codebook_ids"
347
- else:
348
- return f"blocks.{layer_idx}.{cb_at}.codebook_layer.codebook.{head_idx}.hook_codebook_ids"
349
-
350
-
351
- def get_cb_layer_names(layer, patch_types, n_heads):
352
- """Get the layer names used to store hooks/cache."""
353
- layer_names = []
354
- attn_added, mlp_added = False, False
355
- if "attn_out" in patch_types:
356
- attn_added = True
357
- for head in range(n_heads):
358
- layer_names.append(
359
- f"blocks.{layer}.attn.codebook_layer.codebook.{head}.hook_codebook_ids"
360
- )
361
- if "mlp_out" in patch_types:
362
- mlp_added = True
363
- layer_names.append(f"blocks.{layer}.mlp.codebook_layer.hook_codebook_ids")
364
-
365
- for patch_type in patch_types:
366
- # match patch_type of the pattern attn_\d_head_\d
367
- attn_head = re.match(r"attn_(\d)_head_(\d)", patch_type)
368
- if (not attn_added) and attn_head and attn_head[1] == str(layer):
369
- layer_names.append(
370
- f"blocks.{layer}.attn.codebook_layer.codebook.{attn_head[2]}.hook_codebook_ids"
371
- )
372
- mlp = re.match(r"mlp_(\d)", patch_type)
373
- if (not mlp_added) and mlp and mlp[1] == str(layer):
374
- layer_names.append(f"blocks.{layer}.mlp.codebook_layer.hook_codebook_ids")
375
-
376
- return layer_names
377
-
378
-
379
- def cb_layer_name_to_info(layer_name):
380
- """Get the layer info from the layer name."""
381
- layer_name_split = layer_name.split(".")
382
- layer_idx = int(layer_name_split[1])
383
- cb_at = layer_name_split[2]
384
- if cb_at == "mlp":
385
- head_idx = None
386
  else:
387
- head_idx = int(layer_name_split[5])
388
- return cb_at, layer_idx, head_idx
389
-
390
-
391
- def get_hooks(code, cb_at, layer_idx, head_idx=None, pos=None):
392
- """Get the hooks for the codebook features."""
393
- hook_fns = [
394
- partial(patch_in_codes, pos=pos, code=code[i]) for i in range(len(code))
395
- ]
396
- return [
397
- (get_cb_layer_name(cb_at[i], layer_idx[i], head_idx[i]), hook_fns[i])
398
- for i in range(len(code))
399
- ]
400
-
401
-
402
- def run_with_codes(
403
- input, cb_model, code, cb_at, layer_idx, head_idx=None, pos=None, prepend_bos=True
404
- ):
405
- """Run the model with the codebook features patched in."""
406
- hook_fns = [
407
- partial(patch_in_codes, pos=pos, code=code[i]) for i in range(len(code))
408
- ]
409
- cb_model.reset_codebook_metrics()
410
- cb_model.reset_hook_kwargs()
411
- fwd_hooks = [
412
- (get_cb_layer_name(cb_at[i], layer_idx[i], head_idx[i]), hook_fns[i])
413
- for i in range(len(cb_at))
414
- ]
415
- with cb_model.hooks(fwd_hooks, [], True, False) as hooked_model:
416
- patched_logits, patched_cache = hooked_model.run_with_cache(
417
- input, prepend_bos=prepend_bos
418
- )
419
- return patched_logits, patched_cache
420
-
421
-
422
- def in_hook_list(list_of_arg_tuples, layer, head=None):
423
- """Check if the component specified by `layer` and `head` is in the `list_of_arg_tuples`."""
424
- # if head is not provided, then checks in MLP
425
- for arg_tuple in list_of_arg_tuples:
426
- if head is None:
427
- if arg_tuple.cb_at == "mlp" and arg_tuple.layer == layer:
428
- return True
429
- else:
430
- if (
431
- arg_tuple.cb_at == "attn"
432
- and arg_tuple.layer == layer
433
- and arg_tuple.head == head
434
- ):
435
- return True
436
- return False
437
 
438
 
439
- # def generate_with_codes(input, code, cb_at, layer_idx, head_idx=None, pos=None, disable_other_comps=False):
440
- def generate_with_codes(
441
  input,
442
  cb_model,
 
 
443
  list_of_code_infos=(),
444
- disable_other_comps=False,
445
- automata=None,
446
- generate_kwargs=None,
447
  ):
448
- """Model's generation with the codebook features patched in."""
449
- if generate_kwargs is None:
450
- generate_kwargs = {}
 
 
 
 
451
  hook_fns = [
452
- partial(patch_in_codes, pos=tupl.pos, code=tupl.code)
453
  for tupl in list_of_code_infos
454
  ]
455
  fwd_hooks = [
456
- (get_cb_layer_name(tupl.cb_at, tupl.layer, tupl.head), hook_fns[i])
457
  for i, tupl in enumerate(list_of_code_infos)
458
  ]
459
  cb_model.reset_hook_kwargs()
460
- if disable_other_comps:
461
- for layer, cb in cb_model.all_codebooks.items():
462
- for head_idx, head in enumerate(cb[0].codebook):
463
- if not in_hook_list(list_of_code_infos, layer, head_idx):
464
- head.set_hook_kwargs(
465
- disable_topk=1, disable_for_tkns=[-1], keep_k_codes=False
466
- )
467
- if not in_hook_list(list_of_code_infos, layer):
468
- cb[1].set_hook_kwargs(
469
- disable_topk=1, disable_for_tkns=[-1], keep_k_codes=False
470
- )
471
  with cb_model.hooks(fwd_hooks, [], True, False) as hooked_model:
472
- gen = hooked_model.generate(input, **generate_kwargs)
473
- return automata.seq_to_traj(gen)[0] if automata is not None else gen
474
 
475
 
476
- def kl_div(logits1, logits2, pos=-1, reduction="batchmean"):
477
- """Calculate the KL divergence between the logits at `pos`."""
478
- logits1_last, logits2_last = logits1[:, pos, :], logits2[:, pos, :]
479
- # calculate kl divergence between clean and mod logits last
480
- return F.kl_div(
481
- F.log_softmax(logits1_last, dim=-1),
482
- F.log_softmax(logits2_last, dim=-1),
483
- log_target=True,
484
- reduction=reduction,
 
 
 
 
 
485
  )
 
486
 
487
 
488
  def JSD(logits1, logits2, pos=-1, reduction="batchmean"):
@@ -511,11 +437,27 @@ def JSD(logits1, logits2, pos=-1, reduction="batchmean"):
511
  return 0.5 * loss
512
 
513
 
514
- def residual_stream_patching_hook(resid_pre, hook, cache, position: int):
515
- """Patch in the codebook features at `position` from `cache`."""
516
- clean_resid_pre = cache[hook.name]
517
- resid_pre[:, position, :] = clean_resid_pre[:, position, :]
518
- return resid_pre
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
 
520
 
521
  def find_code_changes(cache1, cache2, pos=None):
@@ -525,8 +467,8 @@ def find_code_changes(cache1, cache2, pos=None):
525
  c1 = cache1[k][0, pos]
526
  c2 = cache2[k][0, pos]
527
  if not torch.all(c1 == c2):
528
- print(cb_layer_name_to_info(k), c1.tolist(), c2.tolist())
529
- print(cb_layer_name_to_info(k), c1.tolist(), c2.tolist())
530
 
531
 
532
  def common_codes_in_cache(cache_codes, threshold=0.0):
@@ -541,39 +483,52 @@ def common_codes_in_cache(cache_codes, threshold=0.0):
541
  return codes, counts
542
 
543
 
544
- def parse_code_info_string(
545
- info_str: str, cb_at="attn", pos=None, code_pos=-1
546
- ) -> CodeInfo:
547
- """Parse the code info string.
548
-
549
- The format of the `info_str` is:
550
- `code: 0, layer: 0, head: 0, occ_freq: 0.0, train_act_freq: 0.0`.
551
- """
552
- code, layer, head, occ_freq, train_act_freq = info_str.split(", ")
553
- code = int(code.split(": ")[1])
554
- layer = int(layer.split(": ")[1])
555
- head = int(head.split(": ")[1]) if head else None
556
- occ_freq = float(occ_freq.split(": ")[1])
557
- train_act_freq = float(train_act_freq.split(": ")[1])
558
- return CodeInfo(code, layer, head, pos=pos, code_pos=code_pos, cb_at=cb_at)
559
-
560
-
561
- def parse_concept_codes_string(info_str: str, pos=None, code_append=False):
562
- """Parse the concept codes string."""
563
  code_info_strs = info_str.strip().split("\n")
564
- concept_codes = []
 
565
  layer, head = None, None
566
- code_pos = "append" if code_append else -1
 
 
 
567
  for code_info_str in code_info_strs:
568
- concept_codes.append(
569
- parse_code_info_string(code_info_str, pos=pos, code_pos=code_pos)
 
 
 
 
 
570
  )
571
- if code_append:
572
  continue
573
- if layer == concept_codes[-1].layer and head == concept_codes[-1].head:
574
- code_pos -= 1
575
  else:
576
  code_pos = -1
577
- concept_codes[-1].code_pos = code_pos
578
- layer, head = concept_codes[-1].layer, concept_codes[-1].head
579
- return concept_codes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 """Util functions for codebook features."""
+
+import pathlib
 import re
 import typing
 from dataclasses import dataclass
 
         if self.regex is not None:
             assert self.prec is not None and self.recall is not None
 
     def __repr__(self):
         """Return the string representation."""
         repr = f"CodeInfo(code={self.code}, layer={self.layer}, head={self.head}, cb_at={self.cb_at}"
 
         repr += ")"
         return repr
 
+    @classmethod
+    def from_str(cls, code_txt, *args, **kwargs):
+        """Extract code info fields from string."""
+        code_txt = code_txt.strip().lower()
+        code_txt = code_txt.split(", ")
+        code_txt = dict(txt.split(": ") for txt in code_txt)
+        return cls(*args, **code_txt, **kwargs)
+
+
+@dataclass
+class ModelInfoForWebapp:
+    """Model info for webapp."""
+
+    model_name: str
+    pretrained_path: str
+    dataset_name: str
+    num_codes: int
+    cb_at: str
+    gcb: str
+    n_layers: int
+    n_heads: Optional[int] = None
+    seed: int = 42
+    max_samples: int = 2000
+
+    def __post_init__(self):
+        """Convert to correct types."""
+        self.num_codes = int(self.num_codes)
+        self.n_layers = int(self.n_layers)
+        if self.n_heads == "None":
+            self.n_heads = None
+        elif self.n_heads is not None:
+            self.n_heads = int(self.n_heads)
+        self.seed = int(self.seed)
+        self.max_samples = int(self.max_samples)
+
+    @classmethod
+    def load(cls, path):
+        """Parse model info from path."""
+        path = pathlib.Path(path)
+        with open(path / "info.txt", "r") as f:
+            lines = f.readlines()
+        lines = dict(line.strip().split(": ") for line in lines)
+        return cls(**lines)
+
+    def save(self, path):
+        """Save model info to path."""
+        path = pathlib.Path(path)
+        with open(path / "info.txt", "w") as f:
+            for k, v in self.__dict__.items():
+                f.write(f"{k}: {v}\n")
+
 
 def logits_to_pred(logits, tokenizer, k=5):
     """Convert logits to top-k predictions."""
 
     return [(topk_preds[i], probs[:, -1, i].item()) for i in range(len(topk_preds))]
 
 
 def features_to_tokens(cb_key, cb_acts, num_codes, code=None):
     """Return the set of token ids each codebook feature activates on."""
     codebook_ids = cb_acts[cb_key]
 
 def color_str(s: str, html: bool, color: Optional[str] = None):
     """Color the string for html or terminal."""
     if html:
         color = "DeepSkyBlue" if color is None else color
         return f"<span style='color:{color}'>{s}</span>"
 
     return colored(s, color)
 
 
+def color_tokens_tokfsm(tokens, color_idx, html=False):
     """Separate states with a dash and color red the tokens in color_idx."""
     ret_string = ""
     itr_over_color_idx = 0
 
     return example_output
 
 
+def print_token_activations_of_code(
+    code_act_by_pos,
     tokens,
+    is_fsm=False,
     n=3,
     max_examples=100,
     randomize=False,
     html=False,
     return_example_list=False,
 ):
+    """Print the context with the tokens that a code activates on.
+
+    Args:
+        code_act_by_pos: list of (example_id, token_pos_id) tuples specifying
+            the token positions that a code activates on in a dataset.
+        tokens: list of tokens of a dataset.
+        is_fsm: whether the dataset is the TokFSM dataset.
+        n: context to print around each side of a token that the code activates on.
+        max_examples: maximum number of examples to print.
+        randomize: whether to randomize the order of examples.
+        html: whether to format the output for html instead of the terminal.
+        return_example_list: whether to return the output as a list of examples or as a single string.
+
+    Returns:
+        A single formatted string of all examples if `return_example_list` is False,
+        otherwise a list of (example_string, num_tokens_colored) tuples, one per example.
+    """
     if randomize:
         raise NotImplementedError("Randomize not yet implemented.")
+    indices = range(len(code_act_by_pos))
     print_output = [] if return_example_list else ""
+    curr_ex = code_act_by_pos[0][0]
     total_examples = 0
     tokens_to_color = []
+    color_fn = color_tokens_tokfsm if is_fsm else partial(color_tokens, n=n)
     for idx in indices:
         if total_examples > max_examples:
             break
+        i, j = code_act_by_pos[idx]
 
         if i != curr_ex and curr_ex >= 0:
+            # got a new example, so print the previous one
             curr_ex_output = prepare_example_print(
                 curr_ex,
                 tokens[curr_ex],
 
             print_output.append((curr_ex_output, len(tokens_to_color)))
         else:
             print_output += curr_ex_output
+            print_output += color_str("*" * 50, html, "green")
         total_examples += 1
 
     return print_output
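
For orientation, the `code_act_by_pos` argument is just a flat list of (example, position) pairs, grouped by example. A minimal sketch with invented values, where `dataset_tokens` stands in for the cached dataset's per-example token lists:

    # Hypothetical activations: the code fires on tokens 5-6 of example 0
    # and on token 17 of example 2.
    code_act_by_pos = [(0, 5), (0, 6), (2, 17)]
    html_out = print_token_activations_of_code(
        code_act_by_pos, dataset_tokens, n=3, html=True
    )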
 
 
+def print_token_activations_of_codes(
     ft_tkns,
     tokens,
+    is_fsm=False,
     n=3,
     start=0,
     stop=1000,
 
     num_tokens = len(tokens) * len(tokens[0])
     codes, token_act_freqs, token_acts = [], [], []
     for i in indices:
+        tkns_of_code = ft_tkns[i]
+        freq = (len(tkns_of_code), 100 * len(tkns_of_code) / num_tokens)
         if freq_filter is not None and freq[1] > freq_filter:
             continue
         codes.append(i)
         token_act_freqs.append(freq)
+        if len(tkns_of_code) > 0:
+            tkn_acts = print_token_activations_of_code(
+                tkns_of_code,
                 tokens,
+                is_fsm,
                 n=n,
                 max_examples=max_examples,
                 randomize=randomize,
 
     return run_cb_ids
 
 
+def get_cb_hook_key(cb_at: str, layer_idx: int, gcb_idx: Optional[int] = None):
     """Get the layer name used to store hooks/cache."""
+    comp_name = "attn" if "attn" in cb_at else "mlp"
+    if gcb_idx is None:
+        return f"blocks.{layer_idx}.{comp_name}.codebook_layer.hook_codebook_ids"
     else:
+        return f"blocks.{layer_idx}.{comp_name}.codebook_layer.codebook.{gcb_idx}.hook_codebook_ids"
 
 
+def run_model_fn_with_codes(
     input,
     cb_model,
+    fn_name,
+    fn_kwargs=None,
     list_of_code_infos=(),
 ):
+    """Run the `cb_model`'s `fn_name` method while activating the codes in `list_of_code_infos`.
+
+    A common use case is running the `run_with_cache` method while activating the codes.
+    For running the `generate` method, use `generate_with_codes` instead.
+    """
+    if fn_kwargs is None:
+        fn_kwargs = {}
     hook_fns = [
+        partial(patch_in_codes, pos=tupl.pos, code=tupl.code, code_pos=tupl.code_pos)
         for tupl in list_of_code_infos
     ]
     fwd_hooks = [
+        (get_cb_hook_key(tupl.cb_at, tupl.layer, tupl.head), hook_fns[i])
         for i, tupl in enumerate(list_of_code_infos)
     ]
     cb_model.reset_hook_kwargs()
     with cb_model.hooks(fwd_hooks, [], True, False) as hooked_model:
+        ret = hooked_model.__getattribute__(fn_name)(input, **fn_kwargs)
+    return ret
 
 
+def generate_with_codes(
+    input,
+    cb_model,
+    list_of_code_infos=(),
+    tokfsm=None,
+    generate_kwargs=None,
+):
+    """Sample from the language model while activating the codes in `list_of_code_infos`."""
+    gen = run_model_fn_with_codes(
+        input,
+        cb_model,
+        "generate",
+        generate_kwargs,
+        list_of_code_infos,
     )
+    return tokfsm.seq_to_traj(gen) if tokfsm is not None else gen
 
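A minimal usage sketch for the two entry points above, assuming a codebook model `cb_model` is loaded and using placeholder code and prompt values; the `CodeInfo` fields follow the dataclass defined earlier in this file:

    code = CodeInfo(code=123, layer=0, head=2, cb_at="attn", pos=None, code_pos=-1)
    # Run a forward pass with the code forced on and capture activations.
    logits, cache = run_model_fn_with_codes(
        "Once upon a time", cb_model, "run_with_cache", {}, [code]
    )
    # Sample text with the same code forced on; tokfsm is only passed for the
    # TokFSM dataset, where token sequences map back to state trajectories.
    text = generate_with_codes(
        "Once upon a time", cb_model, [code],
        generate_kwargs={"max_new_tokens": 20},
    )
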
 def JSD(logits1, logits2, pos=-1, reduction="batchmean"):
 
     return 0.5 * loss
 
 
+def cb_hook_key_to_info(layer_hook_key: str):
+    """Get the layer info from the codebook layer hook key.
+
+    Args:
+        layer_hook_key: the hook key of the codebook layer,
+            e.g. `blocks.3.attn.codebook_layer.hook_codebook_ids`.
+
+    Returns:
+        comp_name: the name of the component the codebook is applied at.
+        layer_idx: the layer index.
+        gcb_idx: the codebook index if the codebook layer is grouped, otherwise None.
+    """
+    layer_search = re.search(r"blocks\.(\d+)\.(\w+)\.", layer_hook_key)
+    assert layer_search is not None
+    layer_idx, comp_name = int(layer_search.group(1)), layer_search.group(2)
+    gcb_idx_search = re.search(r"codebook\.(\d+)", layer_hook_key)
+    if gcb_idx_search is not None:
+        gcb_idx = int(gcb_idx_search.group(1))
+    else:
+        gcb_idx = None
+    return comp_name, layer_idx, gcb_idx
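
`cb_hook_key_to_info` inverts `get_cb_hook_key`, e.g.:

    assert cb_hook_key_to_info(
        "blocks.3.attn.codebook_layer.hook_codebook_ids"
    ) == ("attn", 3, None)
    assert cb_hook_key_to_info(
        "blocks.3.attn.codebook_layer.codebook.2.hook_codebook_ids"
    ) == ("attn", 3, 2)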
 
 def find_code_changes(cache1, cache2, pos=None):
 
             c1 = cache1[k][0, pos]
             c2 = cache2[k][0, pos]
             if not torch.all(c1 == c2):
+                print(cb_hook_key_to_info(k), c1.tolist(), c2.tolist())
+                print(cb_hook_key_to_info(k), c1.tolist(), c2.tolist())
 
  def common_codes_in_cache(cache_codes, threshold=0.0):
 
     return codes, counts
 
 
+def parse_topic_codes_string(
+    info_str: str,
+    pos: Optional[int] = None,
+    code_append: Optional[bool] = False,
+    **code_info_kwargs,
+):
+    """Parse the topic codes string."""
     code_info_strs = info_str.strip().split("\n")
+    code_info_strs = [e.strip() for e in code_info_strs if e]
+    topic_codes = []
     layer, head = None, None
+    if code_append is None:
+        code_pos = None
+    else:
+        code_pos = "append" if code_append else -1
     for code_info_str in code_info_strs:
+        topic_codes.append(
+            CodeInfo.from_str(
+                code_info_str,
+                pos=pos,
+                code_pos=code_pos,
+                **code_info_kwargs,
+            )
         )
+        if code_append is None or code_append:
             continue
+        if layer == topic_codes[-1].layer and head == topic_codes[-1].head:
+            code_pos -= 1  # type: ignore
         else:
             code_pos = -1
+        topic_codes[-1].code_pos = code_pos
+        layer, head = topic_codes[-1].layer, topic_codes[-1].head
+    return topic_codes
+
+
+def find_similar_codes(cb_model, code_info, n=8):
+    """Find the `n` most similar codes to the given code using cosine similarity.
+
+    Useful for finding related codes for interpretability.
+    """
+    codebook = cb_model.get_codebook(code_info)
+    device = codebook.weight.device
+    code = codebook(torch.tensor(code_info.code).to(device))
+    code = code.to(device)
+    logits = torch.matmul(code, codebook.weight.T)
+    _, indices = torch.topk(logits, n)
+    assert indices[0] == code_info.code
+    assert torch.allclose(logits[indices[0]], torch.tensor(1.0))
+    return indices[1:], logits[indices[1:]].tolist()
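
For reference, the strings consumed by `CodeInfo.from_str` (and hence by `parse_topic_codes_string`) follow the `key: value` format that the removed `parse_code_info_string` documented, one code per line. A small sketch with made-up codes, assuming `CodeInfo` converts its string fields during validation:

    topic_str = "code: 123, layer: 0, head: 2\ncode: 456, layer: 0, head: 2"
    codes = parse_topic_codes_string(topic_str, cb_at="attn")
    # With code_append=False (the default), consecutive codes on the same
    # (layer, head) get code_pos -1, -2, ... so they patch successive
    # positions from the end of the sequence.

The `ModelInfoForWebapp` helper added earlier round-trips the same `key: value` layout through an `info.txt` file, so a hypothetical cache directory would contain lines such as:

    model_name: TinyStories-1Layer-21M
    dataset_name: tinystories
    num_codes: 1024
    cb_at: attn_preproj
    n_layers: 1
    n_heads: 16

which `ModelInfoForWebapp.load(path)` parses back into typed fields.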
webapp_utils.py CHANGED
@@ -1,6 +1,9 @@
 """Utility functions for running webapp using streamlit."""
 
 
+from typing import Optional
+
+import numpy as np
 import streamlit as st
 from streamlit.components.v1 import html
 
@@ -61,10 +64,10 @@ def load_ft_tkns(model_id, layer, head=None, code=None):
     # model_id required to not mix cache_data for different models
     assert model_id is not None
     cb_at = st.session_state["cb_at"]
-    ccb = st.session_state["ccb"]
+    gcb = st.session_state["gcb"]
     cb_acts = st.session_state["cb_acts"]
     if head is not None:
-        cb_name = f"layer{layer}_{cb_at}{ccb}{head}"
+        cb_name = f"layer{layer}_{cb_at}{gcb}{head}"
     else:
         cb_name = f"layer{layer}_{cb_at}"
     return utils.features_to_tokens(
@@ -84,11 +87,12 @@ def get_code_acts(
     ctx_size=5,
     num_examples=100,
     return_example_list=False,
+    is_fsm=False,
 ):
     """Get the token activations for a given code."""
     ft_tkns = load_ft_tkns(model_id, layer, head, code)
     ft_tkns = [ft_tkns]
-    _, freqs, acts = utils.print_ft_tkns(
+    _, freqs, acts = utils.print_token_activations_of_codes(
         ft_tkns,
         tokens=tokens_str,
         indices=[0],
@@ -96,6 +100,7 @@ def get_code_acts(
         n=ctx_size,
         max_examples=num_examples,
         return_example_list=return_example_list,
+        is_fsm=is_fsm,
     )
     return acts[0], freqs[0]
 
@@ -122,8 +127,16 @@ def find_next_code(code, layer_code_acts, act_range=None):
     """Find the next code that has activations in the given range."""
     if act_range is None:
         return code
+    min_act, max_act = 0, np.inf
+    if isinstance(act_range, tuple):
+        if len(act_range) == 2:
+            min_act, max_act = act_range
+        else:
+            min_act = act_range[0]
+    elif isinstance(act_range, int):
+        min_act = act_range
     for code_iter, code_act_count in enumerate(layer_code_acts[code:]):
-        if code_act_count >= act_range[0] and code_act_count <= act_range[1]:
+        if code_act_count >= min_act and code_act_count <= max_act:
             code += code_iter
             break
     return code
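
With the broadened `act_range` handling above, a bare int now acts as a lower bound while a 2-tuple still bounds both sides; with invented activation counts:

    layer_code_acts = [0, 3, 120, 7, 50]
    find_next_code(0, layer_code_acts, act_range=(10, 100))  # -> 4 (first count within [10, 100])
    find_next_code(0, layer_code_acts, act_range=100)        # -> 2 (first count >= 100)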
@@ -161,8 +174,8 @@ def add_save_code_button(
     demo_file_path: str,
     num_acts: int,
     save_regex: bool = False,
-    prec: float = None,
-    recall: float = None,
+    prec: Optional[float] = None,
+    recall: Optional[float] = None,
     button_st_container=st,
     button_text: bool = False,
     button_key_suffix: str = "",
@@ -176,12 +189,12 @@ def add_save_code_button(
     if save_button:
         description = st.text_input(
             "Write a description for the code",
-            key="save_code_desc",
+            key=f"save_code_desc{button_key_suffix}",
         )
         if not description:
             return
 
-    description = st.session_state.get("save_code_desc", None)
+    description = st.session_state.get(f"save_code_desc{button_key_suffix}", None)
     if description:
         layer = st.session_state["ct_act_layer"]
         is_attn = st.session_state["is_attn"]
@@ -207,4 +220,3 @@ def add_save_code_button(
         saved = add_code_to_demo_file(code_info, demo_file_path)
         if saved:
             st.success("Code saved!", icon="🎉")
-            st.success("Code saved!", icon="🎉")