taufeeque commited on
Commit
7f9376c
β€’
1 Parent(s): 676f3c4

Add streamlit webapp files

Browse files
Code_Browser.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Web App for the Codebook Features project."""
2
+
3
+ import glob
4
+ import os
5
+
6
+ import streamlit as st
7
+
8
+ import code_search_utils
9
+ import webapp_utils
10
+
11
+ DEPLOY_MODE = True
12
+
13
+
14
+ webapp_utils.load_widget_state()
15
+
16
+ st.set_page_config(
17
+ page_title="Codebook Features",
18
+ page_icon="πŸ“š",
19
+ )
20
+
21
+ st.title("Codebook Features")
22
+
23
+ base_cache_dir = "cache/"
24
+ dirs = glob.glob(base_cache_dir + "models/*/")
25
+ model_name_options = [d.split("/")[-2].split("_")[:-2] for d in dirs]
26
+ model_name_options = ["_".join(m) for m in model_name_options]
27
+ model_name_options = sorted(set(model_name_options))
28
+
29
+ model_name = st.selectbox(
30
+ "Model",
31
+ model_name_options,
32
+ key=webapp_utils.persist("model_name"),
33
+ )
34
+
35
+ model = model_name.split("_")[0].split("#")[0]
36
+ model_layers = {
37
+ "pythia-410m-deduped": 24,
38
+ "pythia-70m-deduped": 6,
39
+ "gpt2": 12,
40
+ "TinyStories-1Layer-21M": 1,
41
+ }
42
+ model_heads = {
43
+ "pythia-410m-deduped": 16,
44
+ "pythia-70m-deduped": 8,
45
+ "gpt2": 12,
46
+ "TinyStories-1Layer-21M": 16,
47
+ }
48
+ ccb = model_name.split("_")[1]
49
+ ccb = "_ccb" if ccb == "ccb" else ""
50
+ cb_at = "_".join(model_name.split("_")[2:])
51
+ seq_len = 512 if "tinystories" in model_name.lower() else 1024
52
+ st.session_state["seq_len"] = seq_len
53
+
54
+ codes_cache_path = base_cache_dir + f"models/{model_name}_*"
55
+ dirs = glob.glob(codes_cache_path)
56
+ dirs.sort(key=os.path.getmtime)
57
+
58
+ # session states
59
+ is_attn = "attn" in cb_at
60
+ num_layers = model_layers[model]
61
+ num_heads = model_heads[model]
62
+ codes_cache_path = dirs[-1] + "/"
63
+
64
+ model_info = code_search_utils.parse_model_info(codes_cache_path)
65
+ num_codes = model_info.num_codes
66
+ dataset_cache_path = base_cache_dir + f"datasets/{model_info.dataset_name}/"
67
+
68
+ (
69
+ tokens_str,
70
+ tokens_text,
71
+ token_byte_pos,
72
+ cb_acts,
73
+ act_count_ft_tkns,
74
+ metrics,
75
+ ) = webapp_utils.load_code_search_cache(codes_cache_path, dataset_cache_path)
76
+ metric_keys = ["eval_loss", "eval_accuracy", "eval_dead_code_fraction"]
77
+ metrics = {k: v for k, v in metrics.items() if k.split("/")[0] in metric_keys}
78
+
79
+ st.session_state["model_name_id"] = model_name
80
+ st.session_state["cb_acts"] = cb_acts
81
+ st.session_state["tokens_text"] = tokens_text
82
+ st.session_state["tokens_str"] = tokens_str
83
+ st.session_state["act_count_ft_tkns"] = act_count_ft_tkns
84
+
85
+ st.session_state["num_codes"] = num_codes
86
+ st.session_state["ccb"] = ccb
87
+ st.session_state["cb_at"] = cb_at
88
+ st.session_state["is_attn"] = is_attn
89
+
90
+ st.markdown("## Metrics")
91
+ # hide metrics by default
92
+ if st.checkbox("Show Model Metrics"):
93
+ st.write(metrics)
94
+
95
+ st.markdown("## Demo Codes")
96
+ demo_file_path = codes_cache_path + "demo_codes.txt"
97
+
98
+ if st.checkbox("Show Demo Codes"):
99
+ try:
100
+ with open(demo_file_path, "r") as f:
101
+ demo_codes = f.readlines()
102
+ except FileNotFoundError:
103
+ demo_codes = []
104
+
105
+ code_desc, code_regex = "", ""
106
+ demo_codes = [code.strip() for code in demo_codes if code.strip()]
107
+
108
+ num_cols = 6 if is_attn else 5
109
+ cols = st.columns([1] * (num_cols - 1) + [2])
110
+ # st.markdown(button_height_style, unsafe_allow_html=True)
111
+ cols[0].markdown("Search", help="Button to see token activations for the code.")
112
+ cols[1].write("Code")
113
+ cols[2].write("Layer")
114
+ if is_attn:
115
+ cols[3].write("Head")
116
+ cols[-2].markdown(
117
+ "Num Acts",
118
+ help="Number of tokens that the code activates on in the acts dataset.",
119
+ )
120
+ cols[-1].markdown("Description", help="Interpreted description of the code.")
121
+
122
+ if len(demo_codes) == 0:
123
+ st.markdown(
124
+ f"""
125
+ <div style="font-size: 1.3rem; color: red;">
126
+ No demo codes found in file {demo_file_path}
127
+ </div>
128
+ """,
129
+ unsafe_allow_html=True,
130
+ )
131
+ skip = True
132
+ for code_txt in demo_codes:
133
+ if code_txt.startswith("##"):
134
+ skip = True
135
+ continue
136
+ if code_txt.startswith("#"):
137
+ code_desc, code_regex = code_txt[1:].split(":")
138
+ code_desc, code_regex = code_desc.strip(), code_regex.strip()
139
+ skip = False
140
+ continue
141
+ if skip:
142
+ continue
143
+ code_info = code_search_utils.get_code_info_pr_from_str(code_txt, code_regex)
144
+ comp_info = f"layer{code_info.layer}_{f'head{code_info.head}' if code_info.head is not None else ''}"
145
+ button_key = (
146
+ f"demo_search_code{code_info.code}_layer{code_info.layer}_desc-{code_info.description}"
147
+ + (f"head{code_info.head}" if code_info.head is not None else "")
148
+ )
149
+ cols = st.columns([1] * (num_cols - 1) + [2])
150
+ button_clicked = cols[0].button(
151
+ "πŸ”",
152
+ key=button_key,
153
+ )
154
+ if button_clicked:
155
+ webapp_utils.set_ct_acts(
156
+ code_info.code, code_info.layer, code_info.head, None, is_attn
157
+ )
158
+ cols[1].write(code_info.code)
159
+ cols[2].write(str(code_info.layer))
160
+ if is_attn:
161
+ cols[3].write(str(code_info.head))
162
+ cols[-2].write(str(act_count_ft_tkns[comp_info][code_info.code]))
163
+ cols[-1].write(code_desc)
164
+ skip = True
165
+
166
+
167
+ st.markdown("## Code Search")
168
+
169
+ regex_pattern = st.text_input(
170
+ "Enter a regex pattern",
171
+ help="Wrap code token in the first group. E.g. New (York)",
172
+ key="regex_pattern",
173
+ )
174
+ # topk = st.slider("Top K", 1, 20, 10)
175
+ prec_col, sort_col = st.columns(2)
176
+ prec_threshold = prec_col.slider(
177
+ "Precision Threshold",
178
+ 0.0,
179
+ 1.0,
180
+ 0.9,
181
+ help="Shows codes with precision on the regex pattern above the threshold.",
182
+ )
183
+ sort_by_options = ["Precision", "Recall", "Num Acts"]
184
+ sort_by_name = sort_col.radio(
185
+ "Sort By",
186
+ sort_by_options,
187
+ index=0,
188
+ horizontal=True,
189
+ help="Sorts the codes by the selected metric.",
190
+ )
191
+ sort_by = sort_by_options.index(sort_by_name)
192
+
193
+
194
@st.cache_data(ttl=3600)
def get_codebook_wise_codes_for_regex(regex_pattern, prec_threshold, ccb, model_name):
    """Get codebook wise codes for a given regex pattern.

    Results are cached by Streamlit for one hour. `model_name` is part of the
    cache key so results from different models are never mixed, even though it
    is not used in the computation itself.

    Reads the module-level dataset/codebook globals (`tokens_text`,
    `token_byte_pos`, `cb_acts`, `act_count_ft_tkns`), which is why those are
    not cache-key arguments.
    """
    assert model_name is not None  # required for loading from correct cache data
    return code_search_utils.get_codes_from_pattern(
        regex_pattern,
        tokens_text,
        token_byte_pos,
        cb_acts,
        act_count_ft_tkns,
        ccb=ccb,
        topk=8,
        prec_threshold=prec_threshold,
    )
208
+
209
+
210
+ if regex_pattern:
211
+ codebook_wise_codes, re_token_matches = get_codebook_wise_codes_for_regex(
212
+ regex_pattern,
213
+ prec_threshold,
214
+ ccb,
215
+ model_name,
216
+ )
217
+ st.markdown(f"Found :green[{re_token_matches}] matches")
218
+ num_search_cols = 7 if is_attn else 6
219
+ non_deploy_offset = 0
220
+ if not DEPLOY_MODE:
221
+ non_deploy_offset = 1
222
+ num_search_cols += non_deploy_offset
223
+
224
+ cols = st.columns(num_search_cols)
225
+
226
+ # st.markdown(button_height_style, unsafe_allow_html=True)
227
+
228
+ cols[0].markdown("Search", help="Button to see token activations for the code.")
229
+ cols[1].write("Layer")
230
+ if is_attn:
231
+ cols[2].write("Head")
232
+ cols[-4 - non_deploy_offset].write("Code")
233
+ cols[-3 - non_deploy_offset].write("Precision")
234
+ cols[-2 - non_deploy_offset].write("Recall")
235
+ cols[-1 - non_deploy_offset].markdown(
236
+ "Num Acts",
237
+ help="Number of tokens that the code activates on in the acts dataset.",
238
+ )
239
+ if not DEPLOY_MODE:
240
+ cols[-1].markdown(
241
+ "Save to Demos",
242
+ help="Button to save the code to demos along with the regex pattern.",
243
+ )
244
+ all_codes = codebook_wise_codes.items()
245
+ all_codes = [
246
+ (cb_name, code_pr_info)
247
+ for cb_name, code_pr_infos in all_codes
248
+ for code_pr_info in code_pr_infos
249
+ ]
250
+ all_codes = sorted(all_codes, key=lambda x: x[1][1 + sort_by], reverse=True)
251
+ for cb_name, (code, prec, rec, code_acts) in all_codes:
252
+ layer_head = cb_name.split("_")
253
+ layer = layer_head[0][5:]
254
+ head = layer_head[1][4:] if len(layer_head) > 1 else None
255
+ button_key = f"search_code{code}_layer{layer}" + (
256
+ f"head{head}" if head is not None else ""
257
+ )
258
+ cols = st.columns(num_search_cols)
259
+ extra_args = {
260
+ "prec": prec,
261
+ "recall": rec,
262
+ "num_acts": code_acts,
263
+ "regex": regex_pattern,
264
+ }
265
+ button_clicked = cols[0].button("πŸ”", key=button_key)
266
+ if button_clicked:
267
+ webapp_utils.set_ct_acts(code, layer, head, extra_args, is_attn)
268
+ cols[1].write(layer)
269
+ if is_attn:
270
+ cols[2].write(head)
271
+ cols[-4 - non_deploy_offset].write(code)
272
+ cols[-3 - non_deploy_offset].write(f"{prec*100:.2f}%")
273
+ cols[-2 - non_deploy_offset].write(f"{rec*100:.2f}%")
274
+ cols[-1 - non_deploy_offset].write(str(code_acts))
275
+ if not DEPLOY_MODE:
276
+ webapp_utils.add_save_code_button(
277
+ demo_file_path,
278
+ num_acts=code_acts,
279
+ save_regex=True,
280
+ prec=prec,
281
+ recall=rec,
282
+ button_st_container=cols[-1],
283
+ button_key_suffix=f"_code{code}_layer{layer}_head{head}",
284
+ )
285
+
286
+ if len(all_codes) == 0:
287
+ st.markdown(
288
+ f"""
289
+ <div style="font-size: 1.0rem; color: red;">
290
+ No codes found for pattern {regex_pattern} at precision threshold: {prec_threshold}
291
+ </div>
292
+ """,
293
+ unsafe_allow_html=True,
294
+ )
295
+
296
+
297
+ st.markdown("## Code Token Activations")
298
+
299
+ filter_codes = st.checkbox("Filter Codes", key="filter_codes")
300
+ act_range, layer_code_acts = None, None
301
+ if filter_codes:
302
+ act_range = st.slider(
303
+ "Num Acts",
304
+ 0,
305
+ 10_000,
306
+ (100, 10_000),
307
+ key="ct_act_range",
308
+ help="Filter codes by the number of tokens they activate on.",
309
+ )
310
+
311
+ cols = st.columns(5 if is_attn else 4)
312
+ layer = cols[0].number_input("Layer", 0, num_layers - 1, 0, key="ct_act_layer")
313
+ if is_attn:
314
+ head = cols[1].number_input("Head", 0, num_heads - 1, 0, key="ct_act_head")
315
+ else:
316
+ head = None
317
+
318
+ def_code = st.session_state.get("ct_act_code", 0)
319
+ if filter_codes:
320
+ layer_code_acts = act_count_ft_tkns[
321
+ f"layer{layer}{'_head'+str(head) if head is not None else ''}"
322
+ ]
323
+ def_code = webapp_utils.find_next_code(def_code, layer_code_acts, act_range)
324
+ if "ct_act_code" in st.session_state:
325
+ st.session_state["ct_act_code"] = def_code
326
+
327
+ code = cols[-3].number_input(
328
+ "Code",
329
+ 0,
330
+ num_codes - 1,
331
+ def_code,
332
+ key="ct_act_code",
333
+ )
334
+ num_examples = cols[-2].number_input(
335
+ "Max Results",
336
+ -1,
337
+ 1000, # setting to 1000 for efficiency purposes even though it can be more than 1000.
338
+ 100,
339
+ help="Number of examples to show in the results. Set to -1 to show all examples.",
340
+ )
341
+ ctx_size = cols[-1].number_input(
342
+ "Context Size",
343
+ 1,
344
+ 10,
345
+ 5,
346
+ help="Number of tokens to show before and after the code token.",
347
+ )
348
+
349
+ acts, acts_count = webapp_utils.get_code_acts(
350
+ model_name,
351
+ tokens_str,
352
+ code,
353
+ layer,
354
+ head,
355
+ ctx_size,
356
+ num_examples,
357
+ )
358
+
359
+ st.write(
360
+ f"Token Activations for Layer {layer}{f' Head {head}' if head is not None else ''} Code {code} | "
361
+ f"Activates on {acts_count[0]} tokens on the acts dataset",
362
+ )
363
+
364
+ if not DEPLOY_MODE:
365
+ webapp_utils.add_save_code_button(
366
+ demo_file_path,
367
+ acts_count[0],
368
+ save_regex=False,
369
+ button_text=True,
370
+ button_key_suffix="_token_acts",
371
+ )
372
+
373
+ st.markdown(webapp_utils.escape_markdown(acts), unsafe_allow_html=True)
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: gray
5
  colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.25.0
8
- app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
 
5
  colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.25.0
8
+ app_file: Code_Browser.py
9
  pinned: false
10
  license: mit
11
  ---
code_search_utils.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Functions to help with searching codes using regex."""
2
+
3
+ import pickle
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import Optional
7
+
8
+ import numpy as np
9
+ import torch
10
+ from tqdm import tqdm
11
+
12
+ import utils
13
+
14
+
15
def load_dataset_cache(cache_base_path):
    """Load the tokenized-dataset cache arrays stored under `cache_base_path`.

    Returns:
        Tuple of (tokens_str, tokens_text, token_byte_pos) numpy arrays.
    """
    filenames = ("tokens_str.npy", "tokens_text.npy", "token_byte_pos.npy")
    tokens_str, tokens_text, token_byte_pos = (
        np.load(cache_base_path + fname) for fname in filenames
    )
    return tokens_str, tokens_text, token_byte_pos
21
+
22
+
23
def load_code_search_cache(cache_base_path):
    """Load codebook activations, per-code activation counts, and metrics.

    Returns:
        Tuple of (cb_acts, act_count_ft_tkns, metrics) loaded from the cache
        directory `cache_base_path`.
    """
    metrics = np.load(cache_base_path + "metrics.npy", allow_pickle=True).item()
    # NOTE: these pickles are trusted local cache files; never point this at
    # untrusted data (pickle.load executes arbitrary code).
    with open(cache_base_path + "cb_acts.pkl", "rb") as fh:
        cb_acts = pickle.load(fh)
    with open(cache_base_path + "act_count_ft_tkns.pkl", "rb") as fh:
        act_count_ft_tkns = pickle.load(fh)
    return cb_acts, act_count_ft_tkns, metrics
32
+
33
+
34
def search_re(re_pattern, tokens_text):
    """Return (example_id, match_start) pairs for `re_pattern` over `tokens_text`.

    The reported start position is that of the first capture group; a pattern
    without any explicit group is wrapped in one so the whole match is used.
    Empty-group matches are skipped.
    """
    # TODO: ensure that parentheses are not escaped
    if "(" not in re_pattern:
        re_pattern = f"({re_pattern})"
    results = []
    for example_idx, text in enumerate(tokens_text):
        for match in re.finditer(re_pattern, text):
            start, end = match.span(1)
            if start != end:  # ignore zero-width group matches
                results.append((example_idx, start))
    return results
45
+
46
+
47
def byte_id_to_token_pos_id(example_byte_id, token_byte_pos):
    """Map an (example_id, byte_id) pair to its (example_id, token_pos_id).

    `token_byte_pos[example_id]` holds each token's end byte offset, so a
    right-bisect gives the index of the token containing `byte_offset`.
    """
    example_id, byte_offset = example_byte_id
    token_idx = np.searchsorted(token_byte_pos[example_id], byte_offset, side="right")
    return (example_id, token_idx)
52
+
53
+
54
def get_code_pr(token_pos_ids, codebook_acts, cb_act_counts=None):
    """Compute (codes, precision, recall, act_counts) over the matched tokens.

    Args:
        token_pos_ids: (example_id, token_pos_id) pairs where a pattern matched.
        codebook_acts: per-example, per-token code assignments.
        cb_act_counts: optional mapping code -> total activation count; when
            given, precision is computed and results are sorted by it,
            otherwise precision is zero and results are sorted by recall.
    """
    matched_codes = np.array(
        [codebook_acts[ex_id][tok_id] for ex_id, tok_id in token_pos_ids]
    )
    codes, counts = np.unique(matched_codes, return_counts=True)
    recall = counts / len(token_pos_ids)
    # Discard codes covering a negligible fraction of the matches.
    keep = recall > 0.01
    codes, counts, recall = codes[keep], counts[keep], recall[keep]
    if cb_act_counts is None:
        code_acts = np.zeros_like(codes)
        prec = np.zeros_like(codes)
        order = np.argsort(recall)[::-1]
    else:
        code_acts = np.array([cb_act_counts[code] for code in codes])
        prec = counts / code_acts
        order = np.argsort(prec)[::-1]
    return codes[order], prec[order], recall[order], code_acts[order]
77
+
78
+
79
def get_neuron_pr(
    token_pos_ids, recall, neuron_acts_by_ex, neuron_sorted_acts, topk=10
):
    """Find the neuron whose thresholded activations best match the pattern.

    Picks, for every neuron, the activation threshold that yields the target
    `recall` over the matched tokens, then selects the neuron with the best
    precision at that threshold.

    Args:
        token_pos_ids: (example_id, token_pos_id) pairs where the regex matched.
        recall: target recall used to choose each neuron's activation threshold.
        neuron_acts_by_ex: neuron activations indexed by (example, token);
            torch tensor or numpy array.
            # assumes trailing dims are (layers, 2, dim_size) — TODO confirm
        neuron_sorted_acts: per-neuron activations sorted ascending along the
            last axis; used to count how many activations exceed a threshold.
        topk: unused.  # NOTE(review): parameter is never read — confirm intent

    Returns:
        (best_prec, best_neuron_acts, best_neuron_idx).
    """
    # check if neuron_acts_by_ex is a torch tensor
    if isinstance(neuron_acts_by_ex, torch.Tensor):
        re_neuron_acts = torch.stack(
            [
                neuron_acts_by_ex[example_id, token_pos_id]
                for example_id, token_pos_id in token_pos_ids
            ],
            dim=-1,
        )  # (layers, 2, dim_size, matches)
        re_neuron_acts = torch.sort(re_neuron_acts, dim=-1).values
    else:
        re_neuron_acts = np.stack(
            [
                neuron_acts_by_ex[example_id, token_pos_id]
                for example_id, token_pos_id in token_pos_ids
            ],
            axis=-1,
        )  # (layers, 2, dim_size, matches)
        re_neuron_acts.sort(axis=-1)
        # Convert so the searchsorted below runs uniformly on torch tensors.
        re_neuron_acts = torch.from_numpy(re_neuron_acts)
    # re_neuron_acts = re_neuron_acts[:, :, :, -int(recall * re_neuron_acts.shape[-1]) :]
    print("Examples for recall", recall, ":", int(recall * re_neuron_acts.shape[-1]))
    # Per-neuron threshold: the activation at the recall-quantile of the matches.
    act_thresh = re_neuron_acts[:, :, :, -int(recall * re_neuron_acts.shape[-1])]
    # binary search act_thresh in neuron_sorted_acts
    assert neuron_sorted_acts.shape[:-1] == act_thresh.shape
    prec_den = torch.searchsorted(neuron_sorted_acts, act_thresh.unsqueeze(-1))
    prec_den = prec_den.squeeze(-1)
    # Number of dataset activations at or above the threshold (precision denominator).
    prec_den = neuron_sorted_acts.shape[-1] - prec_den
    prec = int(recall * re_neuron_acts.shape[-1]) / prec_den
    assert (
        prec.shape == re_neuron_acts.shape[:-1]
    ), f"{prec.shape} != {re_neuron_acts.shape[:-1]}"

    # Best neuron = the one with maximum precision at the chosen recall.
    best_neuron_idx = np.unravel_index(prec.argmax(), prec.shape)
    best_prec = prec[best_neuron_idx]
    print("max prec:", best_prec)
    best_neuron_act_thresh = act_thresh[best_neuron_idx].item()
    best_neuron_acts = neuron_acts_by_ex[
        :, :, best_neuron_idx[0], best_neuron_idx[1], best_neuron_idx[2]
    ]
    # Positions where the best neuron fires above its threshold.
    best_neuron_acts = best_neuron_acts >= best_neuron_act_thresh
    best_neuron_acts = np.stack(np.where(best_neuron_acts), axis=-1)

    return best_prec, best_neuron_acts, best_neuron_idx
127
+
128
+
129
def convert_to_adv_name(name, cb_at, ccb=""):
    """Convert a base codebook name to its advanced form.

    E.g. with `ccb` set, "layer0_head0" -> "layer0_attn_preproj_ccb0";
    without `ccb`, "layer0" -> "layer0_attn_preproj".

    Args:
        name: base name ("layer<L>_head<H>" or "layer<L>").
        cb_at: component the codebook is attached at (e.g. "attn_preproj").
        ccb: non-empty when per-head (grouped) codebooks are used.
    """
    if ccb:
        layer, head = name.split("_")
        return layer + f"_{cb_at}_ccb" + head[4:]
    # Bug fix: this branch previously referenced an undefined local `layer`;
    # without a head component the whole base name is the layer part.
    return name + "_" + cb_at
136
+
137
+
138
def convert_to_base_name(name, ccb=""):
    """Convert an advanced codebook name to its base form.

    E.g. "layer0_attn_preproj_ccb0" -> "layer0_head0"; names without a "ccb"
    component map to just their layer part, e.g. "layer2_mlp" -> "layer2".

    Note: `ccb` is accepted for signature symmetry; presence of per-head
    codebooks is detected from `name` itself.
    """
    parts = name.split("_")
    layer_part = parts[0]
    if "ccb" in name:
        head_part = parts[-1][3:]  # strip the "ccb" prefix
        return f"{layer_part}_head{head_part}"
    return layer_part
146
+
147
+
148
def get_layer_head_from_base_name(name):
    """Parse a base name like "layer0_head0" into integer (layer, head).

    Head is None for names without a head component (e.g. "layer5").
    """
    parts = name.split("_")
    layer_idx = int(parts[0][len("layer"):])
    head_idx = int(parts[-1][len("head"):]) if len(parts) > 1 else None
    return layer_idx, head_idx
156
+
157
+
158
def get_layer_head_from_adv_name(name):
    """Parse an advanced name like "layer0_attn_preproj_ccb0" into (layer, head)."""
    return get_layer_head_from_base_name(convert_to_base_name(name))
163
+
164
+
165
def get_codes_from_pattern(
    re_pattern,
    tokens_text,
    token_byte_pos,
    cb_acts,
    act_count_ft_tkns,
    ccb="",
    topk=5,
    prec_threshold=0.5,
):
    """Fetch codes from a given regex pattern.

    Pipeline: find regex matches over the raw text, map byte offsets to token
    positions, then for every codebook compute precision/recall of each code
    over those token positions and keep the top-`topk` codes whose precision
    exceeds `prec_threshold`.

    Args:
        re_pattern: regex to search; its first capture group anchors matches.
        tokens_text: raw text per example.
        token_byte_pos: per-example token end-byte offsets.
        cb_acts: mapping codebook name -> per-example, per-token codes.
        act_count_ft_tkns: per-codebook activation counts, keyed by base name.
        ccb: non-empty when per-head codebooks are used (affects name mapping).
        topk: at most this many codes are kept per codebook.
        prec_threshold: minimum precision for a code to be reported.

    Returns:
        (codebook_wise_codes, re_token_matches): mapping from base codebook
        name to a list of (code, prec, recall, num_acts) tuples, and the
        number of unique token positions the pattern matched.
    """
    byte_ids = search_re(re_pattern, tokens_text)
    token_pos_ids = [
        byte_id_to_token_pos_id(ex_byte_id, token_byte_pos) for ex_byte_id in byte_ids
    ]
    # Several byte matches can fall in the same token; dedupe positions.
    token_pos_ids = np.unique(token_pos_ids, axis=0)
    re_token_matches = len(token_pos_ids)
    codebook_wise_codes = {}
    for cb_name, cb in tqdm(cb_acts.items()):
        base_cb_name = convert_to_base_name(cb_name, ccb=ccb)
        codes, prec, recall, code_acts = get_code_pr(
            token_pos_ids,
            cb,
            cb_act_counts=act_count_ft_tkns[base_cb_name],
        )
        # Keep at most topk codes, filtered by the precision threshold
        # (codes are already sorted by precision in get_code_pr).
        idx = np.arange(min(topk, len(codes)))
        idx = idx[prec[:topk] > prec_threshold]
        codes, prec, recall = codes[idx], prec[idx], recall[idx]
        code_acts = code_acts[idx]
        codes_pr = list(zip(codes, prec, recall, code_acts))
        codebook_wise_codes[base_cb_name] = codes_pr
    return codebook_wise_codes, re_token_matches
197
+
198
+
199
def get_neurons_from_pattern(
    re_pattern,
    tokens_text,
    token_byte_pos,
    neuron_acts_by_ex,
    neuron_sorted_acts,
    recall_threshold,
):
    """Fetch the best neuron (with act thresh given by recall) from a given regex pattern.

    Mirrors `get_codes_from_pattern` for raw neurons: finds the regex's token
    positions and delegates to `get_neuron_pr` to pick the single neuron with
    the best precision at `recall_threshold`.

    Returns:
        (best_prec, best_neuron_acts, best_neuron_idx, re_token_matches).
    """
    byte_ids = search_re(re_pattern, tokens_text)
    token_pos_ids = [
        byte_id_to_token_pos_id(ex_byte_id, token_byte_pos) for ex_byte_id in byte_ids
    ]
    # Dedupe token positions hit by multiple byte-level matches.
    token_pos_ids = np.unique(token_pos_ids, axis=0)
    re_token_matches = len(token_pos_ids)
    best_prec, best_neuron_acts, best_neuron_idx = get_neuron_pr(
        token_pos_ids,
        recall_threshold,
        neuron_acts_by_ex,
        neuron_sorted_acts,
    )
    return best_prec, best_neuron_acts, best_neuron_idx, re_token_matches
221
+
222
+
223
def compare_codes_with_neurons(
    best_codes_info,
    tokens_text,
    token_byte_pos,
    neuron_acts_by_ex,
    neuron_sorted_acts,
):
    """Compare codes with neurons.

    For every code in `best_codes_info`, find the best neuron for the same
    regex pattern at the same recall, then return the fraction of codes whose
    precision beats their neuron counterpart.

    Args:
        best_codes_info: sequence of code-info objects exposing `.re_pattern`,
            `.recall`, and `.prec`.
        tokens_text / token_byte_pos: dataset caches used for regex matching.
        neuron_acts_by_ex: numpy array of neuron activations.
        neuron_sorted_acts: per-neuron sorted activations.

    Returns:
        Fraction (0..1) of codes with higher precision than the best neuron.
    """
    assert isinstance(neuron_acts_by_ex, np.ndarray)
    (
        all_best_prec,
        all_best_neuron_acts,
        all_best_neuron_idxs,
        all_re_token_matches,
    ) = zip(
        *[
            get_neurons_from_pattern(
                code_info.re_pattern,
                tokens_text,
                token_byte_pos,
                neuron_acts_by_ex,
                neuron_sorted_acts,
                code_info.recall,
            )
            # Bug fix: iterate the code infos directly — the previous
            # range(len(...)) produced ints, so `.re_pattern` raised
            # AttributeError.
            for code_info in tqdm(best_codes_info)
        ],
        strict=True,
    )
    # Bug fix: same range(len(...)) iteration error as above.
    code_best_precs = np.array([code_info.prec for code_info in best_codes_info])
    codes_better_than_neurons = code_best_precs > np.array(all_best_prec)
    return codes_better_than_neurons.mean()
256
+
257
+
258
def get_code_info_pr_from_str(code_txt, regex):
    """Extract code info fields from string.

    `code_txt` is expected to be a comma-separated "field: value" list whose
    field names match `utils.CodeInfo` parameters (e.g.
    "code: 5, layer: 2, head: 0").

    NOTE(review): `regex` is accepted but never used — presumably it was meant
    to be attached to the returned CodeInfo; confirm against callers.
    """
    code_txt = code_txt.strip()
    code_txt = code_txt.split(", ")
    # Each piece is "field: value"; build keyword args for CodeInfo.
    # NOTE(review): values remain strings here — CodeInfo presumably tolerates
    # or converts them; verify.
    code_txt = dict(txt.split(": ") for txt in code_txt)
    return utils.CodeInfo(**code_txt)
264
+
265
+
266
@dataclass
class ModelInfoForWebapp:
    """Model info for webapp.

    Field values typically arrive as strings (parsed from an info.txt file by
    `parse_model_info`), hence the type conversions in `__post_init__`.
    """

    # Name of the model variant (e.g. "gpt2", "pythia-70m-deduped").
    model_name: str
    # Path to the pretrained checkpoint the caches were built from.
    pretrained_path: str
    # Dataset used to build the activation caches.
    dataset_name: str
    # Number of codes in each codebook.
    num_codes: int
    # Component the codebook is attached at (e.g. "attn_preproj").
    cb_at: str
    # Non-empty when per-head (grouped) codebooks are used.
    ccb: str
    # Number of transformer layers.
    n_layers: int
    # Number of attention heads; None for non-attention codebooks.
    n_heads: Optional[int] = None
    seed: int = 42
    # Max dataset examples used when building the caches.
    max_samples: int = 2000

    def __post_init__(self):
        """Convert to correct types."""
        self.num_codes = int(self.num_codes)
        self.n_layers = int(self.n_layers)
        # The string "None" appears when info.txt serialized a missing head count.
        if self.n_heads == "None":
            self.n_heads = None
        elif self.n_heads is not None:
            self.n_heads = int(self.n_heads)
        self.seed = int(self.seed)
        self.max_samples = int(self.max_samples)
291
+
292
+
293
def parse_model_info(path):
    """Parse model info from path.

    Reads `path`/info.txt, which contains one "key: value" pair per line with
    keys matching the `ModelInfoForWebapp` field names.

    Raises:
        FileNotFoundError: if info.txt is missing.
        ValueError: if a line is not a "key: value" pair.
    """
    with open(path + "info.txt", "r") as f:
        lines = f.readlines()
    info = dict(line.strip().split(": ") for line in lines)
    # Fix: removed an unreachable duplicated `return` statement.
    return ModelInfoForWebapp(**info)
pages/Concept_Code.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Web app page for showing codes for different examples in the dataset."""
2
+
3
+
4
+ import streamlit as st
5
+ from streamlit_extras.switch_page_button import switch_page
6
+
7
+ import code_search_utils
8
+ import webapp_utils
9
+
10
+ webapp_utils.load_widget_state()
11
+
12
+ if "cb_acts" not in st.session_state:
13
+ switch_page("Code_Browser")
14
+
15
+ total_examples = 2000
16
+ prec_threshold = 0.01
17
+
18
+ model_name = st.session_state["model_name_id"]
19
+ seq_len = st.session_state["seq_len"]
20
+ tokens_text = st.session_state["tokens_text"]
21
+ tokens_str = st.session_state["tokens_str"]
22
+ cb_acts = st.session_state["cb_acts"]
23
+ act_count_ft_tkns = st.session_state["act_count_ft_tkns"]
24
+ ccb = st.session_state["ccb"]
25
+
26
+
27
def get_example_concept_codes(example_id):
    """Get concept codes for the given example id.

    Scores every code of every codebook over all `seq_len` token positions of
    the example, then keeps codes passing both the module-level
    `prec_threshold` and the UI-selected `recall_threshold`.

    Returns:
        List of (codebook_name, [(code, prec, recall, num_acts), ...]) pairs.
    """
    # Every token position of this example counts as a "match".
    token_pos_ids = [(example_id, i) for i in range(seq_len)]
    all_codes = []
    for cb_name, cb in cb_acts.items():
        base_cb_name = code_search_utils.convert_to_base_name(cb_name, ccb=ccb)
        codes, prec, rec, code_acts = code_search_utils.get_code_pr(
            token_pos_ids,
            cb,
            act_count_ft_tkns[base_cb_name],
        )
        # Filter by precision first…
        prec_sat_idx = prec >= prec_threshold
        codes, prec, rec, code_acts = (
            codes[prec_sat_idx],
            prec[prec_sat_idx],
            rec[prec_sat_idx],
            code_acts[prec_sat_idx],
        )
        # …then by recall (set via the page's slider; read from module scope).
        rec_sat_idx = rec >= recall_threshold
        codes, prec, rec, code_acts = (
            codes[rec_sat_idx],
            prec[rec_sat_idx],
            rec[rec_sat_idx],
            code_acts[rec_sat_idx],
        )
        codes_pr = list(zip(codes, prec, rec, code_acts))
        all_codes.append((cb_name, codes_pr))
    return all_codes
55
+
56
+
57
def find_next_example(example_id):
    """Find the example after `example_id` that has concept codes.

    Cycles through the dataset starting after `example_id`, storing the first
    example with at least one concept code in `st.session_state["example_id"]`.
    Shows an error if a full cycle finds nothing.
    """
    initial_example_id = example_id
    # Bug fix: wrap the very first increment too — previously, starting at the
    # last example produced an out-of-range id before the loop's modulo kicked in.
    example_id = (example_id + 1) % total_examples
    while example_id != initial_example_id:
        all_codes = get_example_concept_codes(example_id)
        codes_found = sum(len(code_pr_infos) for _, code_pr_infos in all_codes)
        if codes_found > 0:
            st.session_state["example_id"] = example_id
            return
        example_id = (example_id + 1) % total_examples
    st.error(
        f"No examples found at the specified recall threshold: {recall_threshold}.",
        icon="🚨",
    )
72
+
73
+
74
def redirect_to_main_with_code(code, layer, head):
    """Redirect to main page with the given code.

    Pre-populates the Code Browser page's widget state (code, layer, and —
    for attention codebooks — head) before switching pages, so the browser
    opens directly on this code's activations.
    """
    st.session_state["ct_act_code"] = code
    st.session_state["ct_act_layer"] = layer
    if st.session_state["is_attn"]:
        st.session_state["ct_act_head"] = head
    switch_page("Code Browser")
81
+
82
+
83
def show_examples_for_concept_code(code, layer, head, code_act_ratio=0.3):
    """Show examples that the code activates on.

    Fetches per-example activation renderings for the code and displays only
    the examples where the code fires on more than `code_act_ratio` of the
    sequence — i.e. examples where it behaves like a concept-level code.
    """
    ex_acts, _ = webapp_utils.get_code_acts(
        model_name,
        tokens_str,
        code,
        layer,
        head,
        ctx_size=5,
        return_example_list=True,
    )
    # Keep only examples with a high enough fraction of activated tokens.
    filt_ex_acts = []
    for act_str, num_acts in ex_acts:
        if num_acts > seq_len * code_act_ratio:
            filt_ex_acts.append(act_str)
    st.markdown("#### Examples for Code")
    st.markdown(
        webapp_utils.escape_markdown("".join(filt_ex_acts)), unsafe_allow_html=True
    )
102
+
103
+
104
+ is_attn = st.session_state["is_attn"]
105
+
106
+ st.markdown("## Concept Code")
107
+ concept_code_description = (
108
+ "Concept codes are codes that activate a lot on only a particular set of examples that share a concept. "
109
+ "Hence such codes can be thought to correspond to more higher-level concepts or features and "
110
+ "can activate on most tokens that belong in an example text. This interface provides a way to search for such "
111
+ "codes by going through different examples using Example ID."
112
+ )
113
+ st.write(concept_code_description)
114
+
115
+ # ex_col, p_col, r_col, trunc_col, sort_col = st.columns([1, 2, 2, 1, 1])
116
+ ex_col, r_col, trunc_col, sort_col = st.columns([1, 1, 1, 1])
117
+ example_id = ex_col.number_input(
118
+ "Example ID",
119
+ 0,
120
+ total_examples - 1,
121
+ 0,
122
+ key="example_id",
123
+ )
124
+ # prec_threshold = p_col.slider(
125
+ # "Precision Threshold",
126
+ # 0.0,
127
+ # 1.0,
128
+ # 0.02,
129
+ # key="prec",
130
+ # help="Precision Threshold controls the specificity of the codes for the given example.",
131
+ # )
132
+ recall_threshold = r_col.slider(
133
+ "Recall Threshold",
134
+ 0.0,
135
+ 1.0,
136
+ 0.3,
137
+ key="recall",
138
+ help="Recall Threshold is the minimum fraction of tokens in the example that the code must activate on.",
139
+ )
140
+ example_truncation = trunc_col.number_input(
141
+ "Max Output Chars", 0, 10240, 1024, key="max_chars"
142
+ )
143
+ sort_by_options = ["Precision", "Recall", "Num Acts"]
144
+ sort_by_name = sort_col.radio(
145
+ "Sort By",
146
+ sort_by_options,
147
+ index=0,
148
+ horizontal=True,
149
+ help="Sorts the codes by the selected metric.",
150
+ )
151
+ sort_by = sort_by_options.index(sort_by_name)
152
+
153
+
154
+ button = st.button(
155
+ "Find Next Example",
156
+ key="find_next_example",
157
+ on_click=find_next_example,
158
+ args=(example_id,),
159
+ help="Find an example which has codes above the recall threshold.",
160
+ )
161
+ # if button:
162
+ # find_next_example(st.session_state["example_id"])
163
+
164
+
165
+ st.markdown("### Example Text")
166
+ trunc_suffix = "..." if example_truncation < len(tokens_text[example_id]) else ""
167
+ st.write(tokens_text[example_id][:example_truncation] + trunc_suffix)
168
+
169
+ cols = st.columns(7 if is_attn else 6)
170
+ cols[0].markdown("Search", help="Button to see token activations for the code.")
171
+ cols[1].write("Layer")
172
+ if is_attn:
173
+ cols[2].write("Head")
174
+ cols[-4].write("Code")
175
+ cols[-3].write("Precision")
176
+ cols[-2].write("Recall")
177
+ cols[-1].markdown(
178
+ "Num Acts",
179
+ help="Number of tokens that the code activates on in the acts dataset.",
180
+ )
181
+
182
+ all_codes = get_example_concept_codes(example_id)
183
+ all_codes = [
184
+ (cb_name, code_pr_info)
185
+ for cb_name, code_pr_infos in all_codes
186
+ for code_pr_info in code_pr_infos
187
+ ]
188
+ all_codes = sorted(all_codes, key=lambda x: x[1][1 + sort_by], reverse=True)
189
+
190
+ for cb_name, (code, p, r, acts) in all_codes:
191
+ cols = st.columns(7 if is_attn else 6)
192
+ code_button = cols[0].button(
193
+ "πŸ”",
194
+ key=f"ex-code-{code}-{cb_name}",
195
+ )
196
+ layer, head = code_search_utils.get_layer_head_from_adv_name(cb_name)
197
+ cols[1].write(str(layer))
198
+ if is_attn:
199
+ cols[2].write(str(head))
200
+
201
+ cols[-4].write(code)
202
+ cols[-3].write(f"{p*100:.2f}%")
203
+ cols[-2].write(f"{r*100:.2f}%")
204
+ cols[-1].write(str(acts))
205
+
206
+ if code_button:
207
+ show_examples_for_concept_code(
208
+ code,
209
+ layer,
210
+ head,
211
+ code_act_ratio=recall_threshold,
212
+ )
213
+ if len(all_codes) == 0:
214
+ st.markdown(
215
+ f"<div style='text-align:center'>No codes found at recall threshold: {recall_threshold}</div>",
216
+ unsafe_allow_html=True,
217
+ )
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy
2
+ torch>=2.0.0
3
+ tqdm
4
+ termcolor
5
+ streamlit_extras
utils.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Util functions for codebook features."""
2
+ import re
3
+ import typing
4
+ from dataclasses import dataclass
5
+ from functools import partial
6
+ from typing import Optional
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from termcolor import colored
12
+ from tqdm import tqdm
13
+
14
+
15
@dataclass
class CodeInfo:
    """Dataclass for codebook info.

    Holds the location of a code (code/layer/head/cb_at), optional patching
    positions, and optional interpretation metadata (description, regex,
    precision/recall, activation count).
    """

    code: int
    layer: int
    head: Optional[int]
    cb_at: Optional[str] = None

    # for patching interventions
    pos: Optional[int] = None
    code_pos: Optional[int] = -1

    # for description & regex-based interpretation
    description: Optional[str] = None
    regex: Optional[str] = None
    prec: Optional[float] = None
    recall: Optional[float] = None
    num_acts: Optional[int] = None

    def __post_init__(self):
        """Convert fields to their declared types.

        Uses explicit ``is not None`` checks so valid falsy inputs
        (head=0, pos=0, prec=0.0, recall=0.0, num_acts=0) are still
        converted; the previous truthiness checks skipped them.
        """
        self.code = int(self.code)
        self.layer = int(self.layer)
        if self.head is not None:
            self.head = int(self.head)
        if self.pos is not None:
            self.pos = int(self.pos)
        if self.code_pos is not None:
            self.code_pos = int(self.code_pos)
        if self.prec is not None:
            self.prec = float(self.prec)
            assert 0 <= self.prec <= 1
        if self.recall is not None:
            self.recall = float(self.recall)
            assert 0 <= self.recall <= 1
        if self.num_acts is not None:
            self.num_acts = int(self.num_acts)

    def check_description_info(self):
        """Check if the regex info is present."""
        assert self.num_acts is not None and self.description is not None
        if self.regex is not None:
            assert self.prec is not None and self.recall is not None

    def check_patch_info(self):
        """Check if the patch info is present."""
        # TODO: pos can be none for patching
        assert self.pos is not None and self.code_pos is not None

    def __repr__(self):
        """Return the string representation."""
        text = f"CodeInfo(code={self.code}, layer={self.layer}, head={self.head}, cb_at={self.cb_at}"
        if self.pos is not None or self.code_pos is not None:
            text += f", pos={self.pos}, code_pos={self.code_pos}"
        if self.description is not None:
            text += f", description={self.description}"
        if self.regex is not None:
            text += f", regex={self.regex}, prec={self.prec}, recall={self.recall}"
        if self.num_acts is not None:
            text += f", num_acts={self.num_acts}"
        text += ")"
        return text
78
+
79
+
80
def logits_to_pred(logits, tokenizer, k=5):
    """Return the top-k (token_string, probability) pairs for the final position."""
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    probs = sorted_logits.softmax(dim=-1)
    # decode the k highest-ranked token ids at the last sequence position
    per_batch_tokens = [
        tokenizer.convert_ids_to_tokens(row) for row in sorted_indices[:, -1, :k]
    ]
    flat_tokens = []
    for batch in per_batch_tokens:
        for tok in batch:
            flat_tokens.append(tokenizer.convert_tokens_to_string([tok]))
    return [(tok, probs[:, -1, i].item()) for i, tok in enumerate(flat_tokens)]
89
+
90
+
91
def patch_codebook_ids(
    corrupted_codebook_ids, hook, pos, cache, cache_pos=None, code_idx=None
):
    """Patch codebook ids at `pos` with the cached ids from `cache`.

    Args:
        corrupted_codebook_ids: ids tensor of shape (batch, seq, k) to modify.
        hook: hook object; `hook.name` keys into `cache`.
        pos: sequence position(s) to patch.
        cache: mapping from hook name to clean codebook ids.
        cache_pos: position(s) in the cache to copy from (defaults to `pos`).
        code_idx: if given, only these code slots are copied; all other
            slots at `pos` are disabled by setting them to -1.

    Returns:
        The modified `corrupted_codebook_ids` tensor.
    """
    if cache_pos is None:
        cache_pos = pos
    if code_idx is None:
        corrupted_codebook_ids[:, pos] = cache[hook.name][:, cache_pos]
    else:
        # iterate over the actual codebook dimension instead of a
        # hard-coded 32, so any number of codes per position works
        for code_id in range(corrupted_codebook_ids.shape[-1]):
            if code_id in code_idx:
                corrupted_codebook_ids[:, pos, code_id] = cache[hook.name][
                    :, cache_pos, code_id
                ]
            else:
                corrupted_codebook_ids[:, pos, code_id] = -1

    return corrupted_codebook_ids
109
+
110
+
111
def logits_to_ave_logit_diff(logits, answer_tokens, per_prompt=False):
    """Calculate the average logit difference between the answer and the other token."""
    # only the final position's logits matter for the answer
    last_pos_logits = logits[:, -1, :]
    picked = last_pos_logits.gather(dim=-1, index=answer_tokens)
    diffs = picked[:, 0] - picked[:, 1]
    return diffs if per_prompt else diffs.mean()
121
+
122
+
123
def normalize_patched_logit_diff(
    patched_logit_diff,
    base_average_logit_diff,
    corrupted_average_logit_diff,
):
    """Normalize the patched logit difference.

    0 means no change relative to the corrupted run; negative means
    actively worse; 1 means fully recovered clean performance; >1 means
    improved beyond the clean run.
    """
    improvement = patched_logit_diff - corrupted_average_logit_diff
    clean_to_corrupted = base_average_logit_diff - corrupted_average_logit_diff
    return improvement / clean_to_corrupted
136
+
137
+
138
def features_to_tokens(cb_key, cb_acts, num_codes, code=None):
    """Return the set of token ids each codebook feature activates on."""
    codebook_ids = cb_acts[cb_key]

    if code is not None:
        # single-code path: vectorized lookup of all (example, position) sites
        ex_idx, pos_idx, _ = np.where(codebook_ids == code)
        return list(zip(ex_idx, pos_idx))

    # full pass: bucket every activation site by its code id
    features_tokens = [[] for _ in range(num_codes)]
    n_examples, n_positions, n_slots = codebook_ids.shape
    for ex in tqdm(range(n_examples)):
        for p in range(n_positions):
            for slot in range(n_slots):
                features_tokens[codebook_ids[ex, p, slot]].append((ex, p))
    return features_tokens
153
+
154
+
155
def color_str(s: str, color: str, html: bool):
    """Render `s` in `color` — an HTML span when `html`, else a terminal escape."""
    return f"<span style='color:{color}'>{s}</span>" if html else colored(s, color)
161
+
162
+
163
def color_tokens_red_automata(tokens, red_idx, html=False):
    """Separate states with a dash and color red the tokens in red_idx.

    Skips a leading "<|endoftext|>" token. `red_idx` is assumed to be a
    sorted list of token indices to highlight.
    """
    ret_string = ""
    itr_over_red_idx = 0
    tokens_enumerate = enumerate(tokens)
    if tokens[0] == "<|endoftext|>":
        next(tokens_enumerate)  # skip BOS token
        # guard against empty red_idx (previously raised IndexError here)
        if red_idx and red_idx[0] == 0:
            itr_over_red_idx += 1
    for i, c in tokens_enumerate:
        if i % 2 == 1:
            ret_string += "-"  # dash between automaton states
        if itr_over_red_idx < len(red_idx) and i == red_idx[itr_over_red_idx]:
            ret_string += color_str(c, "red", html)
            itr_over_red_idx += 1
        else:
            ret_string += c
    return ret_string
181
+
182
+
183
def color_tokens_red(tokens, red_idx, n=3, html=False):
    """Color red the tokens in red_idx.

    Shows up to `n` tokens of context around each activating index and
    inserts " ... " between activation sites that are more than 2*n+1
    tokens apart. `red_idx` is assumed sorted ascending.
    """
    ret_string = ""
    last_colored_token_idx = -1
    for i in red_idx:
        c_str = tokens[i]
        if i <= last_colored_token_idx + 2 * n + 1:
            # close to the previous site: include the whole gap verbatim
            ret_string += "".join(tokens[last_colored_token_idx + 1 : i])
        else:
            # far from the previous site: n tokens of trailing context,
            # an ellipsis, then n tokens of leading context
            ret_string += "".join(
                tokens[last_colored_token_idx + 1 : last_colored_token_idx + n + 1]
            )
            ret_string += " ... "
            ret_string += "".join(tokens[i - n : i])
        ret_string += color_str(c_str, "red", html)
        last_colored_token_idx = i
    # trailing context after the last colored token
    # NOTE(review): stop index is `+ n` (not `+ n + 1`), so only n-1 trailing
    # tokens are shown — confirm whether this off-by-one is intended
    ret_string += "".join(
        tokens[
            last_colored_token_idx + 1 : min(last_colored_token_idx + n, len(tokens))
        ]
    )
    return ret_string
205
+
206
+
207
def prepare_example_print(
    example_id,
    example_tokens,
    tokens_to_color_red,
    html,
    color_red_fn=color_tokens_red,
):
    """Format one example: green example id, colon, tokens with activations in red."""
    separator = "<br>" if html else "\n"
    header = color_str(example_id, "green", html)
    body = color_red_fn(example_tokens, tokens_to_color_red, html=html)
    return header + ": " + body + separator
222
+
223
+
224
def tkn_print(
    ll,
    tokens,
    separate_states,
    n=3,
    max_examples=100,
    randomize=False,
    html=False,
    return_example_list=False,
):
    """Format and prints the tokens in ll.

    `ll` is a list of (example_idx, token_idx) activation sites, assumed to
    be grouped by example. Consecutive sites from the same example are
    accumulated and flushed as one formatted example; at most
    `max_examples` examples are emitted. Returns either a single formatted
    string or, when `return_example_list` is set, a list of
    (formatted_example, num_activations) tuples.
    """
    if randomize:
        raise NotImplementedError("Randomize not yet implemented.")
    indices = range(len(ll))
    print_output = [] if return_example_list else ""
    curr_ex = ll[0][0]
    total_examples = 0
    tokens_to_color_red = []
    # automata tokens get dash-separated state rendering; otherwise plain
    # context-window rendering with n tokens of context
    color_red_fn = (
        color_tokens_red_automata if separate_states else partial(color_tokens_red, n=n)
    )
    for idx in indices:
        if total_examples > max_examples:
            break
        i, j = ll[idx]

        if i != curr_ex and curr_ex >= 0:
            # site from a new example: flush the accumulated previous example
            curr_ex_output = prepare_example_print(
                curr_ex,
                tokens[curr_ex],
                tokens_to_color_red,
                html,
                color_red_fn,
            )
            total_examples += 1
            if return_example_list:
                print_output.append((curr_ex_output, len(tokens_to_color_red)))
            else:
                print_output += curr_ex_output
            curr_ex = i
            tokens_to_color_red = []
        tokens_to_color_red.append(j)
    # flush the final (possibly only) example
    curr_ex_output = prepare_example_print(
        curr_ex,
        tokens[curr_ex],
        tokens_to_color_red,
        html,
        color_red_fn,
    )
    if return_example_list:
        print_output.append((curr_ex_output, len(tokens_to_color_red)))
    else:
        print_output += curr_ex_output
        # trailing separator bar in string mode only
        asterisk_str = "********************************************"
        print_output += color_str(asterisk_str, "green", html)
    total_examples += 1

    return print_output
282
+
283
+
284
def print_ft_tkns(
    ft_tkns,
    tokens,
    separate_states=False,
    n=3,
    start=0,
    stop=1000,
    indices=None,
    max_examples=100,
    freq_filter=None,
    randomize=False,
    html=False,
    return_example_list=False,
):
    """Print the tokens for the codebook features.

    For each code index in `indices` (default: range(start, stop)), returns
    its activation frequency and its formatted activation examples (via
    tkn_print). Codes that activate on more than `freq_filter` percent of
    tokens are skipped. Returns (codes, token_act_freqs, token_acts).
    """
    indices = list(range(start, stop)) if indices is None else indices
    # total token count, assuming all examples have equal length
    num_tokens = len(tokens) * len(tokens[0])
    codes, token_act_freqs, token_acts = [], [], []
    for i in indices:
        tkns = ft_tkns[i]
        # (absolute activation count, percentage of all tokens)
        freq = (len(tkns), 100 * len(tkns) / num_tokens)
        if freq_filter is not None and freq[1] > freq_filter:
            continue
        codes.append(i)
        token_act_freqs.append(freq)
        if len(tkns) > 0:
            tkn_acts = tkn_print(
                tkns,
                tokens,
                separate_states,
                n=n,
                max_examples=max_examples,
                randomize=randomize,
                html=html,
                return_example_list=return_example_list,
            )
            token_acts.append(tkn_acts)
        else:
            token_acts.append("")
    return codes, token_act_freqs, token_acts
324
+
325
+
326
def patch_in_codes(run_cb_ids, hook, pos, code, code_pos=None):
    """Patch in the `code` at `run_cb_ids`.

    Args:
        run_cb_ids: codebook ids tensor of shape (batch, seq, k).
        hook: hook object (unused; kept for the hook-function signature).
        pos: sequence position(s) to patch; None patches all positions.
        code: code id to write in.
        code_pos: code-slot index to patch, None for all slots, or the
            string "append" to add `code` as a new slot on the last dim.

    Returns:
        The patched ids tensor.
    """
    pos = slice(None) if pos is None else pos
    code_pos = slice(None) if code_pos is None else code_pos

    if code_pos == "append":
        assert pos == slice(None)
        # the padded entries already hold `code`; return immediately instead
        # of falling through to the indexing below, which would index with
        # the string "append" and raise
        return F.pad(run_cb_ids, (0, 1), mode="constant", value=code)
    if isinstance(pos, typing.Iterable):  # single check (was duplicated)
        for p in pos:
            run_cb_ids[:, p, code_pos] = code
    else:
        run_cb_ids[:, pos, code_pos] = code
    return run_cb_ids
340
+
341
+
342
def get_cb_layer_name(cb_at, layer_idx, head_idx=None):
    """Build the hook-point name for a codebook layer (per-head when head_idx given)."""
    prefix = f"blocks.{layer_idx}.{cb_at}.codebook_layer"
    if head_idx is None:
        return f"{prefix}.hook_codebook_ids"
    return f"{prefix}.codebook.{head_idx}.hook_codebook_ids"
348
+
349
+
350
def get_cb_layer_names(layer, patch_types, n_heads):
    """Get the layer names used to store hooks/cache.

    `patch_types` may contain "attn_out" (all heads of `layer`), "mlp_out",
    or targeted entries of the form "attn_<layer>_head_<head>" / "mlp_<layer>".
    """
    layer_names = []
    attn_added, mlp_added = False, False
    if "attn_out" in patch_types:
        attn_added = True
        for head in range(n_heads):
            layer_names.append(
                f"blocks.{layer}.attn.codebook_layer.codebook.{head}.hook_codebook_ids"
            )
    if "mlp_out" in patch_types:
        mlp_added = True
        layer_names.append(f"blocks.{layer}.mlp.codebook_layer.hook_codebook_ids")

    for patch_type in patch_types:
        # match patch_type of the pattern attn_<layer>_head_<head>;
        # \d+ (not \d) so multi-digit layers/heads (e.g. layer 12, head 10) match
        attn_head = re.match(r"attn_(\d+)_head_(\d+)", patch_type)
        if (not attn_added) and attn_head and attn_head[1] == str(layer):
            layer_names.append(
                f"blocks.{layer}.attn.codebook_layer.codebook.{attn_head[2]}.hook_codebook_ids"
            )
        mlp = re.match(r"mlp_(\d+)", patch_type)
        if (not mlp_added) and mlp and mlp[1] == str(layer):
            layer_names.append(f"blocks.{layer}.mlp.codebook_layer.hook_codebook_ids")

    return layer_names
376
+
377
+
378
def cb_layer_name_to_info(layer_name):
    """Invert get_cb_layer_name: hook name -> (cb_at, layer_idx, head_idx)."""
    parts = layer_name.split(".")
    layer_idx = int(parts[1])
    cb_at = parts[2]
    # MLP codebooks have no per-head index in their hook name
    head_idx = None if cb_at == "mlp" else int(parts[5])
    return cb_at, layer_idx, head_idx
388
+
389
+
390
def get_hooks(code, cb_at, layer_idx, head_idx=None, pos=None):
    """Build (hook_name, hook_fn) pairs that patch each `code[i]` into its codebook."""
    hooks = []
    for i in range(len(code)):
        patch_fn = partial(patch_in_codes, pos=pos, code=code[i])
        hooks.append((get_cb_layer_name(cb_at[i], layer_idx[i], head_idx[i]), patch_fn))
    return hooks
399
+
400
+
401
def run_with_codes(
    input, cb_model, code, cb_at, layer_idx, head_idx=None, pos=None, prepend_bos=True
):
    """Run the model with the codebook features patched in.

    `code`, `cb_at`, `layer_idx`, `head_idx` are parallel sequences: for
    each i, `code[i]` is forced into the codebook at
    `cb_at[i]`/`layer_idx[i]`/`head_idx[i]` at sequence position(s) `pos`.
    Returns (patched_logits, patched_cache) from run_with_cache.
    """
    hook_fns = [
        partial(patch_in_codes, pos=pos, code=code[i]) for i in range(len(code))
    ]
    # clear stale metrics / hook kwargs left over from a previous patched run
    cb_model.reset_codebook_metrics()
    cb_model.reset_hook_kwargs()
    fwd_hooks = [
        (get_cb_layer_name(cb_at[i], layer_idx[i], head_idx[i]), hook_fns[i])
        for i in range(len(cb_at))
    ]
    with cb_model.hooks(fwd_hooks, [], True, False) as hooked_model:
        patched_logits, patched_cache = hooked_model.run_with_cache(
            input, prepend_bos=prepend_bos
        )
    return patched_logits, patched_cache
419
+
420
+
421
def in_hook_list(list_of_arg_tuples, layer, head=None):
    """Check if the component specified by `layer` and `head` is in the `list_of_arg_tuples`."""
    # head=None means we are asking about the MLP codebook of `layer`
    for info in list_of_arg_tuples:
        if head is None:
            found = info.cb_at == "mlp" and info.layer == layer
        else:
            found = (
                info.cb_at == "attn" and info.layer == layer and info.head == head
            )
        if found:
            return True
    return False
436
+
437
+
438
+ # def generate_with_codes(input, code, cb_at, layer_idx, head_idx=None, pos=None, disable_other_comps=False):
439
def generate_with_codes(
    input,
    cb_model,
    list_of_code_infos=(),
    disable_other_comps=False,
    automata=None,
    generate_kwargs=None,
):
    """Model's generation with the codebook features patched in.

    Each CodeInfo in `list_of_code_infos` is forced into its codebook while
    generating from `input`. When `disable_other_comps` is set, every
    codebook component that is NOT being patched is restricted
    (disable_topk=1 on the last token). If `automata` is given, the
    generated sequence is converted to a trajectory via seq_to_traj.
    """
    if generate_kwargs is None:
        generate_kwargs = {}
    hook_fns = [
        partial(patch_in_codes, pos=tupl.pos, code=tupl.code)
        for tupl in list_of_code_infos
    ]
    fwd_hooks = [
        (get_cb_layer_name(tupl.cb_at, tupl.layer, tupl.head), hook_fns[i])
        for i, tupl in enumerate(list_of_code_infos)
    ]
    cb_model.reset_hook_kwargs()
    if disable_other_comps:
        # cb[0] holds the per-head attention codebooks, cb[1] the MLP codebook
        for layer, cb in cb_model.all_codebooks.items():
            for head_idx, head in enumerate(cb[0].codebook):
                if not in_hook_list(list_of_code_infos, layer, head_idx):
                    head.set_hook_kwargs(
                        disable_topk=1, disable_for_tkns=[-1], keep_k_codes=False
                    )
            if not in_hook_list(list_of_code_infos, layer):
                cb[1].set_hook_kwargs(
                    disable_topk=1, disable_for_tkns=[-1], keep_k_codes=False
                )
    with cb_model.hooks(fwd_hooks, [], True, False) as hooked_model:
        gen = hooked_model.generate(input, **generate_kwargs)
    return automata.seq_to_traj(gen)[0] if automata is not None else gen
473
+
474
+
475
def kl_div(logits1, logits2, pos=-1, reduction="batchmean"):
    """KL divergence between the softmax distributions at position `pos`.

    Note: F.kl_div treats its second argument as the target distribution,
    so this computes KL(softmax(logits2) || softmax(logits1)).
    """
    log_probs1 = F.log_softmax(logits1[:, pos, :], dim=-1)
    log_probs2 = F.log_softmax(logits2[:, pos, :], dim=-1)
    return F.kl_div(log_probs1, log_probs2, log_target=True, reduction=reduction)
485
+
486
+
487
def JSD(logits1, logits2, pos=-1, reduction="batchmean"):
    """Compute the Jensen-Shannon divergence between two distributions."""
    # for 3-d (batch, seq, vocab) inputs, compare only position `pos`
    if len(logits1.shape) == 3:
        logits1, logits2 = logits1[:, pos, :], logits2[:, pos, :]

    # log of the mixture distribution m = (p1 + p2) / 2
    log_m = (0.5 * (F.softmax(logits1, dim=-1) + F.softmax(logits2, dim=-1))).log()

    kl_to_p1 = F.kl_div(
        log_m,
        F.log_softmax(logits1, dim=-1),
        log_target=True,
        reduction=reduction,
    )
    kl_to_p2 = F.kl_div(
        log_m,
        F.log_softmax(logits2, dim=-1),
        log_target=True,
        reduction=reduction,
    )
    return 0.5 * (kl_to_p1 + kl_to_p2)
511
+
512
+
513
def residual_stream_patching_hook(resid_pre, hook, cache, position: int):
    """Overwrite the residual stream at `position` with the cached clean activations."""
    resid_pre[:, position, :] = cache[hook.name][:, position, :]
    return resid_pre
518
+
519
+
520
def find_code_changes(cache1, cache2, pos=None):
    """Find the codebook codes that are different between the two caches."""
    for k in cache1.keys():
        if "codebook" in k:
            c1 = cache1[k][0, pos]
            c2 = cache2[k][0, pos]
            if not torch.all(c1 == c2):
                print(cb_layer_name_to_info(k), c1.tolist(), c2.tolist())
                # NOTE(review): duplicated print below — looks accidental; confirm
                print(cb_layer_name_to_info(k), c1.tolist(), c2.tolist())
529
+
530
+
531
def common_codes_in_cache(cache_codes, threshold=0.0):
    """Get the common code in the cache.

    Returns (codes, percentages) sorted by descending activation percentage,
    keeping only codes above `threshold` percent.
    """
    codes, counts = torch.unique(cache_codes, return_counts=True, sorted=True)
    # convert raw counts to percentage of sequence positions
    # NOTE: assumes cache_codes is (batch, seq, ...) with shape[1] = seq length
    pct = counts.float() * 100 / cache_codes.shape[1]
    pct, order = torch.sort(pct, descending=True)
    codes = codes[order]
    keep = pct > threshold
    return codes[keep], pct[keep]
541
+
542
+
543
def parse_code_info_string(
    info_str: str, cb_at="attn", pos=None, code_pos=-1
) -> CodeInfo:
    """Parse the code info string.

    The format of the `info_str` is:
    `code: 0, layer: 0, head: 0, occ_freq: 0.0, train_act_freq: 0.0`.
    """
    code_str, layer_str, head_str, occ_str, taf_str = info_str.split(", ")
    code = int(code_str.split(": ")[1])
    layer = int(layer_str.split(": ")[1])
    head = int(head_str.split(": ")[1]) if head_str else None
    # frequencies are parsed (validating the format) but not stored on CodeInfo
    occ_freq = float(occ_str.split(": ")[1])
    train_act_freq = float(taf_str.split(": ")[1])
    return CodeInfo(code, layer, head, pos=pos, code_pos=code_pos, cb_at=cb_at)
558
+
559
+
560
def parse_concept_codes_string(info_str: str, pos=None, code_append=False):
    """Parse the concept codes string.

    `info_str` has one code-info line per code (see parse_code_info_string).
    When `code_append` is False, consecutive codes that share a layer & head
    get decreasing `code_pos` values (-1, -2, ...) so each is patched into a
    distinct slot of that component's codebook.
    """
    code_info_strs = info_str.strip().split("\n")
    concept_codes = []
    layer, head = None, None
    code_pos = "append" if code_append else -1
    for code_info_str in code_info_strs:
        concept_codes.append(
            parse_code_info_string(code_info_str, pos=pos, code_pos=code_pos)
        )
        if code_append:
            continue
        # same component as the previous code -> occupy the next-earlier slot
        if layer == concept_codes[-1].layer and head == concept_codes[-1].head:
            code_pos -= 1
        else:
            code_pos = -1
        concept_codes[-1].code_pos = code_pos
        layer, head = concept_codes[-1].layer, concept_codes[-1].head
    return concept_codes
webapp_utils.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility functions for running webapp using streamlit."""
2
+
3
+
4
+ import streamlit as st
5
+ from streamlit.components.v1 import html
6
+
7
+ import code_search_utils
8
+ import utils
9
+
10
+ _PERSIST_STATE_KEY = f"{__name__}_PERSIST"
11
+ TOTAL_SAVE_BUTTONS = 0
12
+
13
+
14
def persist(key: str) -> str:
    """Mark widget state as persistent.

    Registers `key` in a module-level set inside the session state so that
    load_widget_state() can restore the widget's value across page
    switches. Returns `key` unchanged so the call can be used inline as a
    widget's `key=` argument.
    """
    if _PERSIST_STATE_KEY not in st.session_state:
        st.session_state[_PERSIST_STATE_KEY] = set()

    st.session_state[_PERSIST_STATE_KEY].add(key)

    return key
22
+
23
+
24
def load_widget_state():
    """Load persistent widget state.

    Re-assigns every key registered via persist() to its current value;
    the self-assignment keeps streamlit from dropping the values when the
    corresponding widgets are not rendered on the current page.
    """
    if _PERSIST_STATE_KEY in st.session_state:
        st.session_state.update(
            {
                key: value
                for key, value in st.session_state.items()
                if key in st.session_state[_PERSIST_STATE_KEY]
            }
        )
34
+
35
+
36
@st.cache_resource
def load_dataset_cache(dataset_cache_path):
    """Load cache files required for dataset from `cache_path`.

    Thin wrapper so the expensive disk load is memoized by streamlit
    across reruns.
    """
    return code_search_utils.load_dataset_cache(dataset_cache_path)
40
+
41
+
42
@st.cache_resource
def load_code_search_cache(codes_cache_path, dataset_cache_path):
    """Load cache files required for code search from `codes_cache_path`.

    Combines the dataset cache (token strings/text/byte positions) with the
    codebook-activation cache (activations, per-code activation counts,
    metrics) into one memoized tuple.
    """
    (
        tokens_str,
        tokens_text,
        token_byte_pos,
    ) = load_dataset_cache(dataset_cache_path)
    (
        cb_acts,
        act_count_ft_tkns,
        metrics,
    ) = code_search_utils.load_code_search_cache(codes_cache_path)
    return tokens_str, tokens_text, token_byte_pos, cb_acts, act_count_ft_tkns, metrics
56
+
57
+
58
@st.cache_data(max_entries=100)
def load_ft_tkns(model_id, layer, head=None, code=None):
    """Load the code-to-token map for a codebook.

    Reads the codebook location (cb_at/ccb) and activations from session
    state; when `code` is given, only that code's activation sites are
    returned (see utils.features_to_tokens).
    """
    # model_id required to not mix cache_data for different models
    assert model_id is not None
    cb_at = st.session_state["cb_at"]
    ccb = st.session_state["ccb"]
    cb_acts = st.session_state["cb_acts"]
    # per-head codebooks carry the head index in their cache key
    if head is not None:
        cb_name = f"layer{layer}_{cb_at}{ccb}{head}"
    else:
        cb_name = f"layer{layer}_{cb_at}"
    return utils.features_to_tokens(
        cb_name,
        cb_acts,
        num_codes=st.session_state["num_codes"],
        code=code,
    )
76
+
77
+
78
def get_code_acts(
    model_id,
    tokens_str,
    code,
    layer,
    head=None,
    ctx_size=5,
    num_examples=100,
    return_example_list=False,
):
    """Get the token activations for a given code.

    Returns (acts, freqs) for the single requested code: the formatted
    activation examples (HTML) and the (count, percentage) frequency tuple.
    """
    ft_tkns = load_ft_tkns(model_id, layer, head, code)
    ft_tkns = [ft_tkns]  # print_ft_tkns expects a per-code list; index 0 is ours
    _, freqs, acts = utils.print_ft_tkns(
        ft_tkns,
        tokens=tokens_str,
        indices=[0],
        html=True,
        n=ctx_size,
        max_examples=num_examples,
        return_example_list=return_example_list,
    )
    return acts[0], freqs[0]
101
+
102
+
103
def set_ct_acts(code, layer, head=None, extra_args=None, is_attn=False):
    """Set the code and layer for the token activations.

    Stores the selection in session state (used by the code-token
    activations section) and scrolls the page to that section.
    """
    # convert to int
    code, layer, head = int(code), int(layer), int(head) if head is not None else None
    st.session_state["ct_act_code"] = code
    st.session_state["ct_act_layer"] = layer
    if is_attn:
        st.session_state["ct_act_head"] = head
    st.session_state["filter_codes"] = False

    # jump to the "#code-token-activations" anchor via an injected script
    my_html = """
    <script>
        document.location.href = "#code-token-activations";
    </script>
    """
    html(my_html, height=0, width=0, scrolling=False)
119
+
120
+
121
def find_next_code(code, layer_code_acts, act_range=None):
    """Find the next code that has activations in the given range.

    Scans forward from `code` and returns the first code whose activation
    count falls inside `act_range` (inclusive); returns `code` unchanged
    when `act_range` is None or no such code exists.
    """
    if act_range is None:
        return code
    low, high = act_range
    for offset, act_count in enumerate(layer_code_acts[code:]):
        if low <= act_count <= high:
            return code + offset
    return code
130
+
131
+
132
def escape_markdown(text):
    """Escapes markdown special characters."""
    # backslash is listed first so earlier escapes are not double-escaped
    special_chars = r"\`*_{}[]()#+-.!$"
    for ch in special_chars:
        text = text.replace(ch, "\\" + ch)
    return text
138
+
139
+
140
def add_code_to_demo_file(code_info: utils.CodeInfo, file_path: str):
    """Add code to demo file.

    Appends a two-line entry to `file_path`: a `# description[: regex]`
    comment line followed by the code's layer/head/code and stats.
    Requires description info on `code_info`; returns True on success.
    """
    # TODO: add check for duplicate code and return False if found
    # TODO: convert saved codes to databases instead of txt files?
    code_info.check_description_info()
    with open(file_path, "a") as f:
        f.write("\n")
        f.write(f"# {code_info.description}:")
        if code_info.regex:
            f.write(f" {code_info.regex}")
        f.write("\n")
        f.write(f"layer: {code_info.layer}")
        f.write(f", head: {code_info.head}" if code_info.head is not None else "")
        f.write(f", code: {code_info.code}")
        if code_info.regex:
            f.write(f", prec: {code_info.prec:.4f}, recall: {code_info.recall:.4f}")
        f.write(f", num_acts: {code_info.num_acts}\n")
    return True
158
+
159
+
160
def add_save_code_button(
    demo_file_path: str,
    num_acts: int,
    save_regex: bool = False,
    prec: float = None,
    recall: float = None,
    button_st_container=st,
    button_text: bool = False,
    button_key_suffix: str = "",
):
    """Add a button on streamlit to save code to demo codes file.

    The first click reveals a description text input; once a description is
    available in session state (on the rerun after the user submits it),
    the currently selected code is appended to `demo_file_path`.
    """
    save_button = button_st_container.button(
        "πŸ’Ύ" + (" Save Code to Demos" if button_text else ""),
        key=f"save_code_button{button_key_suffix}",
        help="Save code to demo codes file",
    )
    if save_button:
        description = st.text_input(
            "Write a description for the code",
            key="save_code_desc",
        )
        if not description:
            return

    # re-read from session state so the save also fires on the rerun that
    # follows the user submitting the text input
    description = st.session_state.get("save_code_desc", None)
    if description:
        layer = st.session_state["ct_act_layer"]
        is_attn = st.session_state["is_attn"]
        if is_attn:
            head = st.session_state["ct_act_head"]
        else:
            head = None

        code = st.session_state["ct_act_code"]
        code_info = utils.CodeInfo(
            layer=layer,
            head=head,
            code=code,
            description=description,
            num_acts=num_acts,
        )

        # optionally attach the regex-based interpretation stats
        if save_regex:
            code_info.regex = st.session_state["regex_pattern"]
            code_info.prec = prec
            code_info.recall = recall

        saved = add_code_to_demo_file(code_info, demo_file_path)
        if saved:
            st.success("Code saved!", icon="πŸŽ‰")
+ st.success("Code saved!", icon="πŸŽ‰")
webapp_utils_full_ft_tkns_for_ts.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility functions for running webapp using streamlit."""
2
+
3
+
4
+ import streamlit as st
5
+ from streamlit.components.v1 import html
6
+
7
+ import code_search_utils
8
+ import utils
9
+
10
# Session-state key under which the set of persistent widget keys is stored.
_PERSIST_STATE_KEY = f"{__name__}_PERSIST"
# NOTE(review): appears unused within this module — confirm before removing.
TOTAL_SAVE_BUTTONS = 0
12
+
13
+
14
def persist(key: str) -> str:
    """Register `key` as a persistent widget key and return it unchanged.

    The key is recorded in a set stored in session state so that
    `load_widget_state` can restore its value across page switches.
    """
    persisted_keys = st.session_state.setdefault(_PERSIST_STATE_KEY, set())
    persisted_keys.add(key)
    return key
22
+
23
+
24
def load_widget_state():
    """Restore the values of all widgets registered via `persist`."""
    persisted_keys = st.session_state.get(_PERSIST_STATE_KEY)
    if persisted_keys is None:
        return
    restored = {
        key: value
        for key, value in st.session_state.items()
        if key in persisted_keys
    }
    st.session_state.update(restored)
34
+
35
+
36
@st.cache_resource
def load_dataset_cache(dataset_cache_path):
    """Load cache files required for dataset from `cache_path`.

    Thin wrapper around `code_search_utils.load_dataset_cache` decorated with
    `st.cache_resource` so streamlit reruns reuse the loaded data (a
    (tokens_str, tokens_text, token_byte_pos) tuple, per the unpacking in
    `load_code_search_cache`).
    """
    return code_search_utils.load_dataset_cache(dataset_cache_path)
40
+
41
+
42
@st.cache_resource
def load_code_search_cache(codes_cache_path, dataset_cache_path):
    """Load cache files required for code search from `codes_cache_path`.

    Combines the dataset cache (tokens and byte positions) with the
    codebook-activation cache and returns them as one flat tuple.
    """
    tokens_str, tokens_text, token_byte_pos = load_dataset_cache(dataset_cache_path)
    cb_acts, act_count_ft_tkns, metrics = code_search_utils.load_code_search_cache(
        codes_cache_path
    )
    return tokens_str, tokens_text, token_byte_pos, cb_acts, act_count_ft_tkns, metrics
56
+
57
+
58
@st.cache_data(max_entries=100)
def load_ft_tkns(model_id, layer, head=None, code=None):
    """Load the code-to-token map for a codebook.

    `model_id` is not used in the body but keys the streamlit data cache so
    results from different models are never mixed.
    """
    assert model_id is not None
    state = st.session_state
    cb_at, ccb, cb_acts = state["cb_at"], state["ccb"], state["cb_acts"]
    # Attention codebooks are named per head; others only per layer.
    head_suffix = "" if head is None else f"{ccb}{head}"
    cb_name = f"layer{layer}_{cb_at}{head_suffix}"
    return utils.features_to_tokens(
        cb_name,
        cb_acts,
        num_codes=state["num_codes"],
        code=code,
    )
76
+
77
+
78
def get_code_acts(
    model_id,
    tokens_str,
    code,
    layer,
    head=None,
    ctx_size=5,
    num_examples=100,
    return_example_list=False,
):
    """Get the token activations for a given code."""
    # For TinyStories models the full code->token map is fetched (code=None)
    # and sliced afterwards; other models fetch just the requested code.
    wants_full_map = "tinystories" in model_id.lower()
    requested_code = None if wants_full_map else code
    ft_tkns = load_ft_tkns(model_id, layer, head, requested_code)
    if wants_full_map:
        ft_tkns = ft_tkns[code : code + 1]
    else:
        ft_tkns = [ft_tkns]
    _, freqs, acts = utils.print_ft_tkns(
        ft_tkns,
        tokens=tokens_str,
        indices=[0],
        html=True,
        n=ctx_size,
        max_examples=num_examples,
        return_example_list=return_example_list,
    )
    return acts[0], freqs[0]
105
+
106
+
107
def set_ct_acts(code, layer, head=None, extra_args=None, is_attn=False):
    """Set the code and layer for the token activations.

    Stores the selection in streamlit session state, copies a one-line
    summary of the selection to the clipboard via injected JavaScript, and
    jumps the page to the "code-token-activations" anchor.
    """
    # convert to int (values may arrive as strings from widget callbacks)
    code, layer, head = int(code), int(layer), int(head) if head is not None else None
    st.session_state["ct_act_code"] = code
    st.session_state["ct_act_layer"] = layer
    if is_attn:
        st.session_state["ct_act_head"] = head
    # Turn off code filtering so the newly selected code is always shown.
    st.session_state["filter_codes"] = False

    info_txt = (
        f"layer: {layer},{f' head: {head},' if head is not None else ''} code: {code}"
    )
    if extra_args:
        for k, v in extra_args.items():
            info_txt += f", {k}: {v}"
    # NOTE(review): info_txt is interpolated into the JS string literal below
    # without escaping — a double quote in an extra_args value would break the
    # script. Confirm upstream values are quote-free.
    my_html = f"""
    <script>
        async function myF() {{
            await new Promise(r => setTimeout(r, 10));
            const textarea = document.createElement("textarea");
            textarea.textContent = "{info_txt}";
            document.body.appendChild(textarea);
            textarea.select();
            document.execCommand("copy");
            document.body.removeChild(textarea);
        }}
        myF();
        window.location.hash = "code-token-activations";
        console.log(window.location.hash)
    </script>
    """
    html(my_html, height=0, width=0, scrolling=False)
140
+
141
+
142
def find_next_code(code, layer_code_acts, act_range=None):
    """Find the next code that has activations in the given range.

    Scans forward from `code` (inclusive) and returns the first code whose
    activation count lies within `act_range`; returns `code` unchanged when
    `act_range` is None or no such code exists.
    """
    if act_range is None:
        return code
    for offset, act_count in enumerate(layer_code_acts[code:]):
        if act_range[0] <= act_count <= act_range[1]:
            return code + offset
    return code
153
+
154
+
155
def escape_markdown(text):
    """Escapes markdown special characters.

    Prefixes every markdown-significant character (including backslash
    itself) with a single backslash.
    """
    special_chars = r"\`*_{}[]()#+-.!$"
    return "".join("\\" + ch if ch in special_chars else ch for ch in text)
161
+
162
+
163
def add_code_to_demo_file(code_info: utils.CodeInfo, file_path: str):
    """Add code to demo file.

    Appends a two-line entry (description header plus a metadata line) to
    the demo codes text file and returns True.
    """
    # TODO: add check for duplicate code and return False if found
    # TODO: convert saved codes to databases instead of txt files?
    code_info.check_description_info()

    header = f"# {code_info.description}:"
    if code_info.regex:
        header += f" {code_info.regex}"

    details = f"layer: {code_info.layer}"
    if code_info.head is not None:
        details += f", head: {code_info.head}"
    details += f", code: {code_info.code}"
    if code_info.regex:
        details += f", prec: {code_info.prec:.4f}, recall: {code_info.recall:.4f}"
    details += f", num_acts: {code_info.num_acts}"

    with open(file_path, "a") as f:
        f.write(f"\n{header}\n{details}\n")
    return True
181
+
182
+
183
def add_save_code_button(
    demo_file_path: str,
    num_acts: int,
    save_regex: bool = False,
    prec: "float | None" = None,
    recall: "float | None" = None,
    button_st_container=st,
    button_text: bool = False,
    button_key_suffix: str = "",
):
    """Add a button on streamlit to save code to demo codes file.

    Args:
        demo_file_path: path of the demo codes text file to append to.
        num_acts: number of activations of the code being saved.
        save_regex: if True, also store the regex pattern from session state
            (with its precision/recall) on the saved code.
        prec: precision of the regex match; only used when `save_regex`.
        recall: recall of the regex match; only used when `save_regex`.
        button_st_container: streamlit container to render the button in.
        button_text: if True, show a text label next to the save icon.
        button_key_suffix: suffix appended to the widget key so the button
            can appear multiple times on one page without key clashes.
    """
    save_button = button_st_container.button(
        "πŸ’Ύ" + (" Save Code to Demos" if button_text else ""),
        key=f"save_code_button{button_key_suffix}",
        help="Save code to demo codes file",
    )
    if save_button:
        # First click: render the description input and return early until a
        # non-empty description is available (on a later rerun).
        description = st.text_input(
            "Write a description for the code",
            key="save_code_desc",
        )
        if not description:
            return

    # NOTE(review): re-read from session state — on the rerun triggered by
    # submitting the text input, the button reads False but the widget value
    # persists under "save_code_desc"; this is what actually triggers the save.
    description = st.session_state.get("save_code_desc", None)
    if description:
        layer = st.session_state["ct_act_layer"]
        is_attn = st.session_state["is_attn"]
        if is_attn:
            head = st.session_state["ct_act_head"]
        else:
            # Non-attention codebooks have no head dimension.
            head = None

        code = st.session_state["ct_act_code"]
        code_info = utils.CodeInfo(
            layer=layer,
            head=head,
            code=code,
            description=description,
            num_acts=num_acts,
        )

        if save_regex:
            code_info.regex = st.session_state["regex_pattern"]
            code_info.prec = prec
            code_info.recall = recall

        saved = add_code_to_demo_file(code_info, demo_file_path)
        if saved:
            st.success("Code saved!", icon="πŸŽ‰")