antenmanuuel committed
Commit 03de5c7 · verified · 1 parent: efa23c6

Upload folder using huggingface_hub

attention_comparison_helpers.py CHANGED
@@ -8,13 +8,18 @@ from routes.tokenize import tokenize_text
 
 async def get_attention_comparison_bert(request: ComparisonRequest):
     """
-    BERT-specific implementation of attention comparison
+    BERT and DistilBERT implementation of attention comparison
     """
     try:
-        print(f"\n=== USING BERT ATTENTION COMPARISON IMPLEMENTATION ===")
+        model_type = "DistilBERT" if "distilbert" in request.model_name.lower() else "BERT"
+        print(f"\n=== USING {model_type} ATTENTION COMPARISON IMPLEMENTATION ===")
 
         # 1. Get the "before" attention data
-        before_attention_request = AttentionRequest(text=request.text, model_name=request.model_name)
+        before_attention_request = AttentionRequest(
+            text=request.text,
+            model_name=request.model_name,
+            visualization_method=request.visualization_method
+        )
         before_data = (await get_attention_matrices(before_attention_request))["attention_data"]
 
         # 2. Tokenize the text
@@ -46,7 +51,7 @@ async def get_attention_comparison_bert(request: ComparisonRequest):
 
         # HANDLE PUNCTUATION
         if is_punctuation:
-            print(f"\nUsing BERT punctuation replacement approach")
+            print(f"\nUsing {model_type} punctuation replacement approach")
 
             # Find all occurrences of this punctuation in the original text
             punctuation_positions = [pos for pos, char in enumerate(original_text) if char == selected_token]
@@ -61,8 +66,9 @@ async def get_attention_comparison_bert(request: ComparisonRequest):
            # We'll use a heuristic based on the token's position
 
            # Count non-special tokens before our selected token
+           special_tokens = ["[CLS]", "[SEP]"]  # Same special tokens for BERT and DistilBERT
            non_special_tokens_before = sum(1 for t in tokens[:request.masked_index]
-                                           if t["text"] not in ["[CLS]", "[SEP]"])
+                                           if t["text"] not in special_tokens)
 
            # Select the corresponding punctuation position (or last one if out of range)
            punct_idx = min(non_special_tokens_before, len(punctuation_positions) - 1)
@@ -76,14 +82,18 @@ async def get_attention_comparison_bert(request: ComparisonRequest):
            print(f"Replaced text: '{replaced_text}'")
 
            # Get the after attention data
-           after_attention_request = AttentionRequest(text=replaced_text, model_name=request.model_name)
+           after_attention_request = AttentionRequest(
+               text=replaced_text,
+               model_name=request.model_name,
+               visualization_method=request.visualization_method
+           )
            after_data = (await get_attention_matrices(after_attention_request))["attention_data"]
 
            # Return comparison data
            return {"before_attention": before_data, "after_attention": after_data}
 
-        # HANDLE REGULAR WORDS FOR BERT
-        print(f"\nUsing BERT word replacement approach")
+        # HANDLE REGULAR WORDS FOR BERT/DistilBERT
+        print(f"\nUsing {model_type} word replacement approach")
         words = original_text.split()
         print(f"Words: {words}")
 
@@ -193,7 +203,7 @@ async def get_attention_comparison_bert(request: ComparisonRequest):
            print(f"Could not determine token position, using simple word replacement")
            words = original_text.split()
 
-           # Adjust for special tokens in BERT ([CLS])
+           # Adjust for special tokens in BERT/DistilBERT ([CLS])
            adjusted_index = max(0, request.masked_index - 1)
            word_idx = min(adjusted_index, len(words) - 1)
 
@@ -218,14 +228,18 @@ async def get_attention_comparison_bert(request: ComparisonRequest):
         print(f"Replaced text: '{replaced_text}'")
 
         # Get the after attention data
-        after_attention_request = AttentionRequest(text=replaced_text, model_name=request.model_name)
+        after_attention_request = AttentionRequest(
+            text=replaced_text,
+            model_name=request.model_name,
+            visualization_method=request.visualization_method
+        )
         after_data = (await get_attention_matrices(after_attention_request))["attention_data"]
 
         # Return comparison data
         return {"before_attention": before_data, "after_attention": after_data}
 
     except Exception as e:
-        print(f"BERT Attention comparison error: {str(e)}")
+        print(f"{model_type} Attention comparison error: {str(e)}")
         import traceback
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=str(e))
@@ -243,7 +257,11 @@ async def get_attention_comparison_roberta(request: ComparisonRequest):
         print(f"Replacement word: '{request.replacement_word}'")
 
         # Get the "before" attention data
-        before_attention_request = AttentionRequest(text=request.text, model_name=request.model_name)
+        before_attention_request = AttentionRequest(
+            text=request.text,
+            model_name=request.model_name,
+            visualization_method=request.visualization_method
+        )
         before_data = (await get_attention_matrices(before_attention_request))["attention_data"]
 
         # Tokenize the text
@@ -302,7 +320,11 @@ async def get_attention_comparison_roberta(request: ComparisonRequest):
            print(f"Replaced text: '{replaced_text}'")
 
            # Get the "after" attention data and return
-           after_request = AttentionRequest(text=replaced_text, model_name=request.model_name)
+           after_request = AttentionRequest(
+               text=replaced_text,
+               model_name=request.model_name,
+               visualization_method=request.visualization_method
+           )
            after_data = (await get_attention_matrices(after_request))["attention_data"]
            return {"before_attention": before_data, "after_attention": after_data}
 
@@ -341,7 +363,11 @@ async def get_attention_comparison_roberta(request: ComparisonRequest):
            print(f"Replaced text: '{replaced_text}'")
 
            # Get the "after" attention data
-           after_request = AttentionRequest(text=replaced_text, model_name=request.model_name)
+           after_request = AttentionRequest(
+               text=replaced_text,
+               model_name=request.model_name,
+               visualization_method=request.visualization_method
+           )
            after_data = (await get_attention_matrices(after_request))["attention_data"]
 
            return {"before_attention": before_data, "after_attention": after_data}
@@ -411,7 +437,11 @@ async def get_attention_comparison_roberta(request: ComparisonRequest):
         print(f"Replaced text: '{replaced_text}'")
 
         # Get the "after" attention data
-        after_request = AttentionRequest(text=replaced_text, model_name=request.model_name)
+        after_request = AttentionRequest(
+            text=replaced_text,
+            model_name=request.model_name,
+            visualization_method=request.visualization_method
+        )
         after_data = (await get_attention_matrices(after_request))["attention_data"]
 
         return {"before_attention": before_data, "after_attention": after_data}
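
With these changes, both comparison paths forward the caller's visualization_method into the before/after AttentionRequest objects. A minimal sketch of what a comparison payload now looks like (field names follow ComparisonRequest in classes.py below; the example text and indices are illustrative):

    from classes import ComparisonRequest

    request = ComparisonRequest(
        text="The cat sat on the mat.",
        masked_index=2,                  # token index of "cat", counting [CLS] at 0
        replacement_word="dog",
        model_name="distilbert-base-uncased",
        visualization_method="rollout",  # defaults to "raw" when omitted
    )
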
attention_processing.py ADDED
@@ -0,0 +1,240 @@
+import torch
+import numpy as np
+import networkx as nx
+from typing import List, Dict, Any, Optional, Tuple
+
+#############################################
+# Attention Rollout Calculation Function
+#############################################
+def compute_attention_rollout(attentions, add_identity: bool = True, debug: bool = False):
+    """
+    Compute attention rollout from raw attention matrices
+
+    Args:
+        attentions: List of attention tensors from the model
+        add_identity: Whether to add identity matrix to each attention layer
+        debug: Whether to print debug information
+
+    Returns:
+        The attention rollout matrix
+    """
+    num_layers = len(attentions)
+    seq_len = attentions[0].size(-1)
+    rollout = torch.eye(seq_len)
+    for i, att in enumerate(attentions):
+        att_avg = att.squeeze(0).mean(dim=0)
+        if add_identity:
+            att_aug = att_avg + torch.eye(seq_len)
+        else:
+            att_aug = att_avg
+        att_aug = att_aug / (att_aug.sum(dim=-1, keepdim=True) + 1e-8)
+        att_aug = torch.nan_to_num(att_aug, nan=0.0, posinf=0.0, neginf=0.0)
+        rollout = rollout @ att_aug
+        if debug:
+            print(f"[DEBUG] Rollout after layer {i+1}:")
+            print(att_aug)
+            print(rollout)
+
+    # Normalize rollout to ensure it sums to 1.0 exactly
+    rollout_sum = rollout.sum(dim=-1, keepdim=True)
+    # Handle zero sums to avoid division by zero
+    is_zero_sum = (rollout_sum == 0)
+    if is_zero_sum.any():
+        # For rows with zero sum, distribute evenly
+        seq_len = rollout.size(-1)
+        even_dist = torch.ones_like(rollout) / seq_len
+        rollout = torch.where(is_zero_sum, even_dist, rollout / rollout_sum)
+    else:
+        rollout = rollout / rollout_sum
+
+    return rollout
+
+#############################################
+# Capacity Graph Construction Function (for Flow)
+#############################################
+def build_graph(joint_attentions, input_tokens, remove_diag: bool = False, debug: bool = False):
+    """
+    Build a graph representation for attention flow computation
+
+    Args:
+        joint_attentions: Joint attention matrices
+        input_tokens: List of input token text
+        remove_diag: Whether to remove diagonal elements (self-attention)
+        debug: Whether to print debug information
+
+    Returns:
+        Tuple of (capacity matrix, node labels dictionary)
+    """
+    n_layers, seq_len, _ = joint_attentions.shape
+    total_nodes = (n_layers + 1) * seq_len
+    capacity = np.zeros((total_nodes, total_nodes))
+    labels = {}
+    for k in range(seq_len):
+        labels[k] = f"0_{k}_{input_tokens[k]}"
+    for i in range(1, n_layers + 1):
+        for k_to in range(seq_len):
+            node_to = i * seq_len + k_to
+            labels[node_to] = f"L{i}_{k_to}"
+            for k_from in range(seq_len):
+                if remove_diag and (k_from == k_to):
+                    continue
+                node_from = (i - 1) * seq_len + k_from
+                cap = joint_attentions[i - 1][k_from][k_to]
+                capacity[node_from][node_to] = cap
+                if debug:
+                    print(f"[DEBUG] Edge from {labels[node_from]} to {labels[node_to]} with capacity: {cap:.6f}")
+    return capacity, labels
+
+#############################################
+# Attention Flow Calculation Function (using networkx)
+#############################################
+def compute_attention_flow_networkx(attentions, add_identity: bool = True, debug: bool = False, mask_idx=None):
+    """
+    Compute attention flow using networkx max flow algorithm
+
+    Args:
+        attentions: List of attention tensors from the model
+        add_identity: Whether to add identity matrix to each attention layer
+        debug: Whether to print debug information
+        mask_idx: Index of token to compute flow from (if None, computes flow for all tokens)
+
+    Returns:
+        Flow matrix or vector depending on mask_idx
+    """
+    num_layers = len(attentions)
+    seq_len = attentions[0].size(-1)
+    joint_attentions = []
+    for att in attentions:
+        att_avg = att.squeeze(0).mean(dim=0)
+        if add_identity:
+            alpha = 0.5
+            att_aug = (att_avg + torch.eye(seq_len)) * alpha
+        else:
+            att_aug = att_avg
+        att_aug = att_aug / (att_aug.sum(dim=-1, keepdim=True) + 1e-8)
+        joint_attentions.append(att_aug.cpu().numpy())
+    joint_attentions = np.stack(joint_attentions, axis=0)
+    input_tokens = [str(i) for i in range(seq_len)]
+    capacity, labels = build_graph(joint_attentions, input_tokens, remove_diag=False, debug=debug)
+    total_nodes = capacity.shape[0]
+    G = nx.DiGraph()
+    for u in range(total_nodes):
+        for v in range(total_nodes):
+            cap = capacity[u][v]
+            if cap > 1e-8:
+                G.add_edge(u, v, capacity=float(cap))
+    if mask_idx is not None:
+        source = mask_idx
+        flow_vector = np.zeros(seq_len)
+        if debug:
+            print(f"[DEBUG] Networkx max flow from {labels[source]} to each output node:")
+        for sink in range(num_layers * seq_len, (num_layers + 1) * seq_len):
+            try:
+                flow_value, _ = nx.maximum_flow(G, source, sink, capacity='capacity')
+            except Exception:
+                flow_value = 0
+            flow_vector[sink - num_layers * seq_len] = flow_value
+        flow_vector = flow_vector / (flow_vector.sum() + 1e-8)
+        return flow_vector.reshape(1, seq_len)
+    else:
+        flow_matrix = np.zeros((seq_len, seq_len))
+        if debug:
+            print("[DEBUG] Networkx max flow for each input node to each output node:")
+        for i in range(seq_len):
+            source = i
+            flow_vector = np.zeros(seq_len)
+            for sink in range(num_layers * seq_len, (num_layers + 1) * seq_len):
+                try:
+                    flow_value, _ = nx.maximum_flow(G, source, sink, capacity='capacity')
+                except Exception:
+                    flow_value = 0
+                flow_vector[sink - num_layers * seq_len] = flow_value
+            # Normalize flow vector to ensure it sums to 1.0 exactly
+            flow_sum = flow_vector.sum()
+            if flow_sum > 0:
+                flow_vector = flow_vector / flow_sum
+            else:
+                # If there is no flow, distribute evenly
+                flow_vector = np.ones(seq_len) / seq_len
+            flow_matrix[i] = flow_vector
+        if debug:
+            print("[DEBUG] Final networkx flow matrix:")
+            print(flow_matrix)
+        return flow_matrix
+
+#############################################
+# Process Attention with Selected Method
+#############################################
+def process_attention_with_method(attention_matrices, method: str = "raw", debug: bool = False) -> List[Dict[str, Any]]:
+    """
+    Process attention matrices using the specified method
+
+    Args:
+        attention_matrices: List of attention tensors from the model
+        method: Method to use (raw, rollout, flow)
+        debug: Whether to print debug information
+
+    Returns:
+        Processed attention data in the same format as the original attention data
+    """
+    if method == "raw":
+        # Return raw attention as is
+        return attention_matrices
+
+    # Convert to list if it's a tuple
+    if isinstance(attention_matrices, tuple):
+        attention_matrices = list(attention_matrices)
+
+    # Get dimensions
+    num_layers = len(attention_matrices)
+
+    if method == "rollout":
+        # Calculate rollout attention
+        rollout_matrix = compute_attention_rollout(attention_matrices, add_identity=True, debug=debug)
+
+        # Create new attention matrices with rollout values
+        new_attention_matrices = []
+        for layer_idx in range(num_layers):
+            # For each layer, we'll use the same rollout matrix for all heads
+            heads = attention_matrices[layer_idx].shape[1]  # Number of heads
+            layer_data = {"layerIndex": layer_idx, "heads": []}
+
+            for head_idx in range(heads):
+                # Convert rollout matrix to list format for this head
+                attention_matrix = rollout_matrix.cpu().tolist()
+
+                layer_data["heads"].append({
+                    "headIndex": head_idx,
+                    "attention": attention_matrix
+                })
+
+            new_attention_matrices.append(layer_data)
+
+        return new_attention_matrices
+
+    elif method == "flow":
+        # Calculate flow attention
+        flow_matrix = compute_attention_flow_networkx(attention_matrices, add_identity=True, debug=debug)
+
+        # Create new attention matrices with flow values
+        new_attention_matrices = []
+        for layer_idx in range(num_layers):
+            # For each layer, we'll use the same flow matrix for all heads
+            heads = attention_matrices[layer_idx].shape[1]  # Number of heads
+            layer_data = {"layerIndex": layer_idx, "heads": []}
+
+            for head_idx in range(heads):
+                # Convert flow matrix to list format for this head
+                attention_matrix = flow_matrix.tolist()
+
+                layer_data["heads"].append({
+                    "headIndex": head_idx,
+                    "attention": attention_matrix
+                })
+
+            new_attention_matrices.append(layer_data)
+
+        return new_attention_matrices
+
+    else:
+        raise ValueError(f"Unknown attention processing method: {method}")
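
The rollout here follows the standard attention-rollout recipe (Abnar & Zuidema, 2020): average the heads per layer, optionally add the identity to model the residual connection, row-normalize, and multiply across layers. A minimal sketch of driving compute_attention_rollout directly, assuming transformers is installed (the model name and sentence are illustrative):

    import torch
    from transformers import AutoTokenizer, BertModel
    from attention_processing import compute_attention_rollout

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="eager")
    model.eval()

    encoding = tokenizer("The cat sat on the mat.", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encoding, output_attentions=True)

    # outputs.attentions: one [1, num_heads, seq_len, seq_len] tensor per layer
    rollout = compute_attention_rollout(outputs.attentions, add_identity=True)
    print(rollout.shape)        # torch.Size([seq_len, seq_len])
    print(rollout.sum(dim=-1))  # each row sums to ~1.0 after the final normalization
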
classes.py CHANGED
@@ -32,6 +32,7 @@ class MaskPredictionResponse(BaseModel):
 class AttentionRequest(BaseModel):
     text: str
     model_name: str = "bert-base-uncased"
+    visualization_method: str = "raw"  # Options: "raw", "rollout", "flow"
 
 class AttentionHead(BaseModel):
     headIndex: int
@@ -53,6 +54,7 @@ class ComparisonRequest(BaseModel):
     masked_index: int
     replacement_word: str
     model_name: str = "bert-base-uncased"
+    visualization_method: str = "raw"  # Options: "raw", "rollout", "flow"
 
 class AttentionComparisonResponse(BaseModel):
     before_attention: AttentionData
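
Because the new field has a default, existing clients that omit it keep getting raw attention. A quick sketch of the resulting pydantic behavior:

    from classes import AttentionRequest

    req = AttentionRequest(text="Hello world")
    print(req.visualization_method)  # "raw" — the default keeps old clients working

    req = AttentionRequest(text="Hello world", visualization_method="flow")
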
helpers.py CHANGED
@@ -112,6 +112,95 @@ def map_roberta_tokens_to_words(tokens, original_text):
 
     return token_to_word_map
 
+# Helper function to map BERT and DistilBERT tokens to word positions
+def map_bert_tokens_to_words(tokens, original_text):
+    """
+    Maps BERT/DistilBERT tokens to words in the original text.
+    Returns a dictionary mapping token indices to word indices.
+    """
+    # Get the words from the original text
+    words = original_text.split()
+    print(f"Original words: {words}")
+
+    # Filter out special tokens
+    content_tokens = []
+    for i, token in enumerate(tokens):
+        if token["text"] not in ["[CLS]", "[SEP]", "[PAD]", "[UNK]"]:
+            content_tokens.append((i, token["text"]))
+
+    print(f"Content tokens: {[t for _, t in content_tokens]}")
+
+    # Create the mapping
+    token_to_word_map = {}
+
+    # First approach: direct matching of tokens to words, handling WordPiece tokens
+    word_idx = 0
+    for token_idx, token_text in content_tokens:
+        clean_token = token_text.lower().strip("##")
+
+        # Check if this is a continuation token (starting with ##)
+        if token_text.startswith("##"):
+            # If it's a continuation, map it to the same word as the previous token
+            if token_idx > 0 and (token_idx - 1) in token_to_word_map:
+                token_to_word_map[token_idx] = token_to_word_map[token_idx - 1]
+                print(f"Continuation token: '{token_text}' -> Word '{words[token_to_word_map[token_idx]]}'")
+            continue
+
+        # Try to find a match with words
+        while word_idx < len(words):
+            word_lower = words[word_idx].lower()
+            if clean_token in word_lower:
+                token_to_word_map[token_idx] = word_idx
+                print(f"Match: Token '{token_text}' -> Word '{words[word_idx]}' at index {word_idx}")
+                # Only advance to next word if this token is a complete word
+                if clean_token == word_lower:
+                    word_idx += 1
+                break
+            else:
+                word_idx += 1
+
+        # If we've gone through all words, break
+        if word_idx >= len(words):
+            break
+
+    # Second approach: Position-based matching for any remaining tokens
+    if len(token_to_word_map) < len(content_tokens):
+        print("Using position-based matching for remaining tokens")
+
+        # Assign unmapped tokens based on surrounding mapped tokens
+        for token_idx, token_text in content_tokens:
+            if token_idx not in token_to_word_map:
+                # Look for the nearest mapped token before this one
+                prev_idx = token_idx - 1
+                while prev_idx >= 0 and prev_idx not in token_to_word_map:
+                    prev_idx -= 1
+
+                # Look for the nearest mapped token after this one
+                next_idx = token_idx + 1
+                while next_idx < len(tokens) and next_idx not in token_to_word_map:
+                    next_idx += 1
+
+                # Assign to the closest mapped word
+                if prev_idx >= 0 and prev_idx in token_to_word_map:
+                    token_to_word_map[token_idx] = token_to_word_map[prev_idx]
+                    print(f"Position match: Token '{token_text}' -> Word '{words[token_to_word_map[token_idx]]}' (based on previous)")
+                elif next_idx < len(tokens) and next_idx in token_to_word_map:
+                    token_to_word_map[token_idx] = token_to_word_map[next_idx]
+                    print(f"Position match: Token '{token_text}' -> Word '{words[token_to_word_map[token_idx]]}' (based on next)")
+                elif word_idx > 0:
+                    # Fallback to the last word if no nearby tokens are mapped
+                    token_to_word_map[token_idx] = min(word_idx - 1, len(words) - 1)
+                    print(f"Fallback match: Token '{token_text}' -> Word '{words[token_to_word_map[token_idx]]}'")
+
+    # Print the final mapping
+    print("Final token-to-word mapping:")
+    for token_idx, word_idx in sorted(token_to_word_map.items()):
+        token_text = next((t["text"] for i, t in enumerate(tokens) if i == token_idx), "")
+        if word_idx < len(words):
+            print(f"    Token '{token_text}' (idx {token_idx}) -> Word {word_idx} '{words[word_idx]}'")
+
+    return token_to_word_map
+
 # Helper function to load models on demand
 def get_model_and_tokenizer(model_name):
     if model_name not in MODEL_CONFIGS:
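
A worked sketch of what map_bert_tokens_to_words produces for a WordPiece-split word (the token dicts follow the {"text": ...} shape the helper expects; the exact pieces depend on the vocabulary):

    from helpers import map_bert_tokens_to_words

    tokens = [
        {"text": "[CLS]"},
        {"text": "un"}, {"text": "##break"}, {"text": "##able"},
        {"text": "glass"},
        {"text": "[SEP]"},
    ]
    mapping = map_bert_tokens_to_words(tokens, "unbreakable glass")
    # Continuation tokens inherit the word of the piece before them:
    # {1: 0, 2: 0, 3: 0, 4: 1} — tokens 1-3 map to "unbreakable", token 4 to "glass"
    print(mapping)
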
models.py CHANGED
@@ -1,4 +1,4 @@
-from transformers import BertForMaskedLM, RobertaForMaskedLM, AutoTokenizer, BertModel, RobertaModel
+from transformers import BertForMaskedLM, RobertaForMaskedLM, AutoTokenizer, BertModel, RobertaModel, DistilBertForMaskedLM, DistilBertModel
 import nltk
 
 
@@ -24,6 +24,12 @@ MODEL_CONFIGS = {
         "model_class": RobertaForMaskedLM,
         "tokenizer_class": AutoTokenizer,
         "base_model_class": RobertaModel
+    },
+    "distilbert-base-uncased": {
+        "name": "DistilBERT Base Uncased",
+        "model_class": DistilBertForMaskedLM,
+        "tokenizer_class": AutoTokenizer,
+        "base_model_class": DistilBertModel
     }
 }
 
requirements.txt CHANGED
@@ -5,4 +5,5 @@ uvicorn>=0.21.1
 pydantic>=2.0.0
 python-multipart>=0.0.6
 numpy>=1.24.0
-nltk>=3.8.1
+nltk>=3.8.1
+networkx>=3.0
routes/attention.py CHANGED
@@ -2,13 +2,14 @@ from fastapi import APIRouter, HTTPException
 from classes import *
 from helpers import *
 from routes.tokenize import tokenize_text
+from attention_processing import process_attention_with_method
 router = APIRouter()
 
 @router.post("", response_model=AttentionResponse)
 async def get_attention_matrices(request: AttentionRequest):
     """Get attention matrices for the input text using the specified model"""
     try:
-        print(f"Processing attention request: text='{request.text}', model={request.model_name}")
+        print(f"Processing attention request: text='{request.text}', model={request.model_name}, method={request.visualization_method}")
 
         # First tokenize the text using the same function that the /tokenize endpoint uses
         # to ensure consistency
@@ -29,7 +30,7 @@ async def get_attention_matrices(request: AttentionRequest):
         base_model_key = f"{model_name}_base"
         if base_model_key not in models:
             print(f"Loading base model {model_name}...")
-            models[base_model_key] = base_model_class.from_pretrained(model_name)
+            models[base_model_key] = base_model_class.from_pretrained(model_name, attn_implementation="eager")
             if torch.cuda.is_available():
                 models[base_model_key] = models[base_model_key].cuda()
             models[base_model_key].eval()
@@ -46,9 +47,16 @@ async def get_attention_matrices(request: AttentionRequest):
                 return_tensors="pt",
                 return_attention_mask=True
             )
+
+            # Map RoBERTa tokens to words for better visualization
+            token_to_word_map = map_roberta_tokens_to_words(tokens, request.text)
         else:
+            # For BERT and DistilBERT
             text = f"[CLS] {request.text} [SEP]"
             encoding = tokenizer(text, return_tensors="pt")
+
+            # Map BERT/DistilBERT tokens to words for better visualization
+            token_to_word_map = map_bert_tokens_to_words(tokens, request.text)
 
         if torch.cuda.is_available():
             encoding = {k: v.cuda() for k, v in encoding.items()}
@@ -73,6 +81,10 @@ async def get_attention_matrices(request: AttentionRequest):
         attention_matrices = outputs.attentions
         print(f"Got attention matrices for {len(attention_matrices)} layers")
 
+        # Process attention using the specified method
+        if request.visualization_method != "raw":
+            print(f"Processing attention with method: {request.visualization_method}")
+
         # Convert attention matrices to the expected response format
         layers = []
         for layer_idx, layer_attention in enumerate(attention_matrices):
@@ -96,8 +108,23 @@ async def get_attention_matrices(request: AttentionRequest):
                 "layerIndex": layer_idx,
                 "heads": heads
             })
-
+
         print(f"Processed {len(layers)} layers with {num_heads} heads each")
+
+        # Process with selected visualization method
+        if request.visualization_method != "raw":
+            processed_layers = process_attention_with_method(
+                attention_matrices,
+                method=request.visualization_method,
+                debug=False
+            )
+            # Replace the layers with the processed ones
+            layers = processed_layers
+
+        # Add token-to-word mapping to the response
+        for i, token in enumerate(tokens):
+            if i in token_to_word_map:
+                token["wordIndex"] = token_to_word_map[i]
 
         # Return complete attention data
         attention_data = {
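
End to end, a client now selects the visualization method per request. A sketch using requests (the host, port, and /attention route prefix are assumptions — the router itself posts to its mount point; the response's "attention_data" key follows the usage in attention_comparison_helpers.py):

    import requests

    resp = requests.post(
        "http://localhost:8000/attention",  # prefix depends on how the router is mounted
        json={
            "text": "The cat sat on the mat.",
            "model_name": "distilbert-base-uncased",
            "visualization_method": "rollout",  # or "raw" / "flow"
        },
    )
    resp.raise_for_status()
    data = resp.json()["attention_data"]  # shape follows AttentionResponse in classes.py
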
routes/attention_comparison.py CHANGED
@@ -16,11 +16,14 @@ async def get_attention_comparison(request: ComparisonRequest):
     print(f"Masked index: {request.masked_index}")
     print(f"Replacement word: '{request.replacement_word}'")
     print(f"Model: {request.model_name}")
+    print(f"Visualization method: {request.visualization_method}")
 
     # Dispatch based on model type
     if "roberta" in request.model_name.lower():
         return await get_attention_comparison_roberta(request)
     else:
+        # Both BERT and DistilBERT use the same tokenization approach (WordPiece)
+        # and can use the same comparison implementation
         return await get_attention_comparison_bert(request)
 
 
routes/tokenize.py CHANGED
@@ -27,7 +27,7 @@ async def tokenize_text(request: TokenizeRequest):
         # Clean the tokens to remove the leading 'Ġ' character from RoBERTa tokens
         tokens = [clean_roberta_token(token) for token in tokens]
     else:
-        # For BERT, add special tokens and tokenize
+        # For BERT and DistilBERT, add special tokens and tokenize
         text = f"[CLS] {request.text} [SEP]"
         tokens = tokenizer.tokenize(text)
 
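
For reference, a sketch of what this BERT/DistilBERT branch yields (assumes transformers; the exact word pieces depend on the vocabulary):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    tokens = tokenizer.tokenize("[CLS] unbreakable glass [SEP]")
    # Special tokens survive intact; sub-word pieces carry the "##" prefix,
    # e.g. ['[CLS]', 'un', '##break', '##able', 'glass', '[SEP]']
    print(tokens)
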