Florian valade committed on
Commit bda5ea2 · 1 Parent(s): 950b367

Refactor to use the HF Hub and a better design

Files changed (5)
  1. .gitignore +1 -1
  2. app.py +108 -60
  3. requirements.txt +2 -1
  4. src/BranchyModel.py +0 -469
  5. src/utils.py +0 -57
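
The main change: the app no longer ships its own BranchyModel implementation (src/BranchyModel.py and src/utils.py are deleted) and instead pulls the packaged model straight from the Hugging Face Hub. A minimal sketch of the new loading path, mirroring load_model in the updated app.py (the repo ids and trust_remote_code flag are taken from the diff below):

from transformers import AutoModelForCausalLM, AutoTokenizer

# The custom branchy architecture now lives in the Hub repo, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained("valcore/Branchy-Phi-2", trust_remote_code=True)
model.eval()  # inference only; the exit heads are already trained
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-2")
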
.gitignore CHANGED
@@ -1 +1 @@
1
- model/*
 
1
+ __pycache__
app.py CHANGED
@@ -1,75 +1,123 @@
1
  # Save this as app.py and run with `streamlit run app.py`
 
2
  import streamlit as st
3
  import torch
4
  import pandas as pd
5
 
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
-
8
- from src.utils import generate_next_token, breaking_ties
9
- from src.BranchyModel import BranchyModel
10
 
11
  st.title("Multi-Head LLM Demo")
12
-
13
- def add_and_run(token, head):
14
- # Update pd with Head and mean of previous heads and actual head
15
- head_list = st.session_state["computation_pd"]["Head"].to_list() + [head]
16
- mean = sum(head_list) / len(head_list)
17
- st.session_state["computation_pd"] = pd.concat([st.session_state["computation_pd"], pd.DataFrame({"Head": [head], "Mean": [mean], "Base model consumption": [st.session_state['head_number']]})], ignore_index=True)
18
-
19
- st.session_state['current_sentence'] += token
20
- _, st.session_state['logits'], _, st.session_state['head_tokens'] = generate_next_token(st.session_state.model, st.session_state.tokenizer, st.session_state['current_sentence'])
21
-
22
- def reset():
23
- st.session_state['computation_pd'] = pd.DataFrame(columns=["Head", "Mean", "Base model consumption"])
24
- st.session_state['current_sentence'] = "The climate in"
25
- _, st.session_state['logits'], _, st.session_state['head_tokens'] = generate_next_token(st.session_state.model, st.session_state.tokenizer, st.session_state['current_sentence'])
26
 
27
  @st.cache_resource
28
- def load_model(model_path):
29
-
30
- model_str = "susnato/phi-1_5_dev"
31
- model = AutoModelForCausalLM.from_pretrained(model_str).to("cuda:1")
32
- tokenizer = AutoTokenizer.from_pretrained(model_str)
33
-
34
- branch_locations = list(range(0, 23, 5))
35
- model = BranchyModel(branch_locations= branch_locations, model= model).to("cuda:1")
36
-
37
- # Load the specific model
38
- model.load_state_dict(torch.load(model_path, map_location="cuda:1"))
39
-
40
  return model, tokenizer
41
 
 
 
42
 
43
  if "model" not in st.session_state or "tokenizer" not in st.session_state:
44
  print("Loading model...")
45
- st.session_state.model, st.session_state.tokenizer = load_model("model/model.bin")
46
- st.session_state["head_number"] = len(st.session_state.model.branch_locations) + 1
47
- print(f"Head number: {st.session_state['head_number']}")
48
- # Session state to store the current sentence
49
- if 'current_sentence' not in st.session_state:
50
- reset()
51
-
52
- # Create a container to hold the buttons
53
- cols = st.columns(len(st.session_state.head_tokens)) # Create a column for each token
54
-
55
- # Iterate through each head token and create a button in a separate column
56
- for i, (col, token) in enumerate(zip(cols, st.session_state.head_tokens)):
57
- col.button(f"{st.session_state['head_tokens'][i]}",
58
- key=f"head_{i}",
59
- use_container_width=True,
60
- on_click=add_and_run,
61
- args=(st.session_state['head_tokens'][i], i))
62
-
63
-
64
- # Display the current sentence
65
- st.markdown(f"{st.session_state['current_sentence']}")
66
-
67
- # Reset button to start over
68
- st.button('Reset', on_click=reset)
69
-
70
- if 'computation_pd' in st.session_state:
71
- st.line_chart(st.session_state['computation_pd'])
72
- # get last element from a pd
73
- saved_budget = 100 - ((st.session_state["computation_pd"]["Mean"].iloc[-1] * 100) / st.session_state["computation_pd"]["Base model consumption"].iloc[-1])
74
- st.markdown(f"You saved **{saved_budget:.2f}%** of the base model consumption.")
75
- #st.write(st.session_state['computation_pd'])
 
1
  # Save this as app.py and run with `streamlit run app.py`
2
+ import time
3
  import streamlit as st
4
  import torch
5
  import pandas as pd
6
 
7
  from transformers import AutoModelForCausalLM, AutoTokenizer
9
+ from annotated_text import annotated_text
 
10
 
11
  st.title("Multi-Head LLM Demo")
12
+ st.markdown("""This is a demo of a multi-head language model with early exit capabilities.
13
+ The model is based on the Phi-2 architecture and is available here: https://huggingface.co/valcore/Branchy-Phi-2.
14
+ \nThe model has four heads, each of which can exit early based on a threshold. The graph shows the depth of early exit for each token (deeper meaning faster) and the time taken to generate each token.
15
+ Early-exited tokens are annotated with the depth of the early exit (a float of at most 1, 1 being the deepest).
16
+ """)
17
+
18
+ def annotated_to_normal(text):
19
+ result = ""
20
+ for elem in text:
21
+ if isinstance(elem, tuple):
22
+ result += elem[0]
23
+ else:
24
+ result += elem
25
+ return result
26
+
27
+ def generate_next_token():
28
+ print(f"Generating next token from {st.session_state.messages}")
29
+ inputs = ""
30
+ for message in st.session_state.messages:
31
+ inputs += message["role"] + ": " + annotated_to_normal(message["content"]) + "\n"
32
+ inputs += "Assistant:"
33
+ print(f"Inputs: {inputs}")
34
+ inputs = st.session_state.tokenizer.encode(inputs, return_tensors="pt")
35
+ for i in range(50):
36
+ start = time.time()
37
+ outputs = st.session_state.model(inputs)
38
+ stop = time.time()
39
+ next_token_logits = outputs.logits[:, -1, :].squeeze()
40
+ next_token_probs = torch.softmax(next_token_logits, dim=-1)
41
+ next_token_id = torch.argmax(next_token_probs, dim=-1)
42
+ if next_token_id == 50256:  # 50256 is the <|endoftext|> (EOS) token id of the Phi-2 tokenizer
43
+ break
44
+ print(inputs.shape, next_token_id.shape)
45
+ inputs = torch.cat([inputs, next_token_id.unsqueeze(0).unsqueeze(-1)], dim=-1)
46
+ next_token = st.session_state.tokenizer.decode(next_token_id)
47
+ time_taken = (stop - start) * 1000  # convert to milliseconds to match the chart label
48
+ branch_locations = st.session_state.model.config.branch_locations
49
+ print(outputs.head_indices)
50
+ if outputs.head_indices in branch_locations:
51
+ print(sorted(branch_locations, reverse=True))
52
+ early_exit = (branch_locations.index(outputs.head_indices) + 1) / len(branch_locations)
53
+ else:
54
+ early_exit = 0
55
+ # Add data to dataframe
56
+ new_row = pd.DataFrame({"Time taken (in ms)": [time_taken], "Early exit depth": [early_exit]})
57
+ st.session_state.data = pd.concat([st.session_state.data, new_row], ignore_index=True)
58
+ yield next_token, early_exit
59
 
60
  @st.cache_resource
61
+ def load_model(model_str, tokenizer_str):
62
+ model = AutoModelForCausalLM.from_pretrained(model_str, trust_remote_code=True)
63
+ model.eval()
64
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_str)
 
65
  return model, tokenizer
66
 
67
+ model_str = "valcore/Branchy-Phi-2"
68
+ tokenizer_str = "microsoft/Phi-2"
69
 
70
  if "model" not in st.session_state or "tokenizer" not in st.session_state:
71
  print("Loading model...")
72
+ st.session_state.model, st.session_state.tokenizer = load_model(model_str, tokenizer_str)
73
+
74
+ # Initialize chat history and dataframe
75
+ if "messages" not in st.session_state:
76
+ st.session_state.messages = []
77
+ st.session_state.data = pd.DataFrame(columns=["Time taken (in ms)", "Early exit depth"])
78
+
79
+ col1, col2 = st.columns([1, 4])
80
+
81
+ with col1:
82
+ early_exit = st.checkbox("Early exit", value=False)
83
+ if early_exit:
84
+ st.session_state.model.head_thresholds = [2.506962537765503, 2.656052589416504, 1.924393653869629, 1.4434680938720703]
85
+ else:
86
+ st.session_state.model.head_thresholds = [10., 10., 10., 10.]
87
+ clear_session = st.button("Clear session")
88
+ if clear_session:
89
+ print("Clearing session")
90
+ st.session_state.messages = []
91
+ st.session_state.data = pd.DataFrame(columns=["Time taken (in ms)", "Early exit depth"])
92
+
93
+ with col2:
94
+ # Display chat messages from history on app rerun
95
+ for message in st.session_state.messages:
96
+ with st.chat_message(message["role"]):
97
+ annotated_text(message["content"])
98
+
99
+ prompt = st.chat_input("What is up?")
100
+ # React to user input
101
+ if prompt:
102
+ # Display user message in chat message container
103
+ with st.chat_message("User"):
104
+ st.markdown(prompt)
105
+ # Add user message to chat history
106
+ st.session_state.messages.append({"role": "User", "content": prompt})
107
+
108
+ # Display assistant response in chat message container
109
+ with st.chat_message("Assistant"):
110
+ response = []
111
+ with st.spinner('Running inference...'):
112
+ for next_token, early_exit in generate_next_token():
113
+ if early_exit > 0.0:
114
+ response.append(tuple((next_token, str(early_exit))))
115
+ else:
116
+ response.append(next_token)
117
+ print(response)
118
+ annotated_text(response)
119
+
120
+ # Add assistant response to chat history
121
+ st.session_state.messages.append({"role": "Assistant", "content": response})
122
+ st.line_chart(st.session_state.data, x=None, y=["Time taken (in ms)", "Early exit depth"])
123
+ print(st.session_state.messages)
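
For readers following the chart and annotation logic above: the depth value attached to each token comes from mapping the head that produced it onto the list of branch locations. A small self-contained sketch of that mapping, assuming (as in generate_next_token above) that outputs.head_indices is the layer index the token exited at and config.branch_locations lists the branch layers; the example locations below are illustrative, not the actual Branchy-Phi-2 configuration:

def exit_depth(head_index, branch_locations):
    # Fraction in (0, 1]: later branches give values closer to 1; 0 means the token
    # ran through the full model and was produced by the final lm_head.
    if head_index in branch_locations:
        return (branch_locations.index(head_index) + 1) / len(branch_locations)
    return 0.0

print(exit_depth(10, [5, 10, 15, 20]))  # 0.5 -> shown as ("token", "0.5") by annotated_text
print(exit_depth(31, [5, 10, 15, 20]))  # 0.0 -> rendered as plain text
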
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  streamlit==1.31.0
2
  torch==2.0.1
3
  pandas==2.0.3
4
- transformers==4.36.0
 
 
1
  streamlit==1.31.0
2
  torch==2.0.1
3
  pandas==2.0.3
4
+ transformers==4.36.0
5
+ st-annotated-text
src/BranchyModel.py DELETED
@@ -1,469 +0,0 @@
1
- from typing import Dict, List, Optional, Tuple
2
- from dataclasses import dataclass
3
-
4
- import torch
5
- from torch import nn
6
- from torch.nn import functional as F
7
- from transformers import PreTrainedModel
8
- from transformers.cache_utils import Cache, DynamicCache
9
- from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
10
- from transformers.utils import ModelOutput
11
-
12
-
13
- @dataclass
14
- class CausalBranchyLLMOutputWithPast(ModelOutput):
15
- loss: Optional[torch.Tensor] = None
16
- lm_loss: Optional[torch.Tensor] = None
17
- head_loss: Optional[torch.Tensor] = None
18
- logits: torch.Tensor = None
19
- head_outputs: Optional[torch.Tensor] = None
20
- past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
21
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
22
- attentions: Optional[Tuple[torch.FloatTensor]] = None
23
-
24
- class Branch(nn.Module):
25
- def __init__(self, config):
26
- super().__init__()
27
- self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
28
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
29
-
30
- def forward(self, x):
31
- x = self.layernorm(x)
32
- x = self.lm_head(x)
33
- return x
34
-
35
- class BranchyModel(PreTrainedModel):
36
- """
37
- This class is a wrapper for transformer models with added functionality for branchy networks.
38
- It uses BranchyConfig to initialize a model and later will be extended to add branches.
39
-
40
- Args:
41
- branch_locations (List[int]): The locations of the branches in the model.
42
- starts indexing from 0. Branch 0 is after layer 0.
43
- model (PreTrainedModel): The underlying transformer model to wrap.
44
-
45
- Returns:
46
- A model instance with the given configuration.
47
- """
48
-
49
- def __init__(self, branch_locations, model, loss_type="kl_div", penality_weight=None):
50
- super().__init__(model.config)
51
- # Initialize the base transformer model
52
- self.model = model
53
- self.branch_locations = branch_locations
54
- self.loss_type = loss_type
55
- self.penality_weight = penality_weight
56
- if self.loss_type == "penalized_cross_entropy":
57
- assert self.penality_weight is not None, "penality_weight must be provided for penalized_cross_entropy loss"
58
- # Get details on layering inside the model
59
- if hasattr(self.model.config, "n_layer") or hasattr(
60
- self.model.config, "num_hidden_layers"
61
- ): # If there is no n_layer in the config, there might be ways to get it from the model itself
62
- self.num_layers = (
63
- self.model.config.n_layer
64
- if hasattr(self.model.config, "n_layer")
65
- else self.model.config.num_hidden_layers
66
- )
67
- else:
68
- raise ValueError("cannot find n_layer in config")
69
- # if no branch locations are specified, branch at every layer
70
- if self.branch_locations is None:
71
- self.branch_locations = list(range(self.num_layers - 1))
72
-
73
- assert self.num_layers > 0, "The number of layers must be greater than 0"
74
- assert (
75
- len(self.branch_locations) < self.num_layers
76
- ), "The number of branches must be less than the number of layers"
77
- assert all(
78
- [0 <= i < self.num_layers for i in self.branch_locations]
79
- ), "The branch locations must be between 0 and num_layers"
80
-
81
-
82
- # Make sure the base model is frozen
83
- for param in self.model.parameters():
84
- param.requires_grad = False
85
-
86
- # Instantiate heads. Default: heads are copies of the lm_head
87
- self.model.heads = torch.nn.ModuleList(
88
- [
89
- Branch(self.model.config) for _ in range(len(self.branch_locations))
90
- ]
91
- )
92
-
93
- # initialize heads
94
- for head in self.model.heads:
95
- head.apply(self.model._init_weights)
96
- # Make them trainable
97
- for param in head.parameters():
98
- param.requires_grad = True
99
-
100
- self.post_init()
101
-
102
- # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
103
- def prepare_inputs_for_generation(
104
- self,
105
- input_ids,
106
- past_key_values=None,
107
- attention_mask=None,
108
- inputs_embeds=None,
109
- **kwargs,
110
- ):
111
- if past_key_values is not None:
112
- if isinstance(past_key_values, Cache):
113
- cache_length = past_key_values.get_seq_length()
114
- past_length = past_key_values.seen_tokens
115
- max_cache_length = past_key_values.get_max_length()
116
- else:
117
- cache_length = past_length = past_key_values[0][0].shape[2]
118
- max_cache_length = None
119
-
120
- # Keep only the unprocessed tokens:
121
- # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
122
- # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
123
- # input)
124
- if (
125
- attention_mask is not None
126
- and attention_mask.shape[1] > input_ids.shape[1]
127
- ):
128
- input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
129
- # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
130
- # input_ids based on the past_length.
131
- elif past_length < input_ids.shape[1]:
132
- input_ids = input_ids[:, past_length:]
133
- # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
134
-
135
- # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
136
- if (
137
- max_cache_length is not None
138
- and attention_mask is not None
139
- and cache_length + input_ids.shape[1] > max_cache_length
140
- ):
141
- attention_mask = attention_mask[:, -max_cache_length:]
142
-
143
- position_ids = kwargs.get("position_ids", None)
144
- if attention_mask is not None and position_ids is None:
145
- # create position_ids on the fly for batch generation
146
- position_ids = attention_mask.long().cumsum(-1) - 1
147
- position_ids.masked_fill_(attention_mask == 0, 1)
148
- if past_key_values:
149
- position_ids = position_ids[:, -input_ids.shape[1] :]
150
-
151
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
152
- if inputs_embeds is not None and past_key_values is None:
153
- model_inputs = {"inputs_embeds": inputs_embeds}
154
- else:
155
- model_inputs = {"input_ids": input_ids}
156
-
157
- model_inputs.update(
158
- {
159
- "position_ids": position_ids,
160
- "past_key_values": past_key_values,
161
- "use_cache": kwargs.get("use_cache"),
162
- "attention_mask": attention_mask,
163
- "fixed_output_head": kwargs.get("fixed_output_head", None),
164
- }
165
- )
166
- return model_inputs
167
-
168
- def compute_self_supervision_loss(
169
- self,
170
- aux_logits: torch.Tensor,
171
- lm_logits: torch.Tensor,
172
- return_per_head: bool = False,
173
- ) -> Dict[str, torch.Tensor]:
174
- last_aux_logits = aux_logits[..., -1, :]
175
- last_lm_logits = lm_logits[..., -1, :]
176
-
177
- repeated_last_lm_logits = last_lm_logits.repeat(
178
- last_aux_logits.shape[0], 1, 1, 1
179
- )
180
- losses = []
181
- # Can be useful to have detailed loss per head for comparison of performance
182
- if return_per_head:
183
- for head_logit in last_aux_logits:
184
- if self.loss_type == "kl_div":
185
- losses.append(
186
- nn.KLDivLoss(reduction="batchmean")(
187
- F.log_softmax(head_logit, dim=-1),
188
- F.softmax(last_lm_logits, dim=-1),
189
- )
190
- )
191
- elif self.loss_type == "cross_entropy":
192
- losses.append(
193
- nn.CrossEntropyLoss(reduction="mean")(
194
- head_logit, torch.argmax(last_lm_logits, dim=-1)
195
- )
196
- )
197
- elif self.loss_type == "penalized_cross_entropy":
198
- ce_loss = nn.CrossEntropyLoss(reduction="mean")(
199
- head_logit, torch.argmax(last_lm_logits, dim=-1)
200
- )
201
- probas = F.softmax(head_logit, dim=-1)
202
- entropy = torch.mean(-torch.sum(probas * torch.log(probas + 1e-8), dim=-1))
203
- #losses.append(ce_loss - self.penality_weight * (1.0 / (1.0 + entropy)))
204
- losses.append(ce_loss - self.penality_weight * entropy)
205
- else:
206
- raise ValueError(
207
- "The loss type must be either kl_div or cross_entropy"
208
- )
209
- loss = torch.stack(losses, dim=0).mean(dim=-1)
210
- else:
211
- # Compute the KL divergence between the last auxiliary head and the last LM head
212
- if self.loss_type == "kl_div":
213
- loss = nn.KLDivLoss(reduction="batchmean")(
214
- F.log_softmax(last_aux_logits.view(-1, self.config.vocab_size), dim=-1),
215
- F.softmax(
216
- repeated_last_lm_logits.view(-1, self.config.vocab_size), dim=-1
217
- ),
218
- )
219
- elif self.loss_type == "cross_entropy":
220
- loss = nn.CrossEntropyLoss(reduction="mean")(
221
- last_aux_logits.view(-1, self.config.vocab_size),
222
- torch.argmax(
223
- repeated_last_lm_logits.view(-1, self.config.vocab_size), dim=-1
224
- ),
225
- )
226
- elif self.loss_type == "penalized_cross_entropy":
227
- ce_loss = nn.CrossEntropyLoss(reduction="mean")(
228
- last_aux_logits.view(-1, self.config.vocab_size),
229
- torch.argmax(
230
- repeated_last_lm_logits.view(-1, self.config.vocab_size), dim=-1
231
- ),
232
- )
233
- probas = F.softmax(
234
- last_aux_logits.view(-1, self.config.vocab_size), dim=-1
235
- )
236
- entropy = torch.mean(-torch.sum(probas * torch.log(probas + 1e-8), dim=-1))
237
- loss = ce_loss + self.penality_weight * entropy
238
- else:
239
- raise ValueError(
240
- "The loss type must be either kl_div or cross_entropy"
241
- )
242
- if return_per_head:
243
- return {"loss": loss, "aux_loss": torch.stack(losses)}
244
- else:
245
- return {"loss": loss, "aux_loss": None}
246
-
247
- def forward(
248
- self,
249
- input_ids: torch.LongTensor = None,
250
- attention_mask: Optional[torch.Tensor] = None,
251
- position_ids: Optional[torch.LongTensor] = None,
252
- past_key_values: Optional[List[torch.FloatTensor]] = None,
253
- inputs_embeds: Optional[torch.FloatTensor] = None,
254
- labels: Optional[torch.LongTensor] = None,
255
- use_cache: Optional[bool] = None,
256
- output_attentions: Optional[bool] = None,
257
- output_hidden_states: Optional[bool] = None,
258
- return_dict: Optional[bool] = None,
259
- self_supervision: Optional[bool] = None,
260
- fixed_output_head: Optional[int] = None,
261
- ):
262
- output_attentions = (
263
- output_attentions
264
- if output_attentions is not None
265
- else self.config.output_attentions
266
- )
267
- return_dict = (
268
- return_dict if return_dict is not None else self.config.use_return_dict
269
- )
270
- use_cache = use_cache if use_cache is not None else self.config.use_cache
271
-
272
- if self_supervision:
273
- output_hidden_states = True
274
- return self.forward_for_training(
275
- input_ids=input_ids,
276
- attention_mask=attention_mask,
277
- position_ids=position_ids,
278
- past_key_values=past_key_values,
279
- inputs_embeds=inputs_embeds,
280
- labels=labels,
281
- use_cache=use_cache,
282
- output_attentions=output_attentions,
283
- output_hidden_states=output_hidden_states,
284
- return_dict=return_dict,
285
- )
286
- else:
287
- return self.forward_for_inference(
288
- input_ids=input_ids,
289
- attention_mask=attention_mask,
290
- position_ids=position_ids,
291
- past_key_values=past_key_values,
292
- inputs_embeds=inputs_embeds,
293
- use_cache=use_cache,
294
- return_dict=return_dict,
295
- fixed_output_head=fixed_output_head,
296
- )
297
-
298
- def forward_for_inference(
299
- self,
300
- input_ids: torch.LongTensor = None,
301
- attention_mask: Optional[torch.Tensor] = None,
302
- position_ids: Optional[torch.LongTensor] = None,
303
- past_key_values: Optional[List[torch.FloatTensor]] = None,
304
- inputs_embeds: Optional[torch.FloatTensor] = None,
305
- use_cache: Optional[bool] = None,
306
- return_dict: Optional[bool] = None,
307
- fixed_output_head: Optional[int] = None,
308
- ):
309
- if fixed_output_head not in self.branch_locations and fixed_output_head is not None and fixed_output_head != -1:
310
- raise ValueError(
311
- "The fixed output head must be one of the branch locations"
312
- )
313
- # retrieve input_ids and inputs_embeds
314
- if input_ids is not None and inputs_embeds is not None:
315
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
316
- elif input_ids is not None:
317
- batch_size, seq_length = input_ids.shape
318
- elif inputs_embeds is not None:
319
- batch_size, seq_length, _ = inputs_embeds.shape
320
- else:
321
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
322
-
323
- past_key_values_length = 0
324
-
325
- if use_cache:
326
- use_legacy_cache = not isinstance(past_key_values, Cache)
327
- if use_legacy_cache:
328
- past_key_values = DynamicCache.from_legacy_cache(past_key_values)
329
- past_key_values_length = past_key_values.get_usable_length(seq_length)
330
-
331
- if position_ids is None:
332
- device = input_ids.device if input_ids is not None else inputs_embeds.device
333
- position_ids = torch.arange(
334
- past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
335
- )
336
- position_ids = position_ids.unsqueeze(0)
337
-
338
- if inputs_embeds is None:
339
- inputs_embeds = self.model.model.embed_tokens(input_ids)
340
-
341
- inputs_embeds = self.model.model.embed_dropout(inputs_embeds)
342
-
343
- # Attention mask.
344
- if self.model.model._use_flash_attention_2:
345
- # 2d mask is passed through the layers
346
- attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
347
- else:
348
- # 4d mask is passed through the layers
349
- attention_mask = _prepare_4d_causal_attention_mask(
350
- attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
351
- )
352
- all_head_logits = []
353
- hidden_states = inputs_embeds
354
- is_early_exited = False
355
- for layer_idx, decoder_layer in enumerate(self.model.model.layers):
356
- layer_outputs = decoder_layer(
357
- hidden_states,
358
- attention_mask=attention_mask,
359
- position_ids=position_ids,
360
- past_key_value=past_key_values,
361
- use_cache=use_cache,
362
- )
363
-
364
- hidden_states = layer_outputs[0]
365
-
366
- if use_cache:
367
- next_decoder_cache = layer_outputs[1]
368
-
369
- if fixed_output_head is not None and layer_idx == fixed_output_head:
370
- # find postion of layer idx in branch_locations
371
- branch_idx = self.branch_locations.index(layer_idx)
372
- logits = self.model.heads[branch_idx](hidden_states)
373
- is_early_exited = True
374
- break
375
- elif fixed_output_head == -1 and layer_idx in self.branch_locations:
376
- # -1 means output all heads
377
- branch_idx = self.branch_locations.index(layer_idx)
378
- logits = self.model.heads[branch_idx](hidden_states)
379
- all_head_logits.append(logits)
380
-
381
- if not is_early_exited:
382
- hidden_states = self.model.model.final_layernorm(hidden_states)
383
- logits = self.model.lm_head(hidden_states)
384
- if fixed_output_head == -1:
385
- all_head_logits.append(logits)
386
- all_head_logits = torch.stack(all_head_logits, dim=0)
387
- next_cache = None
388
- if use_cache:
389
- next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
390
- if not return_dict:
391
- return tuple(v for v in [logits, next_cache] if v is not None)
392
-
393
- return CausalBranchyLLMOutputWithPast(
394
- logits=logits,
395
- head_outputs=all_head_logits,
396
- past_key_values=next_cache,
397
- )
398
-
399
- def forward_for_training(
400
- self,
401
- input_ids: torch.LongTensor = None,
402
- attention_mask: Optional[torch.Tensor] = None,
403
- position_ids: Optional[torch.LongTensor] = None,
404
- past_key_values: Optional[List[torch.FloatTensor]] = None,
405
- inputs_embeds: Optional[torch.FloatTensor] = None,
406
- labels: Optional[torch.LongTensor] = None,
407
- use_cache: Optional[bool] = None,
408
- output_attentions: Optional[bool] = None,
409
- output_hidden_states: Optional[bool] = None,
410
- return_dict: Optional[bool] = None,
411
- ):
412
-
413
- if not output_hidden_states:
414
- raise ValueError("output_hidden_states must be True for BranchyLLM")
415
- if labels is not None:
416
- raise NotImplementedError("BranchyLLM only supports self-supervision")
417
- outputs = self.model(
418
- input_ids=input_ids,
419
- attention_mask=attention_mask,
420
- position_ids=position_ids,
421
- past_key_values=past_key_values,
422
- inputs_embeds=inputs_embeds,
423
- use_cache=use_cache,
424
- output_attentions=output_attentions,
425
- output_hidden_states=output_hidden_states,
426
- return_dict=return_dict,
427
- )
428
- if not hasattr(outputs, "hidden_states") or outputs.hidden_states is None:
429
- raise ValueError("The model must return hidden states")
430
- hidden_states = outputs.hidden_states
431
-
432
-
433
- heads_logits = []
434
- for i, branch in enumerate(self.branch_locations):
435
- heads_logits.append(
436
- self.model.heads[i](
437
- hidden_states[branch]
438
- )
439
- )
440
- lm_logits = self.model.lm_head(hidden_states[-1])
441
-
442
- heads_logits = torch.stack(heads_logits, dim=0).float()
443
- lm_logits = lm_logits.float()
444
- logits = torch.cat([heads_logits, lm_logits.unsqueeze(0)], dim=0)
445
-
446
- loss = None
447
- lm_loss = None
448
- aux_loss = None
449
-
450
- losses = self.compute_self_supervision_loss(
451
- heads_logits, lm_logits, return_per_head=True
452
- )
453
- loss = losses["loss"]
454
- if losses["aux_loss"] is not None:
455
- aux_loss = losses["aux_loss"]
456
-
457
- if not return_dict:
458
- output = (logits,) + outputs[1:]
459
- return ((loss, aux_loss, lm_loss) + output) if loss is not None else output
460
-
461
- return CausalBranchyLLMOutputWithPast(
462
- loss=loss,
463
- lm_loss=lm_loss,
464
- head_loss=aux_loss,
465
- logits=logits,
466
- past_key_values=outputs.past_key_values,
467
- hidden_states=outputs.hidden_states,
468
- attentions=outputs.attentions,
469
- )
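
For reference, the deleted wrapper above was instantiated in the old app.py roughly like this (reconstructed from the removed lines of app.py; the old code pinned everything to cuda:1, which is noted in a comment rather than reproduced):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from src.BranchyModel import BranchyModel

base = AutoModelForCausalLM.from_pretrained("susnato/phi-1_5_dev")
tokenizer = AutoTokenizer.from_pretrained("susnato/phi-1_5_dev")

# One trainable exit head after every fifth layer; the base model itself stays frozen.
model = BranchyModel(branch_locations=list(range(0, 23, 5)), model=base)
model.load_state_dict(torch.load("model/model.bin", map_location="cpu"))  # old code used map_location="cuda:1"
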
src/utils.py DELETED
@@ -1,57 +0,0 @@
1
- import torch
2
-
3
- def generate_next_token(model, tokenizer, input, method='greedy'):
4
- """
5
- Generate the next token of a sequence using the given model and tokenizer.
6
- Specific for multi branched models.
7
- Only output token from last head.
8
-
9
- Args:
10
- model (torch.nn.Module): The model to use for generation.
11
- tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use for generation.
12
- input (str): The input text to generate from.
13
-
14
- Returns:
15
- token (str): The next token in the sequence.
16
- logits (torch.Tensor): The logits of the next token. of shape[Head, vocab_size]
17
- new_sequence (str): The new sequence after adding the next token.
18
- """
19
- device = model.device
20
- input_ids = tokenizer.encode(input, return_tensors="pt").to(device)
21
- model.eval()
22
- logits = model(input_ids, fixed_output_head=-1).head_outputs[..., -1, :].squeeze(1) # squeeze batch dimension as it is 1 new shape is (head_count, vocab_size)
23
- if logits.numel() == 0:
24
- raise ValueError("Model does not have head_outputs")
25
- if method == 'greedy':
26
- head_tokens = torch.argmax(logits, dim=-1)
27
- elif method == 'sample':
28
- head_tokens = torch.multinomial(torch.nn.functional.softmax(logits, dim=-1), num_samples=1)
29
- elif method == 'top_k':
30
- k = 5
31
- top_k = torch.topk(logits, k, dim=-1)
32
- top_k_logits, top_k_indices = top_k.values, top_k.indices
33
- top_k_probs = torch.nn.functional.softmax(top_k_logits, dim=-1)
34
- head_tokens = top_k_indices[torch.arange(top_k_probs.shape[0]), torch.multinomial(top_k_probs, num_samples=1).squeeze()]
35
- elif method == 'top_p':
36
- # logits is of shape [batch, vocab_size]
37
- p = 0.9
38
- probs = torch.nn.functional.softmax(logits, dim=-1)
39
- sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=-1)
40
- cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
41
- sorted_indices_to_remove = cumulative_probs > p
42
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
43
- sorted_indices_to_remove[..., 0] = 0
44
- indices_to_remove = sorted_indices[sorted_indices_to_remove]
45
- tmp_logits = logits.clone()
46
- for i in range(logits.shape[0]):
47
- tmp_logits[i, indices_to_remove[i]] = float('-inf')
48
- head_tokens = torch.multinomial(torch.nn.functional.softmax(tmp_logits, dim=-1), num_samples=1).squeeze()
49
- else:
50
- raise ValueError(f"Unknown method: {method}")
51
- head_tokens = tokenizer.batch_decode(head_tokens) # Treat head dim as batch dim
52
- new_sequence = input + head_tokens[-1]
53
- return head_tokens[-1], logits, new_sequence, head_tokens
54
-
55
-
56
- def breaking_ties(tensor):
57
- return torch.sub(torch.topk(tensor, 2, dim=-1).values[..., 0], torch.topk(tensor, 2, dim=-1).values[..., 1]).squeeze()
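
The removed breaking_ties helper computed the margin between the two highest logits, a common confidence score for deciding whether an exit head is sure enough to stop early; presumably the per-head thresholds set in the new app.py play the same role inside the Hub model, though that internal logic is not shown in this diff. A self-contained sketch of the margin computation:

import torch

def breaking_ties(logits: torch.Tensor) -> torch.Tensor:
    # Difference between the top-2 logit values: a large margin means a confident head.
    top2 = torch.topk(logits, 2, dim=-1).values
    return (top2[..., 0] - top2[..., 1]).squeeze()

print(breaking_ties(torch.tensor([10.0, 1.0, 0.5])))  # tensor(9.) -- peaked, confident
print(breaking_ties(torch.tensor([2.0, 1.9, 0.5])))   # tensor(0.1000) -- ambiguous
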