Upload modeling_llama.py

modeling_llama.py  CHANGED  (+128 -72)
@@ -17,7 +17,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
+
 import bs4
+import loguru
 import math
 from typing import List, Optional, Tuple, Union
 
@@ -32,7 +35,6 @@ from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache, StaticCache
 from transformers.generation import GenerationMixin
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-from transformers.modeling_flash_attention_utils import _flash_attention_forward
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -50,6 +52,19 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
+try:
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+except ImportError as error:
+    loguru.logger.warning(
+        f"`flash-attention` package not found, consider installing for better performance: {error}."
+    )
+    if not _flash_supports_window_size:
+        loguru.logger.warning(
+            "Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`."
+        )
 from .configuration_llama import LlamaConfig
 from collections import defaultdict
 from typing import List, Tuple
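The hunk above guards the optional flash-attn dependency and records whether the installed build accepts a `window_size` argument. A quick standalone check in the same spirit (a sketch, not part of the uploaded file):

```python
# Sketch: verify that the installed flash-attn build exposes the `window_size`
# parameter that the sliding-window code path in this file relies on.
import inspect

try:
    from flash_attn import flash_attn_func

    supports_window_size = "window_size" in inspect.signature(flash_attn_func).parameters
    print(f"flash-attn found, window_size supported: {supports_window_size}")
except ImportError:
    print("flash-attn not installed; use attn_implementation='eager' instead")
```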
@@ -97,66 +112,6 @@ class TokenIdNode(Node):
         self.input_ids = kwargs.get('input_ids', [])
         self.prob = kwargs.get('prob', np.float32(0.0))
 
-
-def split_tree(soup: bs4.BeautifulSoup, max_node_words=0) -> List[Tuple[bs4.element.Tag, List[str], bool]]:
-    word_count = len(soup.get_text().split())
-    if word_count > max_node_words:
-        possible_trees = [(soup, [])]
-        target_trees = []  # [(tag, path, is_leaf)]
-        # split the entire dom tee into subtrees, until the length of the subtree is less than max_node_words words
-        # find all possible trees
-        while True:
-            if len(possible_trees) == 0:
-                break
-            tree = possible_trees.pop(0)
-            tag_children = defaultdict(int)
-            bare_word_count = 0
-            # count child tags
-            for child in tree[0].contents:
-                if isinstance(child, bs4.element.Tag):
-                    tag_children[child.name] += 1
-            _tag_children = {k: 0 for k in tag_children.keys()}
-
-            # check if the tree can be split
-            for child in tree[0].contents:
-                if isinstance(child, bs4.element.Tag):
-                    # change child tag with duplicate names
-                    if tag_children[child.name] > 1:
-                        new_name = f"{child.name}{_tag_children[child.name]}"
-                        new_tree = (child, tree[1] + [new_name])
-                        _tag_children[child.name] += 1
-                        child.name = new_name
-                    else:
-                        new_tree = (child, tree[1] + [child.name])
-                    word_count = len(child.get_text().split())
-                    # add node with more than max_node_words words, and recursion depth is less than 64
-                    if word_count > max_node_words and len(new_tree[1]) < 64:
-                        possible_trees.append(new_tree)
-                    else:
-                        target_trees.append((new_tree[0], new_tree[1], True))
-                else:
-                    bare_word_count += len(str(child).split())
-
-            # add leaf node
-            if len(tag_children) == 0:
-                target_trees.append((tree[0], tree[1], True))
-            # add node with more than max_node_words bare words
-            elif bare_word_count > max_node_words:
-                target_trees.append((tree[0], tree[1], False))
-    else:
-        soup_children = [c for c in soup.contents if isinstance(c, bs4.element.Tag)]
-        if len(soup_children) == 1:
-            target_trees = [(soup_children[0], [soup_children[0].name], True)]
-        else:
-            # add an html tag to wrap all children
-            new_soup = bs4.BeautifulSoup("", 'html.parser')
-            new_tag = new_soup.new_tag("html")
-            new_soup.append(new_tag)
-            for child in soup_children:
-                new_tag.append(child)
-            target_trees = [(new_tag, ["html"], True)]
-    return target_trees
-
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "LlamaConfig"
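With `split_tree` removed from the modeling file, the DOM-splitting step now happens outside the model; its result is supplied through the `block_tree` argument added further down. The structure the model expects is the same one `split_tree` used to return, `List[Tuple[bs4.element.Tag, List[str], bool]]`. A minimal hand-built example of that shape (illustrative only, not part of the upload):

```python
# Illustrative only: a per-document block tree has the same shape the removed
# split_tree() returned: a list of (tag, tag-name path, is_leaf) tuples.
import bs4

soup = bs4.BeautifulSoup("<html><body><p>hello world</p></body></html>", "html.parser")
p_tag = soup.find("p")

block_tree_for_doc = [
    (p_tag, ["html", "body", "p"], True),  # (bs4 Tag, path from the root, is_leaf flag)
]
```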
@@ -517,6 +472,107 @@ class LlamaFlashAttention2(LlamaAttention):
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
 
+    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward
+    def _flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_length,
+        dropout=0.0,
+        softmax_scale=None,
+        use_sliding_windows=False,
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`float`):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+            use_sliding_windows (`bool`, *optional*):
+                Whether to activate sliding window attention.
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            if not use_sliding_windows:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_in_batch_q,
+                    max_seqlen_k=max_seqlen_in_batch_k,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_in_batch_q,
+                    max_seqlen_k=max_seqlen_in_batch_k,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            if not use_sliding_windows:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+
+        return attn_output
+
     def forward(
         self,
         hidden_states: torch.Tensor,
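The varlen branch of the method added above describes the unpadded batch with cumulative sequence lengths. A small illustration of that layout (a sketch of the data format only, not the model's actual `_upad_input` helper):

```python
# Sketch: the cu_seqlens layout consumed by flash_attn_varlen_func above.
# Three unpadded sequences of lengths 3, 5 and 2 are concatenated along one
# axis and described by their cumulative offsets [0, 3, 8, 10].
import torch

seq_lens = torch.tensor([3, 5, 2], dtype=torch.int32)
cu_seqlens = torch.nn.functional.pad(torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0))
max_seqlen_in_batch = int(seq_lens.max())
print(cu_seqlens.tolist(), max_seqlen_in_batch)  # [0, 3, 8, 10] 5
```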
@@ -600,17 +656,16 @@ class LlamaFlashAttention2(LlamaAttention):
             key_states = key_states.to(target_dtype)
             value_states = value_states.to(target_dtype)
 
-        attn_output = _flash_attention_forward(
+
+        attn_output = self._flash_attention_forward(
             query_states,
             key_states,
             value_states,
             attention_mask,
             q_len,
-            position_ids=position_ids,
-            dropout=dropout_rate,
-            sliding_window=getattr(self, "sliding_window", None),
-            use_top_left_mask=self._flash_attn_uses_top_left_mask,
-            is_causal=self.is_causal,
+            dropout_rate,
+            None,
+            getattr(self, "sliding_window", None),
         )
 
         attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
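The replacement call passes its last three arguments positionally; against the signature added above they bind to `dropout`, `softmax_scale`, and `use_sliding_windows`. The equivalent keyword form of that same call, for readability (a sketch, not part of the upload):

```python
# Keyword-argument form of the call in this hunk (sketch only). Note that
# getattr(self, "sliding_window", None) binds to use_sliding_windows, so any
# non-None window size enables the sliding-window branch of the helper.
attn_output = self._flash_attention_forward(
    query_states,
    key_states,
    value_states,
    attention_mask,
    q_len,
    dropout=dropout_rate,
    softmax_scale=None,
    use_sliding_windows=getattr(self, "sliding_window", None),
)
```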
@@ -1752,6 +1807,7 @@ class LlamaForHTMLTreeGeneration(LlamaPreTrainedModel):
             tokenizer,
             query: List[str],
             htmls: List[List[str]],
+            block_tree: List[Tuple],
             **kwargs):
         max_seq_length = kwargs.pop("max_seq_length", 131072)
         def apply_html_tree_template(query, htmls):
@@ -1787,11 +1843,11 @@ class LlamaForHTMLTreeGeneration(LlamaPreTrainedModel):
                 soup.append(bs4.BeautifulSoup(html, 'html.parser'))
 
             token_id_paths = []
-            paths = split_tree(soup)
-            is_leaf = [p[2] for p in paths]
-            paths = [p[1] for p in paths]
+            _block_tree = block_tree[idx]
+            is_leaf = [p[2] for p in _block_tree]
+            _block_tree = [p[1] for p in _block_tree]
 
-            for path in paths:
+            for path in _block_tree:
                 path_str = "<" + "><".join(path) + ">"
                 token_ids = tokenizer.encode(path_str, add_special_tokens=False)
                 token_id_paths.append(token_ids)
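For reference, the loop above turns each tag-name path into a pseudo-tag string before tokenization; a tiny worked example:

```python
# Worked example of the path-string encoding used in the loop above.
path = ["html", "body", "p"]
path_str = "<" + "><".join(path) + ">"
assert path_str == "<html><body><p>"
```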
@@ -1849,7 +1905,7 @@ class LlamaForHTMLTreeGeneration(LlamaPreTrainedModel):
 
             res_html_refs.append({
                 "html": str(soup),
-                "paths": paths,
+                "paths": _block_tree,
                 "is_leaf": is_leaf,
                 "path_token_ids": token_id_paths,
                 "node_tree": list(TokenDotExporter(root, nodenamefunc=nodenamefunc))
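Taken together, the last three hunks change the calling convention of `LlamaForHTMLTreeGeneration`: the caller now supplies one precomputed block tree per input document instead of having the model split the DOM itself. A hedged usage sketch follows; the entry-point name `generate_html_tree`, the loaded `model`/`tokenizer` objects, and the example inputs are assumptions, since the diff only shows the parameter list `(tokenizer, query, htmls, block_tree, **kwargs)`:

```python
# Hedged sketch of the new calling convention. `model` and `tokenizer` are
# assumed to be an already-loaded LlamaForHTMLTreeGeneration and its tokenizer,
# and the method name `generate_html_tree` is an assumption.
import bs4

html = "<html><body><p>The Bellagio opened in 1998.</p></body></html>"
soup = bs4.BeautifulSoup(html, "html.parser")

# One block tree per query/document: (tag, tag-name path, is_leaf) tuples,
# the structure previously produced inside the model by split_tree().
block_tree = [[(soup.find("p"), ["html", "body", "p"], True)]]

outputs = model.generate_html_tree(
    tokenizer,
    ["when did the bellagio open?"],
    [[html]],
    block_tree,
    max_seq_length=131072,
)
```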