KaleiNeely committed on
Commit 3edf4ac
1 Parent(s): 1c0f950

Update modeling_rwkv5.py

Files changed (1)
  1. modeling_rwkv5.py +189 -149
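In short, this update moves the shape bookkeeping out of the call sites: rwkv_linear_attention, rwkv_linear_attention_v5_cpu and WKV_5 no longer take explicit B/H/S/T/n_head arguments and instead derive batch size, sequence length, head count and head size from the tensors they receive; the group-norm and output projection arguments are renamed to layer_norm_weight, layer_norm_bias and output_weight; RwkvBlock becomes Rwkv5Block; and a bitsandbytes 4-bit dequantize-and-rescale helper is added. A minimal sketch of the new shape derivation, with toy sizes not tied to any checkpoint:

import torch

def derive_shapes(hidden: torch.Tensor, time_decay: torch.Tensor):
    # hidden: (batch, seq_len, hidden_size); time_decay: (num_heads, head_size)
    batch, seq_len, hidden_size = hidden.shape
    num_heads = time_decay.shape[0]
    head_size = hidden_size // num_heads
    return batch, seq_len, num_heads, head_size

hidden = torch.randn(2, 7, 64)       # 2 sequences of 7 tokens, hidden size 64
time_decay = torch.zeros(4, 16)      # 4 heads of size 16
print(derive_shapes(hidden, time_decay))  # (2, 7, 4, 16)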
modeling_rwkv5.py CHANGED
@@ -16,6 +16,7 @@
16
  """PyTorch RWKV5 World model."""
17
 
18
  from dataclasses import dataclass
 
19
  from typing import List, Optional, Tuple, Union
20
 
21
  import torch
@@ -30,6 +31,7 @@ from transformers.utils import (
30
  add_code_sample_docstrings,
31
  add_start_docstrings,
32
  add_start_docstrings_to_model_forward,
 
33
  is_ninja_available,
34
  is_torch_cuda_available,
35
  logging,
@@ -52,6 +54,7 @@ RWKV5_PRETRAINED_MODEL_ARCHIVE_LIST = [
52
  rwkv5_cuda_kernel = None
53
 
54
 
 
55
  def load_wkv5_cuda_kernel(head_size):
56
  from torch.utils.cpp_extension import load as load_kernel
57
 
@@ -86,89 +89,108 @@ def load_wkv5_cuda_kernel(head_size):
86
 
87
  class WKV_5(torch.autograd.Function):
88
  @staticmethod
89
- def forward(ctx, B, T, C, H, r, k, v, w, u, s):
90
  with torch.no_grad():
91
- assert r.dtype == torch.bfloat16
92
- assert k.dtype == torch.bfloat16
93
- assert v.dtype == torch.bfloat16
94
- assert w.dtype == torch.bfloat16
95
- assert u.dtype == torch.bfloat16
96
- assert s.dtype == torch.float32
97
- ctx.B = B
98
- ctx.T = T
99
- ctx.C = C
100
- ctx.H = H
101
- assert r.is_contiguous()
102
- assert k.is_contiguous()
103
- assert v.is_contiguous()
104
- assert w.is_contiguous()
105
- assert u.is_contiguous()
106
- ew = (-torch.exp(w.float())).contiguous()
107
- eew = (torch.exp(ew)).contiguous()
108
- ctx.save_for_backward(r, k, v, eew, ew, u)
109
- y = torch.empty(
110
- (B, T, C), device=r.device, dtype=torch.bfloat16, memory_format=torch.contiguous_format
111
- ) # .uniform_(-1, 1)
112
- rwkv5_cuda_kernel.forward(B, T, C, H, r, k, v, eew, u, y, s)
113
- return y, s
114
 
115
  @staticmethod
116
- def backward(ctx, gy):
117
  with torch.no_grad():
118
- assert gy.dtype == torch.bfloat16
119
- B = ctx.B
120
- T = ctx.T
121
- C = ctx.C
122
- H = ctx.H
123
- assert gy.is_contiguous()
124
- r, k, v, eew, ew, u = ctx.saved_tensors
125
- gr = torch.empty(
126
- (B, T, C),
127
- device=gy.device,
128
  requires_grad=False,
129
  dtype=torch.bfloat16,
130
  memory_format=torch.contiguous_format,
131
- ) # .uniform_(-1, 1)
132
- gk = torch.empty(
133
- (B, T, C),
134
- device=gy.device,
135
  requires_grad=False,
136
  dtype=torch.bfloat16,
137
  memory_format=torch.contiguous_format,
138
- ) # .uniform_(-1, 1)
139
- gv = torch.empty(
140
- (B, T, C),
141
- device=gy.device,
142
  requires_grad=False,
143
  dtype=torch.bfloat16,
144
  memory_format=torch.contiguous_format,
145
- ) # .uniform_(-1, 1)
146
- gw = torch.empty(
147
- (B, C),
148
- device=gy.device,
149
  requires_grad=False,
150
  dtype=torch.bfloat16,
151
  memory_format=torch.contiguous_format,
152
- ) # .uniform_(-1, 1)
153
- gu = torch.empty(
154
- (B, C),
155
- device=gy.device,
156
  requires_grad=False,
157
  dtype=torch.bfloat16,
158
  memory_format=torch.contiguous_format,
159
- ) # .uniform_(-1, 1)
160
- rwkv5_cuda_kernel.backward(B, T, C, H, r, k, v, eew, ew, u, gy, gr, gk, gv, gw, gu)
161
- gw = torch.sum(gw, 0).view(H, C // H)
162
- gu = torch.sum(gu, 0).view(H, C // H)
163
- return (None, None, None, None, gr, gk, gv, gw, gu)
164
 
165
 
166
  def rwkv_linear_attention_v5_cpu(
167
- B,
168
- H,
169
- S,
170
- T,
171
- n_head,
172
  hidden,
173
  time_decay,
174
  time_first,
@@ -176,20 +198,24 @@ def rwkv_linear_attention_v5_cpu(
176
  key,
177
  value,
178
  gate,
179
- lxw,
180
- lxb,
181
- ow,
182
  state,
183
  ):
184
- key = key.to(torch.float32).view(B, T, H, S).transpose(1, 2).transpose(-2, -1)
185
- value = value.to(torch.float32).view(B, T, H, S).transpose(1, 2)
186
- receptance = receptance.to(torch.float32).view(B, T, H, S).transpose(1, 2)
187
- time_decay = torch.exp(-torch.exp(time_decay.float())).reshape(-1, 1, 1).reshape(n_head, -1, 1)
188
- time_first = time_first.float().reshape(-1, 1, 1).reshape(n_head, -1, 1)
189
- lxw = lxw.float()
190
- lxb = lxb.float()
191
- out = torch.zeros_like(key).reshape(B, T, H, S)
192
- for t in range(T):
193
  rt = receptance[:, :, t : t + 1, :]
194
  kt = key[:, :, :, t : t + 1]
195
  vt = value[:, :, t : t + 1, :]
@@ -198,20 +224,17 @@ def rwkv_linear_attention_v5_cpu(
198
  with torch.no_grad():
199
  state = at + time_decay * state
200
 
201
- out = out.reshape(B * T, H * S)
202
- out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H * S)
 
 
203
  out = out.to(dtype=hidden.dtype) * gate
204
- out = out @ ow
205
 
206
  return out, state
207
 
208
 
209
  def rwkv_linear_attention(
210
- B,
211
- H,
212
- S,
213
- T,
214
- n_head,
215
  hidden,
216
  time_decay,
217
  time_first,
@@ -219,22 +242,21 @@ def rwkv_linear_attention(
219
  key,
220
  value,
221
  gate,
222
- lxw,
223
- lxb,
224
- ow,
225
  state,
226
  ):
227
  no_cuda = any(t.device.type != "cuda" for t in [time_decay, time_first, receptance, key, value])
228
  # Launching the CUDA kernel for just one token will actually be slower (there is no for loop in the CPU version
229
  # in this case).
230
  one_token = key.size(1) == 1
231
  if rwkv5_cuda_kernel is None or no_cuda or one_token:
232
  return rwkv_linear_attention_v5_cpu(
233
- B,
234
- H,
235
- S,
236
- T,
237
- n_head,
238
  hidden,
239
  time_decay,
240
  time_first,
@@ -242,17 +264,30 @@ def rwkv_linear_attention(
242
  key,
243
  value,
244
  gate,
245
- lxw,
246
- lxb,
247
- ow,
248
  state,
249
  )
250
  else:
251
- out, state = WKV_5.apply(B, T, H * S, H, receptance, key, value, time_decay, time_first, state)
252
- out = out.reshape(B * T, H * S)
253
- out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H * S)
254
  out = out.to(dtype=hidden.dtype) * gate
255
- out = out @ ow
256
  return out, state
257
 
258
 
@@ -268,7 +303,6 @@ class RwkvSelfAttention(nn.Module):
268
  logger.info("Could not load the custom CUDA kernel for RWKV5 attention.")
269
  self.layer_id = layer_id
270
  hidden_size = config.hidden_size
271
- # https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/src/model.py#L146
272
  num_attention_heads = hidden_size // config.head_size
273
  self.num_attention_heads = num_attention_heads
274
  attention_hidden_size = (
@@ -290,11 +324,9 @@ class RwkvSelfAttention(nn.Module):
290
  self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False)
291
  self.gate = nn.Linear(hidden_size, attention_hidden_size, bias=False)
292
  self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False)
293
- # https://github.com/BlinkDL/RWKV-LM/blob/3db37a72356b736966ddd377268f02b80963af3f/RWKV-v4neo/src/model.py#L190C1-L190C1
294
  self.ln_x = nn.GroupNorm(hidden_size // config.head_size, hidden_size)
295
 
296
- # TODO: maybe jit, otherwise move inside forward
297
- def extract_key_value(self, B, H, S, T, hidden, state=None):
298
  # Mix hidden with the previous timestep to produce key, value, receptance
299
  if hidden.size(1) == 1 and state is not None:
300
  shifted = state[0][:, :, self.layer_id]
@@ -309,7 +341,6 @@ class RwkvSelfAttention(nn.Module):
309
  receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
310
  gate = hidden * self.time_mix_gate + shifted * (1 - self.time_mix_gate)
311
 
312
- # https://github.com/BlinkDL/ChatRWKV/blob/main/rwkv_pip_package/src/rwkv/model.py#L693
313
  key = self.key(key)
314
  value = self.value(value)
315
  receptance = self.receptance(receptance)
@@ -321,19 +352,9 @@ class RwkvSelfAttention(nn.Module):
321
  return receptance, key, value, gate, state
322
 
323
  def forward(self, hidden, state=None, use_cache=False, seq_mode=True):
324
- B = hidden.shape[0]
325
- H = self.time_decay.shape[0]
326
- S = hidden.shape[-1] // H
327
- T = hidden.shape[1]
328
-
329
- receptance, key, value, gate, state = self.extract_key_value(B, H, S, T, hidden, state=state)
330
  layer_state = state[1][:, :, :, :, self.layer_id] if state is not None else None
331
  rwkv, layer_state = rwkv_linear_attention(
332
- B,
333
- H,
334
- S,
335
- T,
336
- self.num_attention_heads,
337
  hidden,
338
  self.time_decay,
339
  self.time_faaaa,
@@ -359,7 +380,6 @@ class RwkvFeedForward(nn.Module):
359
  self.config = config
360
  self.layer_id = layer_id
361
  hidden_size = config.hidden_size
362
- # https://github.com/BlinkDL/RWKV-LM/blob/3db37a72356b736966ddd377268f02b80963af3f/RWKV-v4neo/train.py#L168
363
  intermediate_size = (
364
  config.intermediate_size
365
  if config.intermediate_size is not None
@@ -396,7 +416,8 @@ class RwkvFeedForward(nn.Module):
396
  return receptance * value, state
397
 
398
 
399
- class RwkvBlock(nn.Module):
 
400
  def __init__(self, config, layer_id):
401
  super().__init__()
402
  self.config = config
@@ -437,7 +458,7 @@ class Rwkv5PreTrainedModel(PreTrainedModel):
437
 
438
  config_class = Rwkv5Config
439
  base_model_prefix = "rwkv"
440
- _no_split_modules = ["RwkvBlock"]
441
  _keep_in_fp32_modules = ["time_decay", "time_first"]
442
  supports_gradient_checkpointing = True
443
 
@@ -460,7 +481,6 @@ class Rwkv5PreTrainedModel(PreTrainedModel):
460
  )
461
  time_weight = time_weight[None, None, :]
462
 
463
- # https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/src/model.py#L398
464
  decay_speed = [
465
  -6.0 + 5.0 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
466
  for h in range(attention_hidden_size)
@@ -503,6 +523,7 @@ class Rwkv5PreTrainedModel(PreTrainedModel):
503
  module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0)
504
 
505
 
 
506
  @dataclass
507
  class Rwkv5Output(ModelOutput):
508
  """
@@ -530,6 +551,7 @@ class Rwkv5Output(ModelOutput):
530
  attentions: Optional[Tuple[torch.FloatTensor]] = None
531
 
532
 
 
533
  @dataclass
534
  class Rwkv5CausalLMOutput(ModelOutput):
535
  """
@@ -611,7 +633,7 @@ class Rwkv5Model(Rwkv5PreTrainedModel):
611
  super().__init__(config)
612
 
613
  self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
614
- self.blocks = nn.ModuleList([RwkvBlock(config, layer_id=idx) for idx in range(config.num_hidden_layers)])
615
  self.ln_out = nn.LayerNorm(config.hidden_size)
616
 
617
  self.layers_are_rescaled = False
@@ -665,39 +687,35 @@ class Rwkv5Model(Rwkv5PreTrainedModel):
665
  inputs_embeds = self.embeddings(input_ids)
666
 
667
  if use_cache and state is None:
668
- # https://github.com/BlinkDL/ChatRWKV/blob/main/rwkv_pip_package/src/rwkv/model.py#L904-L906
669
  state = []
670
  num_attention_heads = self.config.hidden_size // self.config.num_attention_heads
671
- state.append(
672
- torch.zeros(
673
- (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers),
674
- dtype=inputs_embeds.dtype,
675
- requires_grad=False,
676
- device=inputs_embeds.device,
677
- ).contiguous()
678
- )
679
- state.append(
680
- torch.zeros(
681
- (
682
- inputs_embeds.size(0),
683
- num_attention_heads,
684
- self.config.hidden_size // num_attention_heads,
685
- self.config.hidden_size // num_attention_heads,
686
- self.config.num_hidden_layers,
687
- ),
688
- dtype=torch.float32,
689
- requires_grad=False,
690
- device=inputs_embeds.device,
691
- ).contiguous()
692
- )
693
- state.append(
694
- torch.zeros(
695
- (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers),
696
- dtype=inputs_embeds.dtype,
697
- requires_grad=False,
698
- device=inputs_embeds.device,
699
- ).contiguous()
700
- )
701
 
702
  seq_mode = inputs_embeds.shape[1] > 1
703
  hidden_states = inputs_embeds
@@ -752,10 +770,32 @@ class Rwkv5Model(Rwkv5PreTrainedModel):
752
 
753
  self.layers_are_rescaled = not self.training
754
 
755
 
 
756
  @add_start_docstrings(
757
  """
758
- The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
759
  embeddings).
760
  """,
761
  RWKV_START_DOCSTRING,
 
16
  """PyTorch RWKV5 World model."""
17
 
18
  from dataclasses import dataclass
19
+ from pathlib import Path
20
  from typing import List, Optional, Tuple, Union
21
 
22
  import torch
 
31
  add_code_sample_docstrings,
32
  add_start_docstrings,
33
  add_start_docstrings_to_model_forward,
34
+ is_bitsandbytes_available,
35
  is_ninja_available,
36
  is_torch_cuda_available,
37
  logging,
 
54
  rwkv5_cuda_kernel = None
55
 
56
 
57
+ # Copied from https://github.com/huggingface/transformers/blob/18cbaf13dcaca7145f5652aefb9b19734c56c3cd/src/transformers/models/rwkv/modeling_rwkv.py#L65
58
  def load_wkv5_cuda_kernel(head_size):
59
  from torch.utils.cpp_extension import load as load_kernel
60
 
 
89
 
90
  class WKV_5(torch.autograd.Function):
91
  @staticmethod
92
+ def forward(ctx, receptance, key, value, time_decay, time_first, state):
93
  with torch.no_grad():
94
+ Batch = key.shape[0]
95
+ SequenceLength = key.shape[1]
96
+ HiddenSize = key.shape[2]
97
+ HeadSize = HiddenSize // time_decay.shape[0]
98
+ ctx.Batch = Batch
99
+ ctx.SequenceLength = SequenceLength
100
+ ctx.HiddenSize = HiddenSize
101
+ ctx.HeadSize = HeadSize
102
+ e_time_decay = (-torch.exp(time_decay.float())).contiguous()
103
+ ee_time_decay = (torch.exp(e_time_decay)).contiguous()
104
+ ctx.save_for_backward(receptance, key, value, ee_time_decay, e_time_decay, time_first)
105
+ out = torch.empty(
106
+ (Batch, SequenceLength, HiddenSize),
107
+ device=receptance.device,
108
+ dtype=torch.bfloat16,
109
+ memory_format=torch.contiguous_format,
110
+ )
111
+ rwkv5_cuda_kernel.forward(
112
+ Batch,
113
+ SequenceLength,
114
+ HiddenSize,
115
+ HeadSize,
116
+ receptance,
117
+ key,
118
+ value,
119
+ ee_time_decay,
120
+ time_first,
121
+ out,
122
+ state,
123
+ )
124
+ return out, state
125
 
126
  @staticmethod
127
+ def backward(ctx, gout):
128
  with torch.no_grad():
129
+ assert gout.dtype == torch.bfloat16
130
+ Batch = ctx.Batch
131
+ SequenceLength = ctx.SequenceLength
132
+ HiddenSize = ctx.HiddenSize
133
+ HeadSize = ctx.HeadSize
134
+ receptance, key, value, ee_time_decay, e_time_decay, time_first = ctx.saved_tensors
135
+ greceptance = torch.empty(
136
+ (Batch, SequenceLength, HiddenSize),
137
+ device=gout.device,
 
138
  requires_grad=False,
139
  dtype=torch.bfloat16,
140
  memory_format=torch.contiguous_format,
141
+ )
142
+ g_key = torch.empty(
143
+ (Batch, SequenceLength, HiddenSize),
144
+ device=gout.device,
145
  requires_grad=False,
146
  dtype=torch.bfloat16,
147
  memory_format=torch.contiguous_format,
148
+ )
149
+ g_value = torch.empty(
150
+ (Batch, SequenceLength, HiddenSize),
151
+ device=gout.device,
152
  requires_grad=False,
153
  dtype=torch.bfloat16,
154
  memory_format=torch.contiguous_format,
155
+ )
156
+ g_time_decay = torch.empty(
157
+ (Batch, HiddenSize),
158
+ device=gout.device,
159
  requires_grad=False,
160
  dtype=torch.bfloat16,
161
  memory_format=torch.contiguous_format,
162
+ )
163
+ g_time_first = torch.empty(
164
+ (Batch, HiddenSize),
165
+ device=gout.device,
166
  requires_grad=False,
167
  dtype=torch.bfloat16,
168
  memory_format=torch.contiguous_format,
169
+ )
170
+ rwkv5_cuda_kernel.backward(
171
+ Batch,
172
+ SequenceLength,
173
+ HiddenSize,
174
+ HeadSize,
175
+ receptance,
176
+ key,
177
+ value,
178
+ ee_time_decay,
179
+ e_time_decay,
180
+ time_first,
181
+ gout,
182
+ greceptance,
183
+ g_key,
184
+ g_value,
185
+ g_time_decay,
186
+ g_time_first,
187
+ )
188
+ g_time_decay = torch.sum(g_time_decay, 0).view(HeadSize, HiddenSize // HeadSize)
189
+ g_time_first = torch.sum(g_time_first, 0).view(HeadSize, HiddenSize // HeadSize)
190
+ return (greceptance, g_key, g_value, g_time_decay, g_time_first, None)
191
 
192
 
193
  def rwkv_linear_attention_v5_cpu(
194
  hidden,
195
  time_decay,
196
  time_first,
 
198
  key,
199
  value,
200
  gate,
201
+ layer_norm_weight,
202
+ layer_norm_bias,
203
+ output_weight,
204
  state,
205
  ):
206
+ Batch = hidden.shape[0]
207
+ AttentionHeads = time_decay.shape[0]
208
+ HeadSize = hidden.shape[-1] // AttentionHeads
209
+ SequenceLength = hidden.shape[1]
210
+ key = key.to(torch.float32).view(Batch, SequenceLength, AttentionHeads, HeadSize).transpose(1, 2).transpose(-2, -1)
211
+ value = value.to(torch.float32).view(Batch, SequenceLength, AttentionHeads, HeadSize).transpose(1, 2)
212
+ receptance = receptance.to(torch.float32).view(Batch, SequenceLength, AttentionHeads, HeadSize).transpose(1, 2)
213
+ time_decay = torch.exp(-torch.exp(time_decay.float())).reshape(-1, 1, 1).reshape(AttentionHeads, -1, 1)
214
+ time_first = time_first.float().reshape(-1, 1, 1).reshape(AttentionHeads, -1, 1)
215
+ layer_norm_weight = layer_norm_weight.float()
216
+ layer_norm_bias = layer_norm_bias.float()
217
+ out = torch.zeros_like(key).reshape(Batch, SequenceLength, AttentionHeads, HeadSize)
218
+ for t in range(SequenceLength):
219
  rt = receptance[:, :, t : t + 1, :]
220
  kt = key[:, :, :, t : t + 1]
221
  vt = value[:, :, t : t + 1, :]
 
224
  with torch.no_grad():
225
  state = at + time_decay * state
226
 
227
+ out = out.reshape(Batch * SequenceLength, AttentionHeads * HeadSize)
228
+ out = F.group_norm(out, num_groups=AttentionHeads, weight=layer_norm_weight, bias=layer_norm_bias).reshape(
229
+ Batch, SequenceLength, AttentionHeads * HeadSize
230
+ )
231
  out = out.to(dtype=hidden.dtype) * gate
232
+ out = out @ output_weight
233
 
234
  return out, state
235
 
236
 
237
  def rwkv_linear_attention(
238
  hidden,
239
  time_decay,
240
  time_first,
 
242
  key,
243
  value,
244
  gate,
245
+ layer_norm_weight,
246
+ layer_norm_bias,
247
+ output_weight,
248
  state,
249
  ):
250
+ Batch = hidden.shape[0]
251
+ AttentionHeads = time_decay.shape[0]
252
+ HeadSize = hidden.shape[-1] // AttentionHeads
253
+ SequenceLength = hidden.shape[1]
254
  no_cuda = any(t.device.type != "cuda" for t in [time_decay, time_first, receptance, key, value])
255
  # Launching the CUDA kernel for just one token will actually be slower (there is no for loop in the CPU version
256
  # in this case).
257
  one_token = key.size(1) == 1
258
  if rwkv5_cuda_kernel is None or no_cuda or one_token:
259
  return rwkv_linear_attention_v5_cpu(
260
  hidden,
261
  time_decay,
262
  time_first,
 
264
  key,
265
  value,
266
  gate,
267
+ layer_norm_weight,
268
+ layer_norm_bias,
269
+ output_weight,
270
  state,
271
  )
272
  else:
273
+ out, state = WKV_5.apply(
+ receptance,
+ key,
+ value,
+ time_decay,
+ time_first,
+ state,
+ )
285
+ out = out.reshape(Batch * SequenceLength, AttentionHeads * HeadSize)
286
+ out = F.group_norm(out, num_groups=AttentionHeads, weight=layer_norm_weight, bias=layer_norm_bias).reshape(
287
+ Batch, SequenceLength, AttentionHeads * HeadSize
288
+ )
289
  out = out.to(dtype=hidden.dtype) * gate
290
+ out = out @ output_weight
291
  return out, state
292
 
293
 
 
303
  logger.info("Could not load the custom CUDA kernel for RWKV5 attention.")
304
  self.layer_id = layer_id
305
  hidden_size = config.hidden_size
 
306
  num_attention_heads = hidden_size // config.head_size
307
  self.num_attention_heads = num_attention_heads
308
  attention_hidden_size = (
 
324
  self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False)
325
  self.gate = nn.Linear(hidden_size, attention_hidden_size, bias=False)
326
  self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False)
 
327
  self.ln_x = nn.GroupNorm(hidden_size // config.head_size, hidden_size)
328
 
329
+ def extract_key_value(self, hidden, state=None):
 
330
  # Mix hidden with the previous timestep to produce key, value, receptance
331
  if hidden.size(1) == 1 and state is not None:
332
  shifted = state[0][:, :, self.layer_id]
 
341
  receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
342
  gate = hidden * self.time_mix_gate + shifted * (1 - self.time_mix_gate)
343
 
 
344
  key = self.key(key)
345
  value = self.value(value)
346
  receptance = self.receptance(receptance)
 
352
  return receptance, key, value, gate, state
353
 
354
  def forward(self, hidden, state=None, use_cache=False, seq_mode=True):
355
+ receptance, key, value, gate, state = self.extract_key_value(hidden, state=state)
356
  layer_state = state[1][:, :, :, :, self.layer_id] if state is not None else None
357
  rwkv, layer_state = rwkv_linear_attention(
358
  hidden,
359
  self.time_decay,
360
  self.time_faaaa,
 
380
  self.config = config
381
  self.layer_id = layer_id
382
  hidden_size = config.hidden_size
 
383
  intermediate_size = (
384
  config.intermediate_size
385
  if config.intermediate_size is not None
 
416
  return receptance * value, state
417
 
418
 
419
+ # copied from HuggingFace https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py
420
+ class Rwkv5Block(nn.Module):
421
  def __init__(self, config, layer_id):
422
  super().__init__()
423
  self.config = config
 
458
 
459
  config_class = Rwkv5Config
460
  base_model_prefix = "rwkv"
461
+ _no_split_modules = ["Rwkv5Block"]
462
  _keep_in_fp32_modules = ["time_decay", "time_first"]
463
  supports_gradient_checkpointing = True
464
 
 
481
  )
482
  time_weight = time_weight[None, None, :]
483
 
 
484
  decay_speed = [
485
  -6.0 + 5.0 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
486
  for h in range(attention_hidden_size)
 
523
  module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0)
524
 
525
 
526
+ # copied from HuggingFace https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py
527
  @dataclass
528
  class Rwkv5Output(ModelOutput):
529
  """
 
551
  attentions: Optional[Tuple[torch.FloatTensor]] = None
552
 
553
 
554
+ # copied from HuggingFace https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py
555
  @dataclass
556
  class Rwkv5CausalLMOutput(ModelOutput):
557
  """
 
633
  super().__init__(config)
634
 
635
  self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
636
+ self.blocks = nn.ModuleList([Rwkv5Block(config, layer_id=idx) for idx in range(config.num_hidden_layers)])
637
  self.ln_out = nn.LayerNorm(config.hidden_size)
638
 
639
  self.layers_are_rescaled = False
 
687
  inputs_embeds = self.embeddings(input_ids)
688
 
689
  if use_cache and state is None:
 
690
  state = []
691
  num_attention_heads = self.config.hidden_size // self.config.num_attention_heads
692
+ state_attn_x = torch.zeros(
693
+ (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers),
694
+ dtype=inputs_embeds.dtype,
695
+ requires_grad=False,
696
+ device=inputs_embeds.device,
697
+ ).contiguous()
698
+ state_attn_kv = torch.zeros(
699
+ (
700
+ inputs_embeds.size(0),
701
+ num_attention_heads,
702
+ self.config.hidden_size // num_attention_heads,
703
+ self.config.hidden_size // num_attention_heads,
704
+ self.config.num_hidden_layers,
705
+ ),
706
+ dtype=torch.float32,
707
+ requires_grad=False,
708
+ device=inputs_embeds.device,
709
+ ).contiguous()
710
+ state_ffn_x = torch.zeros(
711
+ (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers),
712
+ dtype=inputs_embeds.dtype,
713
+ requires_grad=False,
714
+ device=inputs_embeds.device,
715
+ ).contiguous()
716
+ state.append(state_attn_x)
717
+ state.append(state_attn_kv)
718
+ state.append(state_ffn_x)
719
 
720
  seq_mode = inputs_embeds.shape[1] > 1
721
  hidden_states = inputs_embeds
 
770
 
771
  self.layers_are_rescaled = not self.training
772
 
773
+ def _bnb_4bit_dequantize_and_rescale(self, target_layer, block_id):
774
+ r"""
775
+ Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
776
+ be quantized again.
777
+ """
778
+ if not is_bitsandbytes_available():
779
+ raise ImportError("Please install bitsandbytes to use this method.")
780
+ import bitsandbytes as bnb
781
+
782
+ dequant_weights = bnb.functional.dequantize_4bit(target_layer.weight.data, target_layer.weight.quant_state)
783
+
784
+ dequant_weights.div_(2 ** int(block_id // self.config.rescale_every))
785
+
786
+ # re-quantize the model:
787
+ # we need to put it first on CPU then back to the device
788
+ # this will create an overhead :/
789
+ # We set requires_grad=False as we cannot compute gradients on top of 4bit parameters anyway and to avoid
790
+ # bugs with bnb
791
+ quant_weight = bnb.nn.Params4bit(dequant_weights.to("cpu"), requires_grad=False).to(dequant_weights.device)
792
+ setattr(target_layer, "weight", quant_weight)
793
+
794
 
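A minimal sketch of the divisor used in _bnb_4bit_dequantize_and_rescale above, 2 ** int(block_id // config.rescale_every), assuming for example rescale_every = 6 (the real value is read from the model config):

rescale_every = 6                     # example value, taken from config in the actual code
for block_id in (0, 5, 6, 11, 12):
    print(block_id, 2 ** int(block_id // rescale_every))
# blocks 0-5 leave the dequantized weights unscaled, blocks 6-11 divide them by 2, blocks 12-17 by 4, ...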
795
+ # copied from HuggingFace https://github.com/huggingface/transformers/blob/main/src/transformers/models/rwkv/modeling_rwkv.py
796
  @add_start_docstrings(
797
  """
798
+ The RWKV5 Model transformer with a language modeling head on top (linear layer with weights tied to the input
799
  embeddings).
800
  """,
801
  RWKV_START_DOCSTRING,
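For reference, a minimal sketch of the recurrent state that the updated Rwkv5Model.forward allocates when use_cache=True and no state is passed in (toy sizes; the real dimensions come from the model config, and only the kv matrix state is kept in float32):

import torch

batch, hidden_size, num_heads, num_layers = 1, 64, 4, 2
head_size = hidden_size // num_heads

state_attn_x = torch.zeros(batch, hidden_size, num_layers)           # token-shift buffer for attention
state_attn_kv = torch.zeros(                                         # per-head kv matrix state
    batch, num_heads, head_size, head_size, num_layers, dtype=torch.float32
)
state_ffn_x = torch.zeros(batch, hidden_size, num_layers)            # token-shift buffer for the feed-forward
state = [state_attn_x, state_attn_kv, state_ffn_x]
print([tuple(s.shape) for s in state])
# [(1, 64, 2), (1, 4, 16, 16, 2), (1, 64, 2)]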