# Note: this targets the older transformers Llama implementation in which each
# LlamaAttention owns its own rotary_emb module and LlamaRotaryEmbedding builds
# its cos/sin cache via _set_cos_sin_cache; newer releases restructure RoPE.
from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaRotaryEmbedding
from transformers.models.llama.configuration_llama import LlamaConfig
import torch


class CodeLlamaConfig(LlamaConfig):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Default to the standard Llama rope_theta; override it only if a usable
        # value was passed in (Code Llama checkpoints use a much larger base).
        self.rope_theta = 10000.0
        if kwargs.get("rope_theta"):
            try:
                self.rope_theta = float(kwargs["rope_theta"])
                print(f"Rope theta set to {self.rope_theta}")
            except (TypeError, ValueError):
                print("Could not set rope theta properly, ensure it is a number")


class CodeLlamaNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    def __init__(self, dim, max_position_embeddings=2048, base=1000000.0, device=None, scaling_factor=1.0):
        # scaling_factor is kept for API parity with the HF scaling embeddings;
        # it is not used in the cache computation below.
        self.scaling_factor = scaling_factor
        self.base = base
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        # Recompute the inverse frequencies from the (larger) base so the cache
        # reflects the NTK-scaled rotary embedding.
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from the paper, but it uses a different permutation in order to obtain the same calculation.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)


class CodeLlamaForCausalLM(LlamaForCausalLM):
    _tied_weights_keys = ["lm_head.weight"]
    config_class = CodeLlamaConfig

    def __init__(self, config):
        super().__init__(config)
        # Swap each attention layer's rotary embedding for the NTK-scaled variant,
        # using the rope_theta carried by the config as the base.
        for layer in self.model.layers:
            attn = layer.self_attn
            head_dim = attn.head_dim
            max_embeddings = attn.max_position_embeddings
            base = config.rope_theta
            attn.rotary_emb = CodeLlamaNTKScalingRotaryEmbedding(head_dim, max_embeddings, base=base)
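

# Minimal usage sketch, not part of the original module: it only illustrates how
# the classes above could be wired together. Assumptions: a Code Llama checkpoint
# such as "codellama/CodeLlama-7b-hf" is reachable (name and rope_theta value are
# placeholders; adjust to your setup), and the matching older transformers version
# noted at the top of this file is installed.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    checkpoint = "codellama/CodeLlama-7b-hf"  # assumed checkpoint name

    # Build the config with an explicit rope_theta, then load weights into the
    # subclass so every attention layer gets the NTK-scaled rotary embedding.
    config = CodeLlamaConfig.from_pretrained(checkpoint, rope_theta=1000000.0)
    model = CodeLlamaForCausalLM.from_pretrained(checkpoint, config=config)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))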