from transformers import PretrainedConfig

class BilmaConfig(PretrainedConfig):
    """Configuration for a Bilma model.

    Two modes are supported:

    * ``weights`` names a known pretrained weight set (currently only
      ``"MX"``): the architecture hyper-parameters are pinned to fixed values
      and any values passed by the caller are ignored.
    * ``weights`` is ``None``: the architecture hyper-parameters passed by the
      caller are stored as given.

    Args:
        weights: Name of the pretrained weight set (one of ``["MX"]``), or
            ``None`` to describe a custom architecture.
        include_top: Must be falsy in order to use ``add_head`` or
            ``pooling``.
        add_head: Optional head specification; only allowed when
            ``include_top`` is falsy. It is stored without further validation.
        pooling: Optional pooling mode, one of ``"mean"``, ``"cls"`` or
            ``"max"``; only allowed when ``include_top`` is falsy.
        num_attention_heads: Stored as given when ``weights`` is ``None``.
        num_hidden_layers: Stored as given when ``weights`` is ``None``.
        seq_max_length: Stored as given when ``weights`` is ``None``.
        hidden_size: Stored as given when ``weights`` is ``None``.
        vocab_size: Stored as given when ``weights`` is ``None``.
        hidden_dropout_prob: Stored as given when ``weights`` is ``None``.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``weights`` is neither ``None`` nor a known weight set,
            if ``add_head`` or ``pooling`` is given while ``include_top`` is
            truthy, or if ``pooling`` is not a supported mode.
    """

    model_type = "bilma"

    def __init__(
        self,
        weights="MX",
        include_top=True,
        add_head=None,
        pooling=None,
        num_attention_heads: int = 4,
        num_hidden_layers: int = 2,
        seq_max_length: int = 280,
        hidden_size: int = 512,
        vocab_size: int = 29025,
        hidden_dropout_prob: float = 0.1,
        **kwargs,
    ):
        countries = ["MX"]
        poolings = ["mean", "cls", "max"]

        # `None` is a legal value meaning "no pretrained weights"; rejecting it
        # here would make the custom-architecture arguments impossible to use.
        if weights is not None and weights not in countries:
            raise ValueError(
                f"`weights` must be one of {countries} or None, got {weights}."
            )
        if add_head is not None and include_top:
            raise ValueError("To add a head, 'include_top' must be False")
        if pooling is not None and include_top:
            raise ValueError("To specify a pooling, 'include_top' must be False")
        if pooling is not None and pooling not in poolings:
            raise ValueError(f"`pooling` must be one of {poolings}, got {pooling}.")

        if weights is not None:
            # A named weight set pins the architecture to these fixed values
            # (presumably the ones the checkpoint was trained with -- confirm
            # before adding new weight sets); caller-supplied values are
            # deliberately overridden.
            num_attention_heads = 4
            num_hidden_layers = 2
            seq_max_length = 280
            hidden_size = 512
            vocab_size = 29025
            hidden_dropout_prob = 0.1

        self.weights = weights
        self.include_top = include_top
        self.add_head = add_head
        self.pooling = pooling
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.seq_max_length = seq_max_length
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.hidden_dropout_prob = hidden_dropout_prob
        # Attributes are assigned before delegating, matching the original
        # initialisation order.
        super().__init__(**kwargs)