keras-chatbot-arena

Runtime error

File size: 4,981 Bytes

import keras
import keras_hub

model_presets = [
    # 8B params models
    "hf://google/gemma-2-instruct-9b-keras",
    "hf://meta-llama/Llama-3.1-8B-Instruct",
    "hf://google/codegemma-7b-it-keras",
    "hf://keras/mistral_instruct_7b_en",
    "hf://keras/vicuna_1.5_7b_en",
    # "keras/gemma_1.1_instruct_7b_en", # won't fit?
    # 1-3B params models
    "hf://meta-llama/Llama-3.2-1B-Instruct",
    "hf://google/gemma-2b-it-keras",
    "hf://meta-llama/Llama-3.2-3B-Instruct",
]

model_labels = map(lambda s: s.removeprefix("hf://"), model_presets)
model_labels = map(lambda s: s.removeprefix("google/"), model_labels)
model_labels = map(lambda s: s.removeprefix("keras/"), model_labels)
model_labels = map(lambda s: s.removeprefix("meta-llama/"), model_labels)


def preset_to_website_url(preset):
    preset = preset.removeprefix("hf://")
    url = "http://huggingface.co/" + preset
    return url


def get_appropriate_chat_template(preset):
    return "Vicuna" if "vicuna" in preset else "auto"


def get_default_layout_map(preset_name, device_mesh):
    # Llama's default layout map works for mistral and vicuna
    # because their transformer layers have the same names.
    if (
        "Llama" in preset_name
        or "mistral" in preset_name
        or "vicuna" in preset_name
    ):
        layout_map = keras_hub.models.Llama3Backbone.get_layout_map(device_mesh)
        # Default layout map patch:
        # This line is missing for some Llama models (TODO: fix this in keras_hub)
        layout_map["token_embedding/reverse_embeddings"] = ("batch", "model")
        return layout_map

    elif "gemma" in preset_name:
        layout_map = keras_hub.models.GemmaBackbone.get_layout_map(device_mesh)

        if "gemma-2b-" in preset_name:
            # Default layout map patch:
            # Gemma QKV weigts are shaped [NB_HEADS, EMBED_DIM, INNER_DIM]
            # Llama QKV weights are shaped [EMBED_DIM, NB_HEADS, INNER_DIM]
            # However:
            # The default layout map for KQV weights on Gemma is: (model_dim,data_dim,None)
            # Which means sharding NB_HEADS on the "model" dimension.
            # But gemma-2b-it-keras has only 1 head so this won't work: must patch it
            # TODO: fix this in the Gemma layout map in Keras hub.
            patch_key = "decoder_block.*attention.*(query|key|value).kernel"
            layout_map.pop(patch_key)
            layout_map[patch_key] = (None, "model", "batch")

        return layout_map


def log_applied_layout_map(model):
    print("Model class:", type(model).__name__)

    if "Gemma" in type(model).__name__:
        transformer_decoder_block_name = "decoder_block_1"
    elif "Llama" in type(model).__name__:  # works for Llama (Vicuna) and Llama3
        transformer_decoder_block_name = "transformer_layer_1"
    elif "Mistral" in type(model).__name__:
        transformer_decoder_block_name = "transformer_layer_1"
    else:
        print("Unknown architecture. Cannot display the applied layout.")
        return

    # See how layer sharding was applied
    embedding_layer = model.backbone.get_layer("token_embedding")
    print(embedding_layer)
    decoder_block = model.backbone.get_layer(transformer_decoder_block_name)
    print(type(decoder_block))
    for variable in embedding_layer.weights + decoder_block.weights:
        print(
            f"{variable.path:<58}  \
                {str(variable.shape):<16}  \
                {str(variable.value.sharding.spec):<35} \
                {str(variable.dtype)}"
        )


def load_model(preset):
    devices = keras.distribution.list_devices()
    device_mesh = keras.distribution.DeviceMesh(
        shape=(1, len(devices)), axis_names=["batch", "model"], devices=devices
    )
    model_parallel = keras.distribution.ModelParallel(
        layout_map=get_default_layout_map(preset, device_mesh),
        batch_dim_name="batch",
    )

    with model_parallel.scope():
        # These two buggy models need this workaround to be loaded in bfloat16
        if "google/gemma-2-instruct-9b-keras" in preset:
            model = keras_hub.models.GemmaCausalLM(
                backbone=keras_hub.models.GemmaBackbone.from_preset(
                    preset, dtype="bfloat16"
                ),
                preprocessor=keras_hub.models.GemmaCausalLMPreprocessor.from_preset(
                    preset
                ),
            )
        elif "meta-llama/Llama-3.1-8B-Instruct" in preset:
            model = keras_hub.models.Llama3CausalLM(
                backbone=keras_hub.models.Llama3Backbone.from_preset(
                    preset, dtype="bfloat16"
                ),
                preprocessor=keras_hub.models.Llama3CausalLMPreprocessor.from_preset(
                    preset
                ),
            )
        else:
            model = keras_hub.models.CausalLM.from_preset(
                preset, dtype="bfloat16"
            )

    log_applied_layout_map(model)
    return model