pandora-s committed on
Commit 522e174
1 Parent(s): 183e11b

Create app.py

Files changed (1): app.py (+161, -0)
app.py ADDED
@@ -0,0 +1,161 @@
+ ## Due to a small bug when installing exllamav2 directly from the dev branch, the CUDA paths need to be set up first
+ import cuda_bug
+ cuda_bug.install_cuda_toolkit_requirements()
+ ##
+
+ import gradio as gr
+ from gradio.data_classes import FileData
+ from huggingface_hub import snapshot_download
+ from pathlib import Path
+ import base64
+ import spaces
+ import os
+ import sys
+
+ import torch
+
+ from exllamav2 import (
+     ExLlamaV2,
+     ExLlamaV2Config,
+     ExLlamaV2Cache,
+     ExLlamaV2Tokenizer,
+     ExLlamaV2VisionTower,
+ )
+
+ from exllamav2.generator import (
+     ExLlamaV2DynamicGenerator,
+     ExLlamaV2Sampler,
+ )
+
+ from PIL import Image
+ import requests
+
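+ # EXL2 quantization levels published as revisions of turboderp/pixtral-12b-exl2; each revision holds one quant of the weights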
+ default_bpw = "4.0bpw"
+ available_models = [
+     "2.5bpw",
+     "3.0bpw",
+     "3.5bpw",
+     "4.0bpw",
+     "4.5bpw",
+     "5.0bpw",
+     "6.0bpw",
+     "8.0bpw",
+ ]
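+ # Download (or reuse from the local HF cache) every quant up front, so switching quants in the dropdown does not trigger a download at chat time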
+ dirs = {}
+ for model in available_models:
+     dirs.update({model: snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)})
+
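+ # ZeroGPU: spaces.GPU attaches a GPU for roughly 45 seconds per call, so all model loading happens inside the handler, once the GPU is actually available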
+ @spaces.GPU(duration=45)
+ def run_inference(message, history, model_picked):
+     local_dir = dirs[model_picked]
+     print(message)
+     print(history)
+     # Loading only once the GPU is available
+     config = ExLlamaV2Config(local_dir)
+     config.max_seq_len = 16384
+
+     vision_model = ExLlamaV2VisionTower(config)
+     vision_model.load(progress = True)
+
+     model = ExLlamaV2(config)
+     cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
+     model.load_autosplit(cache, progress = True)
+     tokenizer = ExLlamaV2Tokenizer(config)
+
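+     # The dynamic generator ties the model, cache and tokenizer together and drives token generation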
+     generator = ExLlamaV2DynamicGenerator(
+         model = model,
+         cache = cache,
+         tokenizer = tokenizer
+     )
+
+     # Building the prompt template from the chat history
+     prompt = ""
+     image_prompt = ""
+     images_embeddings = []
+     for couple in history:
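+         # History entries are either file uploads (couple[0] is a tuple of image paths) or text turns (couple[0] is the user text, couple[1] the assistant reply)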
+         if type(couple[0]) is tuple:
+             images_embeddings += [
+                 vision_model.get_image_embeddings(
+                     model = model,
+                     tokenizer = tokenizer,
+                     image = img,
+                     text_alias = alias,
+                 )
+                 for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings)+i+1) + "}}", Image.open(path)) for i, path in enumerate(couple[0])]
+             ]
+             image_prompt = ""
+             for i in range(len(couple[0])):
+                 image_prompt += "{{IMAGE_" + str(len(images_embeddings)-len(couple[0])+i+1) + "}}"
+         elif couple[0]:
+             prompt += "[INST]" + image_prompt + couple[0] + "[/INST]"
+             prompt += couple[1] + "</s>"
+
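+     # The current message: a dict with "text" and "files" when images are attached, otherwise a plain string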
+     if type(message) is dict:
+         images_embeddings += [
+             vision_model.get_image_embeddings(
+                 model = model,
+                 tokenizer = tokenizer,
+                 image = img,
+                 text_alias = alias,
+             )
+             for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings)+i+1) + "}}", Image.open(path['path'] if type(path) is dict else path)) for i, path in enumerate(message['files'])]
+         ]
+         image_prompt = ""
+         for i in range(len(message['files'])):
+             image_prompt += "{{IMAGE_" + str(len(images_embeddings)-len(message['files'])+i+1) + "}}"
+         prompt += "[INST]" + image_prompt + message["text"] + "[/INST]"
+     else:
+         prompt += "[INST]" + image_prompt + message + "[/INST]"
+
+     print(prompt)
+
+     # Generating the response
+     for out in generator.generate(
+         prompt = prompt,
+         max_new_tokens = 1024,
+         temperature = 0.15,
+         add_bos = True,
+         encode_special_tokens = True,
+         decode_special_tokens = True,
+         stop_conditions = [tokenizer.eos_token_id],
+         gen_settings = ExLlamaV2Sampler.Settings.greedy(),
+         embeddings = images_embeddings,
+         stream = True
+     ):
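+         # Keep only the text after the last [/INST] tag so the echoed prompt is not shown in the chat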
+         if "[/INST]" in out:
+             result = out.split("[/INST]")[-1]
+         else:
+             result = out
+         print(result)
+         yield result
+
+ description = """
+ A demo chat interface for the Pixtral 12B EXL2 quants, served with **ExLlamaV2**!
+
+ The model is loaded once a GPU becomes available. By default, this Space loads Pixtral at 4.0bpw from [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2); other quantization levels are also available.
+
+ The Space runs the dev branch of ExLlamaV2, not master: [ExLlamaV2 (dev)](https://github.com/turboderp/exllamav2/tree/dev).
+
+ At **4.0bpw with a 16k context, the model fits in less than 12 GB of VRAM**!
+
+ The current settings are:
+ - Context size: 16k tokens
+ - Max output: 1024 tokens
+ - Temperature: 0.15
+
+ You can select other quants and experiment!
+
+ Thanks, turboderp!
+ """
+ examples = [
+     [
+         {"text": "What are the similarities and differences between these two experiments?", "files": ["test_image_1.jpg", "test_image_2.jpg"]},
+     ]
+ ]
+
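+ # The quant dropdown is passed as an additional input, so its value reaches run_inference as model_picked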
+ drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)
+ demo = gr.ChatInterface(fn=run_inference, examples=examples, title="Pixtral 12B EXL2", multimodal=True, description=description, additional_inputs=drop)
+ demo.queue().launch()