## Workaround: installing exllamav2 directly from the dev branch currently has a
## small bug, so the CUDA toolkit paths have to be set up first
import cuda_bug
cuda_bug.install_cuda_toolkit_requirements()
##
import gradio as gr
from gradio.data_classes import FileData
from huggingface_hub import snapshot_download
from pathlib import Path
import base64
import spaces
import os
import sys
import torch
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)
from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2Sampler,
)
from PIL import Image
import requests
from tqdm import tqdm
default_max_context = 16384
default_max_output = 512
default_bpw = "4.0bpw"

available_models = [
    "2.5bpw",
    "3.0bpw",
    "3.5bpw",
    "4.0bpw",
    "4.5bpw",
    "5.0bpw",
    "6.0bpw",
    "8.0bpw"
]
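
# Pre-download every quant of turboderp/pixtral-12b-exl2 at startup so switching
# quants in the UI never has to wait on a download during inference.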
dirs = {}
for model in tqdm(available_models):
    dirs.update({model: snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)})
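
# Assumption: on ZeroGPU the inference entry point is decorated with @spaces.GPU
# so a GPU is attached for the duration of each call.
@spaces.GPU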
def run_inference(message, history, model_picked, context_size, max_output):
    if not model_picked:
        model_picked = default_bpw
    if not context_size:
        context_size = default_max_context
    if not max_output:
        max_output = default_max_output
    local_dir = dirs[model_picked]

    # Load the selected quant; on ZeroGPU this only runs once a GPU is attached
    config = ExLlamaV2Config(local_dir)
    config.max_seq_len = context_size

    vision_model = ExLlamaV2VisionTower(config)
    vision_model.load(progress = True)

    # The cache is lazy: it is allocated while load_autosplit spreads the weights
    # across the available GPU(s)
    model = ExLlamaV2(config)
    cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = context_size)
    model.load_autosplit(cache, progress = True)
    tokenizer = ExLlamaV2Tokenizer(config)

    generator = ExLlamaV2DynamicGenerator(
        model = model,
        cache = cache,
        tokenizer = tokenizer
    )
    # Build the prompt from the chat history
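    # Pixtral follows the Mistral instruct template: each user turn is wrapped in
    # [INST] ... [/INST], the assistant reply follows and ends with </s>. Images are
    # injected through {{IMAGE_n}} aliases that map to the vision-tower embeddings.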
    prompt = ""
    image_prompt = ""
    images_embeddings = []
    for couple in history:
        if type(couple[0]) is tuple:
            # An image-upload turn: embed each file and assign it a {{IMAGE_n}} alias
            images_embeddings += [
                vision_model.get_image_embeddings(
                    model = model,
                    tokenizer = tokenizer,
                    image = img,
                    text_alias = alias,
                )
                for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings)+i+1) + "}}", Image.open(path)) for i, path in enumerate(couple[0])]
            ]
            image_prompt = ""
            for i in range(len(couple[0])):
                image_prompt += "{{IMAGE_" + str(len(images_embeddings)-len(couple[0])+i+1) + "}}"
        elif couple[0]:
            # A text turn: prepend any pending image aliases, then append the reply
            prompt += "[INST]" + image_prompt + couple[0] + "[/INST]"
            prompt += couple[1] + "</s>"
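
    # Append the current turn: with multimodal=True gradio passes a dict holding
    # "text" and the list of uploaded "files"; the else branch covers a plain string.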
    if type(message) is dict:
        images_embeddings += [
            vision_model.get_image_embeddings(
                model = model,
                tokenizer = tokenizer,
                image = img,
                text_alias = alias,
            )
            for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings)+i+1) + "}}", Image.open(path['path'] if type(path) is dict else path)) for i, path in enumerate(message['files'])]
        ]
        image_prompt = ""
        for i in range(len(message['files'])):
            image_prompt += "{{IMAGE_" + str(len(images_embeddings)-len(message['files'])+i+1) + "}}"
        prompt += "[INST]" + image_prompt + message["text"] + "[/INST]"
    else:
        prompt += "[INST]" + image_prompt + message + "[/INST]"
    print(prompt)

    # Generate the response
    output = generator.generate(
        prompt = prompt,
        max_new_tokens = max_output,
        add_bos = True,
        encode_special_tokens = True,
        decode_special_tokens = True,
        stop_conditions = [tokenizer.eos_token_id],
        gen_settings = ExLlamaV2Sampler.Settings.greedy(),
        embeddings = images_embeddings
    )
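
    # Greedy sampling with EOS as the stop condition; the reply is whatever follows
    # the final [/INST] tag in the decoded output.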
    result = output.split("[/INST]")[-1]
    print(result)
    return result
description = """A demo chat interface for the Pixtral 12B EXL2 quants, served with **ExllamaV2**!
The model is loaded once the GPU becomes available. By default this Space loads Pixtral at 4.0bpw from the following repository: [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2); other quantizations can be selected below.
The running version of ExllamaV2 is the dev branch, not master: [ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev).
At **4.0bpw and 16k context the model fits in less than 12GB of VRAM**, and at **2.5bpw with a short context it can potentially fit in 8GB of VRAM**!
The current default settings are:
- Model quant: 4.0bpw
- Context size: 16k tokens
- Max output: 512 tokens
You can select other quants and experiment!
Thanks, turboderp!"""
examples = [
    [
        {"text": "What are the similarities and differences between these two experiments?", "files": ["test_image_1.jpg", "test_image_2.jpg"]},
    ]
]
drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)
context_size_gradio = gr.Slider(minimum = 256, maximum = 32768, label="Context Size", value=default_max_context, step = 1)
output_length_gradio = gr.Slider(minimum = 1, maximum = 4096, label="Max Output Length", value=default_max_output, step = 1)
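
# gr.ChatInterface passes additional_inputs to run_inference after (message, history),
# in this order: model quant, context size, max output length.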
demo = gr.ChatInterface(fn=run_inference, examples = examples, title="Pixtral 12B EXL2", multimodal=True, description=description, additional_inputs = [drop, context_size_gradio, output_length_gradio])
demo.queue().launch()