pandora-s committed on
Commit 522e174
1 Parent(s): 183e11b

Create app.py

Files changed (1): app.py (+161, -0)
app.py ADDED
@@ -0,0 +1,161 @@
+ ## Due to a small bug when installing exllamav2 directly from the dev branch, the CUDA paths need to be set up first
+ import cuda_bug
+ cuda_bug.install_cuda_toolkit_requirements()
+ ##
+
+ import gradio as gr
+ from gradio.data_classes import FileData
+ from huggingface_hub import snapshot_download
+ from pathlib import Path
+ import base64
+ import spaces
+ import os
+ import sys
+
+ import torch
+
+ from exllamav2 import (
+     ExLlamaV2,
+     ExLlamaV2Config,
+     ExLlamaV2Cache,
+     ExLlamaV2Tokenizer,
+     ExLlamaV2VisionTower,
+ )
+
+ from exllamav2.generator import (
+     ExLlamaV2DynamicGenerator,
+     ExLlamaV2Sampler,
+ )
+
+ from PIL import Image
+ import requests
+
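+ # EXL2 quantization levels published as revisions of turboderp/pixtral-12b-exl2; each revision holds one quant of the weights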
+ default_bpw = "4.0bpw"
+ available_models = [
+     "2.5bpw",
+     "3.0bpw",
+     "3.5bpw",
+     "4.0bpw",
+     "4.5bpw",
+     "5.0bpw",
+     "6.0bpw",
+     "8.0bpw",
+ ]
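+ # Download (or reuse from the local HF cache) every quant up front, so switching quants in the dropdown does not trigger a download at chat time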
+ dirs = {}
+ for model in available_models:
+     dirs.update({model: snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)})
+
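+ # ZeroGPU: spaces.GPU attaches a GPU for roughly 45 seconds per call, so all model loading happens inside the handler, once the GPU is actually available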
+ @spaces.GPU(duration=45)
+ def run_inference(message, history, model_picked):
+     local_dir = dirs[model_picked]
+     print(message)
+     print(history)
+     # Loading only once the GPU is available
+     config = ExLlamaV2Config(local_dir)
+     config.max_seq_len = 16384
+
+     vision_model = ExLlamaV2VisionTower(config)
+     vision_model.load(progress = True)
+
+     model = ExLlamaV2(config)
+     cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
+     model.load_autosplit(cache, progress = True)
+     tokenizer = ExLlamaV2Tokenizer(config)
+
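+     # The dynamic generator ties the model, cache and tokenizer together and drives token generation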
+     generator = ExLlamaV2DynamicGenerator(
+         model = model,
+         cache = cache,
+         tokenizer = tokenizer
+     )
+
+     # Building the prompt template from the chat history
+     prompt = ""
+     image_prompt = ""
+     images_embeddings = []
+     for couple in history:
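+         # History entries are either file uploads (couple[0] is a tuple of image paths) or text turns (couple[0] is the user text, couple[1] the assistant reply)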
+         if type(couple[0]) is tuple:
+             images_embeddings += [
+                 vision_model.get_image_embeddings(
+                     model = model,
+                     tokenizer = tokenizer,
+                     image = img,
+                     text_alias = alias,
+                 )
+                 for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings)+i+1) + "}}", Image.open(path)) for i, path in enumerate(couple[0])]
+             ]
+             image_prompt = ""
+             for i in range(len(couple[0])):
+                 image_prompt += "{{IMAGE_" + str(len(images_embeddings)-len(couple[0])+i+1) + "}}"
+         elif couple[0]:
+             prompt += "[INST]" + image_prompt + couple[0] + "[/INST]"
+             prompt += couple[1] + "</s>"
+
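+     # The current message: a dict with "text" and "files" when images are attached, otherwise a plain string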
+     if type(message) is dict:
+         images_embeddings += [
+             vision_model.get_image_embeddings(
+                 model = model,
+                 tokenizer = tokenizer,
+                 image = img,
+                 text_alias = alias,
+             )
+             for (alias, img) in [("{{IMAGE_" + str(len(images_embeddings)+i+1) + "}}", Image.open(path['path'] if type(path) is dict else path)) for i, path in enumerate(message['files'])]
+         ]
+         image_prompt = ""
+         for i in range(len(message['files'])):
+             image_prompt += "{{IMAGE_" + str(len(images_embeddings)-len(message['files'])+i+1) + "}}"
+         prompt += "[INST]" + image_prompt + message["text"] + "[/INST]"
+     else:
+         prompt += "[INST]" + image_prompt + message + "[/INST]"
+
+     print(prompt)
+
+     # Generating the response
+     for out in generator.generate(
+         prompt = prompt,
+         max_new_tokens = 1024,
+         temperature = 0.15,
+         add_bos = True,
+         encode_special_tokens = True,
+         decode_special_tokens = True,
+         stop_conditions = [tokenizer.eos_token_id],
+         gen_settings = ExLlamaV2Sampler.Settings.greedy(),
+         embeddings = images_embeddings,
+         stream = True
+     ):
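+         # Keep only the text after the last [/INST] tag so the echoed prompt is not shown in the chat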
+         if "[/INST]" in out:
+             result = out.split("[/INST]")[-1]
+         else:
+             result = out
+         print(result)
+         yield result
+
+ description = """
+ A demo chat interface for the Pixtral 12B EXL2 quants, served with **ExLlamaV2**!
+
+ The model is loaded once a GPU becomes available. By default, this Space loads Pixtral at 4.0bpw from [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2); other quantization levels are also available.
+
+ The Space runs the dev branch of ExLlamaV2, not master: [ExLlamaV2 (dev)](https://github.com/turboderp/exllamav2/tree/dev).
+
+ At **4.0bpw with a 16k context, the model fits in less than 12 GB of VRAM**!
+
+ The current settings are:
+ - Context size: 16k tokens
+ - Max output: 1024 tokens
+ - Temperature: 0.15
+
+ You can select other quants and experiment!
+
+ Thanks, turboderp!
+ """
+ examples = [
+     [
+         {"text": "What are the similarities and differences between these two experiments?", "files": ["test_image_1.jpg", "test_image_2.jpg"]},
+     ]
+ ]
+
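+ # The quant dropdown is passed as an additional input, so its value reaches run_inference as model_picked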
+ drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)
+ demo = gr.ChatInterface(fn=run_inference, examples=examples, title="Pixtral 12B EXL2", multimodal=True, description=description, additional_inputs=drop)
+ demo.queue().launch()