anilbhatt1 committed on
Commit
6acc3f7
1 Parent(s): d016b5d

Added app.py & requirements.txt

Files changed (2)
  1. app.py +218 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,218 @@
+ import os
+ import gc
+ import json
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ import re
+ import random
+ import numpy as np
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoProcessor
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
+ import peft
+ from peft import LoraConfig
+ from peft import PeftModel
+ import whisperx
+ import requests
+ from io import BytesIO
+
+ # Check if CUDA is available
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     print("Using GPU:", torch.cuda.get_device_name(0))  # Print GPU name
+ else:
+     device = torch.device("cpu")
+     print("Using CPU")
+
+
+ model_name = "microsoft/phi-2"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
+ tokenizer.pad_token = tokenizer.eos_token
+ # The bos token id is reused as the pad/eos id for generation further below
+ bos_token_id = tokenizer.bos_token_id
+ pad_token_id = tokenizer.bos_token_id
+ eos_token_id = tokenizer.bos_token_id
+ eoc_string = 'caption image:'
+ eoc_tokens = tokenizer.encode(eoc_string)
+ eoq_string = 'end of question:'
+ eoq_tokens = tokenizer.encode(eoq_string)
+
+ model_name = "microsoft/phi-2"
+ base_model = AutoModelForCausalLM.from_pretrained(model_name,
+                                                   low_cpu_mem_usage=True,
+                                                   return_dict=True,
+                                                   torch_dtype=torch.float16,
+                                                   trust_remote_code=True).to(device)
+ base_model.resize_token_embeddings(len(tokenizer))
+
+ user = "anilbhatt1"  # put your user name here
+ model_name = "phi2-proj-offset-peft-model"
+ model_id = f"{user}/{model_name}"
+
+ # Merge the PEFT model (trained adapters) downloaded from HF into the base phi-2 model
+ merged_phi2 = peft.PeftModel.from_pretrained(base_model, model_id)
+
+ vision_model_name = 'openai/clip-vit-base-patch32'  # patch embeddings: torch.Size([1, 49, 768])
+ clip_patches = 49
+ clip_processor = CLIPImageProcessor.from_pretrained(vision_model_name)
+ clip_model = CLIPVisionModel.from_pretrained(vision_model_name).to(device)
+
+ # Residual MLP block applied on top of the projected CLIP embeddings
+ class ClipProjectionBlock(nn.Module):
+     def __init__(self, channels):
+         super().__init__()
+         self.pre_norm = nn.LayerNorm(channels)
+
+         self.proj = nn.Sequential(
+             nn.Linear(channels, channels),
+             nn.GELU(),
+             nn.Linear(channels, channels)
+         )
+     def forward(self, x):
+         x = self.pre_norm(x)
+         return x + self.proj(x)
+
+ class Phi2ProjModel(nn.Module):
+     def __init__(self, clip_model, clip_processor, proj_model, phi2_model, clip_embed_dim=768, phi2_dim=2560):
+         super(Phi2ProjModel, self).__init__()
+         self.clip_embed_dim = clip_embed_dim
+         self.phi2_dim = phi2_dim
+         self.proj_lin_layer = nn.Linear(clip_embed_dim, phi2_dim)
+         self.clip_model = clip_model
+         self.clip_processor = clip_processor
+         self.proj_model = proj_model
+         self.phi2_model = phi2_model
+
+     def forward(self, input_embed):
+         max_len = 100
+         output = self.phi2_model.generate(inputs_embeds=input_embed,
+                                           max_new_tokens=max_len,
+                                           return_dict_in_generate=True,
+                                           bos_token_id=bos_token_id,
+                                           pad_token_id=bos_token_id,
+                                           eos_token_id=bos_token_id)
+
+         return output
+
+ projection_layer = ClipProjectionBlock(2560).to(device)
+
+ phi2_proj_model = Phi2ProjModel(clip_model, clip_processor, projection_layer, merged_phi2).to(device)
+
+ # Load the trained projection weights stored in the repo root
+ phi2_proj_model.proj_lin_layer.load_state_dict(torch.load('./phi2_proj_model_offset_ll.pth'))
+ phi2_proj_model.proj_model.load_state_dict(torch.load('./phi2_proj_model_offset_projmodel.pth'))
+
+ # whisperX ASR model for transcribing audio queries (assumes a CUDA device)
+ audio_model = whisperx.load_model("small", "cuda", compute_type="float16")
+
+ def prepare_input_embed(img=None, audio=None, text=None):
+
+     input_embed_exists = 0
+
+     inputs_given = []
+
+     if img is not None:
+         inputs = clip_processor(images=img, return_tensors="pt").to(device)
+         clip_output = clip_model(**inputs, output_hidden_states=True)            # B, 50, 768
+         clip_embeddings = clip_output.last_hidden_state[:, 1:, :]                # B, 49, 768 -> drop the CLS token
+         image_embed = phi2_proj_model.proj_lin_layer(clip_embeddings)            # B, 49, 2560
+         image_embed = phi2_proj_model.proj_model(image_embed)                    # B, 49, 2560
+         B, _, C = image_embed.shape
+
+         eoc_tkn_tensor = torch.tensor(eoc_tokens, dtype=torch.int64).to(device)  # end-of-image marker tokens ('caption image:')
+         eoc_tensor = eoc_tkn_tensor.repeat(B, 1)                                 # [B, num_eoc_tokens]
+         eoc_embed = phi2_proj_model.phi2_model.base_model.model.model.embed_tokens(eoc_tensor)  # B, num_eoc_tokens, 2560
+
+         input_image_embed = torch.cat([image_embed, eoc_embed], dim=1)  # append the marker embeddings to signal end of image
+         input_image_embed = input_image_embed.to(dtype=torch.float16)
+
+     if audio is not None:
+         audio_tkn_tensor = torch.tensor(audio, dtype=torch.int64).to(device)  # token ids of the transcribed audio query
+         audio_tkn_tensor = audio_tkn_tensor.unsqueeze(0)
+         audio_embed = phi2_proj_model.phi2_model.base_model.model.model.embed_tokens(audio_tkn_tensor)
+
+     if text is not None:
+         text_tkn_tensor = torch.tensor(text, dtype=torch.int64).to(device)  # token ids of the text query
+         text_tkn_tensor = text_tkn_tensor.unsqueeze(0)
+         text_embed = phi2_proj_model.phi2_model.base_model.model.model.embed_tokens(text_tkn_tensor)
+
+     # If an image is present, it takes the first place in input_embed
+     if img is not None:
+         input_embed = input_image_embed
+         input_embed_exists = 1
+
+     if audio is not None:
+         # If input_embed already exists, an image was present, so append audio_embed to it
+         if input_embed_exists:
+             input_embed = torch.cat([input_embed, audio_embed], dim=1)
+         # Otherwise there is no image, so audio_embed becomes the input_embed
+         else:
+             input_embed = audio_embed
+             input_embed_exists = 1
+         inputs_given.append(audio)
+
+     if text:
+         # If input_embed already exists, image and/or audio are present, so append text_embed to it
+         if input_embed_exists:
+             input_embed = torch.cat([input_embed, text_embed], dim=1)
+         # Otherwise neither image nor audio is there, so text_embed becomes the input_embed
+         else:
+             input_embed = text_embed
+             input_embed_exists = 1
+         inputs_given.append(text)
+
+     inputs_given.append(eoq_tokens)
+
+     eoq_tkn_tensor = torch.tensor(eoq_tokens, dtype=torch.int64).to(device)  # end-of-question marker tokens ('end of question:')
+     B = 1
+     eoq_tensor = eoq_tkn_tensor.repeat(B, 1)                                 # [B, num_eoq_tokens]
+     eoq_embed = phi2_proj_model.phi2_model.base_model.model.model.embed_tokens(eoq_tensor)  # B, num_eoq_tokens, 2560
+     input_embed = torch.cat([input_embed, eoq_embed], dim=1)
+
+     return input_embed
+
+ def gradio_get_answers_fn(image=None, audio=None, text=None):
+     audio_tokens = None
+     text_tokens = None
+     if audio:
+         # Transcribe the audio query with whisperX and tokenize the resulting text
+         audio_result = audio_model.transcribe(audio)
+         audio_text = ''
+         for seg in audio_result['segments']:
+             audio_text += seg['text']
+         audio_text = audio_text.strip()
+         audio_tokens = tokenizer.encode(audio_text)
+
+     if text:
+         text_tokens = tokenizer.encode(text)
+
+     if image or audio or text:
+         input_embed = prepare_input_embed(image, audio_tokens, text_tokens)
+         with torch.no_grad():
+             output = phi2_proj_model(input_embed)
+         out_text = tokenizer.batch_decode(output.sequences[:, 1:])[0]
+         out_text = out_text.replace("<|endoftext|>", "")
+     else:
+         out_text = "I didn't get any input. Give me an image, audio or text (or any combination of the three) and get the answer back!"
+
+     return out_text
+
+ import gradio as gr
+
+ markdown_description = """
+ - Jñāna is a multimodal LLM app that accepts image, text or audio as input
+ - Based on the input, you can query the app for more details
+ - Uses a **microsoft/phi-2 qlora** optimized model fine-tuned on the **instruct150k** dataset
+ - Uses the **whisperX** model for audio
+ """
+ demo = gr.Interface(fn=gradio_get_answers_fn,
+                     inputs=[
+                         gr.Image(type="pil", label="Image"),
+                         gr.Audio(label="Audio Query", sources=['microphone', 'upload'], type='filepath'),
+                         gr.Textbox(info="How may I help you? Please enter your prompt here...", label="Text Query")
+                     ],
+                     outputs=gr.Textbox(label="Response"),
+                     title="Jñāna - Phi2 Multimodal Conversation Agent",
+                     description=markdown_description,
+                     article=" **Credits** : https://theschoolof.ai/ || https://github.com/mshumer/gpt-llm-trainer || https://github.com/huggingface/peft/tree/main/examples/multilayer_perceptron ")
+
+ demo.queue().launch(share=True)
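
Aside: once the weights are in place, the handler above can be exercised without the Gradio UI. A minimal sketch, assuming the app.py from this commit sits next to the two .pth files and is importable as a module named app (importing it runs the model setup at module level); the module name and the prompt are illustrative only:

    import app  # module-level code loads phi-2, the adapters, CLIP and whisperX

    # Text-only path: image and audio default to None
    reply = app.gradio_get_answers_fn(image=None, audio=None, text="What can you tell me about CLIP?")
    print(reply)
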
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ numpy
+ torch
+ pandas
+ torchvision
+ pillow
+ git+https://github.com/huggingface/transformers
+ git+https://github.com/m-bain/whisperx.git
+ git+https://github.com/huggingface/peft.git
+ einops
+ accelerate
+ bitsandbytes