Spaces:
Runtime error
Runtime error
import dataclasses | |
from copy import deepcopy | |
from types import SimpleNamespace | |
from typing import List, Union, Dict, Tuple | |
import numpy as np | |
import torch | |
from PIL import Image | |
from torch import nn, Tensor | |
from transformers import StoppingCriteria, StoppingCriteriaList | |
from eval_scripts.eval_utils import load_image, load_audio | |
from imagebind.models.image_bind import ModalityType | |
from bubogpt import BaseProcessor | |
# Speaker labels used as the ``role`` of every conversation message.
Roles = SimpleNamespace(**{
    "HUMAN": "Human",
    "ASSISTANT": "Assistant",
})
class Message:
    """One turn of a conversation: the speaker's role and the text spoken."""

    def __init__(self, role: str, content: Union[str, None]):
        # ``content`` may be None (the annotation allows it), e.g. for a turn
        # whose text has not been produced yet.
        self.role, self.content = role, content
@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    The class is a dataclass so that it can be built with keyword arguments
    (``Conversation(system=..., messages=..., sep=...)``), which is how
    ``copy`` and the module-level template construct it.

    Attributes:
        system: System prompt placed at the very start of the rendered prompt.
        messages: Ordered turns; each item exposes ``role`` and ``content``.
        sep: Separator appended after the system prompt and after every
            message that has content.
    """

    system: str
    messages: List["Message"]
    sep: str = "###"

    def get_prompt(self):
        """Render the system prompt and all messages as a single string.

        A message with falsy content (``None`` or ``""``) is rendered as a
        bare ``"<role>:"`` with no separator, leaving the prompt open for the
        model to complete that turn.
        """
        parts = [self.system + self.sep]
        for message in self.messages:
            if message.content:
                parts.append(message.role + ": " + message.content + self.sep)
            else:
                parts.append(message.role + ":")
        return "".join(parts)

    def append_message(self, role, content):
        """Append a new ``Message(role, content)`` to the history."""
        self.messages.append(Message(role, content))

    def copy(self):
        """Return a new conversation whose message list is deep-copied."""
        return Conversation(
            system=self.system,
            messages=deepcopy(self.messages),
            sep=self.sep)

    def dict(self):
        """Return a plain-dict view with messages as ``(role, content)`` tuples."""
        return {
            "system": self.system,
            "messages": [(msg.role, msg.content) for msg in self.messages],
            "sep": self.sep
        }
class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation once the sequence ends with one of the stop sequences.

    Args:
        stops: Iterable of 1-D token-id tensors; generation stops when the
            tail of the first sequence in the batch equals any of them.
            Defaults to no stop sequences.
        encounters: Accepted for call compatibility; not used.
    """

    def __init__(self, stops=None, encounters=1):
        super().__init__()
        # Avoid a shared mutable default; keep the caller's object when given.
        self.stops = [] if stops is None else stops

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        """Return True when the first sequence of ``input_ids`` ends with a stop.

        Only ``input_ids[0]`` is inspected. Extra keyword arguments are
        accepted and ignored so the criterion can be invoked with the
        ``(input_ids, scores, **kwargs)`` calling convention.
        """
        sequence = input_ids[0]
        for stop in self.stops:
            stop_len = len(stop)
            # An empty stop, or one longer than what has been generated so
            # far, cannot match; skipping it also avoids comparing tensors
            # of different lengths.
            if not 0 < stop_len <= sequence.shape[0]:
                continue
            if torch.all(stop == sequence[-stop_len:]).item():
                return True
        return False
# Default conversation template: an empty history with a system prompt that
# announces the <Vision>/<Audio> content tags.
CONV_X = Conversation(
    messages=[],
    sep="###",
    # Earlier wording of the system prompt, kept for reference:
    # system="Give the following ..."
    #        "You will be able to ... once I provide it to you. Please answer my questions.",
    system=(
        "Give the following image: <Vision>ImageContent</Vision> or audio: <Audio>AudioContent</Audio>. "
        "You will be able to see the image/audio once I provide it to you. Please answer my questions."
    ),
)
# TODO: If needed and possible, rewrite this file and re-organize the definition of components. | |
class DummyChat:
    """Stand-in for ``Chat`` that needs no model.

    It either returns a fixed answer or echoes the content of the last
    message in the conversation.

    Args:
        dummy_answer: Text returned by every ``answer`` call; when ``None``
            the last message's content is echoed instead.
    """

    def __init__(self, dummy_answer=None, *args, **kwargs):
        # Extra arguments are accepted and ignored.
        self.dummy_answer = dummy_answer

    def ask(self, text, conversation):
        """Record ``text`` as a new human message."""
        conversation.append_message(Roles.HUMAN, text)

    def answer(self, *args, **kwargs):
        """Return ``(answer_text, None)``.

        The conversation may be passed as the ``conversation`` keyword or as
        the first positional argument (mirroring ``Chat.answer``).

        Raises:
            KeyError: If no fixed answer is configured and no conversation
                was supplied.
        """
        if self.dummy_answer is not None:
            return self.dummy_answer, None

        print(kwargs)
        if "conversation" in kwargs:
            conversation = kwargs["conversation"]
        elif args:
            # Positional call style: the conversation comes first.
            conversation = args[0]
        else:
            raise KeyError("conversation")
        return conversation.messages[-1].content, None

    def upload_img(self, *args, **kwargs):
        """No-op: the dummy chat ignores image uploads."""
        pass

    def upload_aud(self, *args, **kwargs):
        """No-op: the dummy chat ignores audio uploads."""
        pass
class Chat:
    """Chat front-end that feeds text plus image/audio embeddings to the model.

    The conversation text contains ``<ModalityHere>`` placeholders; each
    upload appends one embedding to the caller-owned ``emb_list`` and one
    placeholder to the conversation, and ``get_context_emb`` splices the two
    together before generation.

    Args:
        model: Model exposing ``llama_model``, ``llama_tokenizer`` and either
            ``encode_img`` or ``encode_inputs``.
        processors: Per-modality pre-processors, keyed by ``ModalityType``.
        device: Device on which inputs and stop-token tensors are placed.
    """

    def __init__(self,
                 model: nn.Module,
                 processors: Dict[str, BaseProcessor],
                 device: str = 'cuda:0'
                 ):
        self.device = device
        self.model = model
        self.processors = processors
        # '###' can be encoded in two different ways.
        stop_words_ids = [torch.tensor([835]).to(self.device),
                          torch.tensor([2277, 29937]).to(self.device)]
        self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
        # True right after an upload, so the next question is merged into the
        # placeholder message instead of starting a new one.
        self.just_uploaded = False

    def ask(self, text, conversation):
        """Add the user's ``text`` to ``conversation``.

        If a modality was just uploaded, the text is appended (space
        separated) to the placeholder message created by the upload;
        otherwise a new human message is appended.
        """
        # NOTE: the merge decision relies on ``just_uploaded`` rather than on
        # a hard-coded '</Vision>' postfix check of the last message.
        if self.just_uploaded and conversation.messages:
            last = conversation.messages[-1]
            last.content = ' '.join([last.content, text])
        else:
            # Also covers a stale flag combined with a fresh, empty conversation.
            conversation.append_message(Roles.HUMAN, text)
        self.just_uploaded = False

    def answer(self, conversation, emb_list, max_new_tokens=300, num_beams=1, min_length=1, top_p=0.9,
               repetition_penalty=1.0, length_penalty=1, temperature=1.0, max_length=2000):
        """Generate the assistant's reply and store it in ``conversation``.

        Args:
            conversation: Conversation to answer; an assistant message is
                appended and filled in with the generated text.
            emb_list: Modality embeddings, one per ``<ModalityHere>``
                placeholder in the prompt.
            max_new_tokens, num_beams, min_length, top_p, repetition_penalty,
            length_penalty, temperature: Forwarded to
                ``self.model.llama_model.generate`` (sampling is always on).
            max_length: Budget for context length plus ``max_new_tokens``;
                the oldest context positions are dropped when exceeded.

        Returns:
            ``(output_text, output_token)`` where ``output_token`` is the
            generated token ids as a NumPy array.
        """
        # Generate an answer written by LLaMA
        conversation.append_message(Roles.ASSISTANT, None)
        embs = self.get_context_emb(conversation, emb_list)

        current_max_len = embs.shape[1] + max_new_tokens
        if current_max_len - max_length > 0:
            print('Warning: The number of tokens in current conversation exceeds the max length. '
                  'The model will not see the contexts outside the range.')
        # Keep only the most recent positions that fit in the budget.
        begin_idx = max(0, current_max_len - max_length)
        embs = embs[:, begin_idx:]

        outputs = self.model.llama_model.generate(
            inputs_embeds=embs,
            max_new_tokens=max_new_tokens,
            stopping_criteria=self.stopping_criteria,
            num_beams=num_beams,
            do_sample=True,
            min_length=min_length,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            temperature=temperature,
        )
        output_token = outputs[0]
        # The model might output an unknown token <unk> (id 0) and/or a start
        # token <s> (id 1) at the beginning; remove each at most once, in that
        # order. The length check avoids indexing an empty tensor.
        for special_id in (0, 1):
            if len(output_token) > 0 and output_token[0] == special_id:
                output_token = output_token[1:]
        output_text = self.model.llama_tokenizer.decode(output_token, add_special_tokens=False)
        output_text = output_text.split('###')[0]  # remove the stop sign '###'
        output_text = output_text.split('Assistant:')[-1].strip()
        conversation.messages[-1].content = output_text
        return output_text, output_token.cpu().numpy()

    def upload_img(self, image: Union[str, Image.Image, Tensor], conversation: Conversation, emb_list: List[Tensor]):
        """Encode ``image``, append its embedding to ``emb_list`` and add a
        ``<Vision>`` placeholder message from the human to ``conversation``."""
        image = load_image(image, self.processors[ModalityType.VISION]).to(self.device)
        if hasattr(self.model, "encode_img"):
            # For compatibility with minigpt4.
            image_emb, _ = self.model.encode_img(image)
        else:
            all_embeddings = self.model.encode_inputs({ModalityType.VISION: image})
            image_emb = all_embeddings[ModalityType.VISION]
        emb_list.append(image_emb)
        conversation.append_message(Roles.HUMAN, "<Vision><ModalityHere></Vision>")
        self.just_uploaded = True

    def upload_aud(self, audio: Union[str, Tuple[int, np.ndarray]], conversation: Conversation, emb_list: List[Tensor]):
        """Encode ``audio``, append its embedding to ``emb_list`` and add an
        ``<Audio>`` placeholder message from the human to ``conversation``."""
        audio = load_audio(audio, self.processors[ModalityType.AUDIO]).to(self.device)
        audio = audio.float()
        all_embeddings = self.model.encode_inputs({ModalityType.AUDIO: audio})
        audio_emb = all_embeddings[ModalityType.AUDIO]
        emb_list.append(audio_emb)
        conversation.append_message(Roles.HUMAN, "<Audio><ModalityHere></Audio>")
        self.just_uploaded = True

    def get_context_emb(self, conversation: Conversation, emb_list: List[Tensor]):
        """Build the model input by interleaving text and modality embeddings.

        The rendered prompt is split at every ``<ModalityHere>`` placeholder,
        each text segment is tokenized and embedded, and the i-th entry of
        ``emb_list`` is inserted after the i-th segment. Everything is
        concatenated along dimension 1.

        NOTE: Assumes the placeholders have been aligned to the embeddings!
        """
        prompt = conversation.get_prompt()
        print(prompt)
        prompt_segs = prompt.split('<ModalityHere>')
        assert len(prompt_segs) == len(emb_list) + 1, "Unmatched numbers of placeholders and embeddings."
        seg_tokens = [
            self.model.llama_tokenizer(
                seg, return_tensors="pt", add_special_tokens=i == 0).to(self.device).input_ids
            # only add bos to the first seg
            for i, seg in enumerate(prompt_segs)
        ]
        seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
        # seg0, emb0, seg1, emb1, ..., last segment.
        mixed_embs = [emb for pair in zip(seg_embs[:-1], emb_list) for emb in pair] + [seg_embs[-1]]
        mixed_embs = torch.cat(mixed_embs, dim=1)
        return mixed_embs