DataCanvas
/

MMAlaya

 # MMAlaya
+MMAlaya是基于大语言模型[Alaya](https://github.com/DataCanvasIO/Alaya)的多模态模型。
+MMAlaya包含以下三个模块：
+<br>1，大语言模型Alaya。
+<br>2，图像文本特征编码器[blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b)
+<br>3，图像文本特征到大语言模型的线性投影器。
+模型的训练主要基于[LLaVA](https://github.com/haotian-liu/LLaVA)架构。
+2024.01.23 最终在[MMBench](https://mmbench.opencompass.org.cn)线上测试中文测试集分数为56.9，总排名为第20名，7B模型的第9名。英文测试集分数为59.8，总排名为第29名，7B模型的第12名。

inference.py CHANGED Viewed

@@ -12,10 +12,10 @@ import argparse
 def main(args):
     disable_torch_init()
     conv_mode = "mmalaya_llama"
     model_path = args.model_path
-    tokenizer, model, image_processor, context_len = load_pretrained_model(
         model_path=model_path,
         )
     prompts = [
@@ -27,21 +27,25 @@ def main(args):
     time1 = time.time()
     for prompt in prompts:
         conv = conv_templates[conv_mode].copy()
         inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
         conv.append_message(conv.roles[0], inp)
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()
         input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         if conv_mode == 'mmalaya_llama':
             stop_str = conv.sep2
         keywords = [stop_str]
         stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=20.0)
         image = Image.open('./data/chang_chen.jpg').convert("RGB")
         image_tensor = image_processor(image, return_tensors='pt')['pixel_values'].half().cuda()
         with torch.inference_mode():
             generate_ids = model.generate(
                 inputs=input_ids,
@@ -55,7 +59,7 @@ def main(args):
                 use_cache=True,
                 stopping_criteria=[stopping_criteria],
                 )
-            # remove input_ids in generate_ids
             input_token_len = input_ids.shape[1]
             output = tokenizer.batch_decode(
                 generate_ids[:, input_token_len:],

 def main(args):
     disable_torch_init()
     conv_mode = "mmalaya_llama"
     model_path = args.model_path
+    # 加载model，tokenizer，image_processor
+    tokenizer, model, image_processor, _ = load_pretrained_model(
         model_path=model_path,
         )
     prompts = [
     time1 = time.time()
     for prompt in prompts:
+        # 加载对话模板
         conv = conv_templates[conv_mode].copy()
         inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
         conv.append_message(conv.roles[0], inp)
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()
+        # 对prompt进行分词
         input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+        # 加载generate stop条件
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         if conv_mode == 'mmalaya_llama':
             stop_str = conv.sep2
         keywords = [stop_str]
         stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=20.0)
+        # 加载图像
         image = Image.open('./data/chang_chen.jpg').convert("RGB")
         image_tensor = image_processor(image, return_tensors='pt')['pixel_values'].half().cuda()
+        # 推理
         with torch.inference_mode():
             generate_ids = model.generate(
                 inputs=input_ids,
                 use_cache=True,
                 stopping_criteria=[stopping_criteria],
                 )
+            # 截断generate_ids中的input_ids，然后解码为文本
             input_token_len = input_ids.shape[1]
             output = tokenizer.batch_decode(
                 generate_ids[:, input_token_len:],