|
from transformers import PretrainedConfig |
|
from typing import List |
|
|
|
from transformers import Qwen2Config, CLIPVisionConfig |
|
|
|
class InfMLLMUnifiedHDChatConfig(PretrainedConfig):
    """Configuration object for the InfMLLM unified HD chat model.

    The constructor stores every argument as an attribute of the same name,
    with one exception: the (misspelled) ``vison_config`` argument is stored
    as ``self.vision_config``. The two nested configurations may be given
    either as ready-made config objects or as plain dictionaries;
    dictionaries are expanded into ``CLIPVisionConfig`` / ``Qwen2Config``.
    """

    def __init__(
        self,
        vison_config=None,
        lm_config=None,
        lm_model="",
        lm_tokenizer="",
        lora_modules="",
        lora_llm=False,
        lora_r=128,
        lora_alpha=256,
        lora_dropout=0,

        encoder_img="",
        image_size_img=336,
        lora_encoder_img=False,
        hd_num=9,

        encoder_video="",

        max_txt_len=4096,
        conv_style='qwen-7b-chat',
        precision="bf16",
        **kwargs
    ):
        """Store the settings and normalise the nested sub-configurations.

        Args:
            vison_config: Vision-encoder configuration, either a mapping of
                ``CLIPVisionConfig`` keyword arguments or an already built
                object (kept as-is). The misspelled name is retained for
                backward compatibility; ``vision_config`` passed as a keyword
                is accepted as a fallback when this argument is ``None``.
            lm_config: Language-model configuration, either a mapping of
                ``Qwen2Config`` keyword arguments or an already built object
                (kept as-is).
            lm_model, lm_tokenizer: Stored unchanged (presumably identifiers
                or paths of the language model and tokenizer -- confirm
                against the model code).
            lora_modules, lora_llm, lora_r, lora_alpha, lora_dropout: Stored
                unchanged (presumably LoRA settings for the language model --
                confirm against the model code).
            encoder_img, image_size_img, lora_encoder_img, hd_num: Stored
                unchanged (presumably image-encoder settings -- confirm
                against the model code).
            encoder_video: Stored unchanged.
            max_txt_len, conv_style, precision: Stored unchanged.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``.
        """
        # Language-model side settings.
        self.lm_model = lm_model
        self.lm_tokenizer = lm_tokenizer
        self.lora_modules = lora_modules
        self.lora_llm = lora_llm
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout

        # Image-encoder side settings.
        self.encoder_img = encoder_img
        self.image_size_img = image_size_img
        self.lora_encoder_img = lora_encoder_img
        self.hd_num = hd_num

        self.encoder_video = encoder_video

        self.max_txt_len = max_txt_len
        self.conv_style = conv_style

        self.precision = precision

        # The attribute is named ``vision_config`` while the parameter is
        # spelled ``vison_config``. A config serialized from this object's
        # attributes therefore carries the key ``vision_config``, which
        # arrives here through **kwargs on reload. Pull it out so it is
        # rebuilt into a CLIPVisionConfig instead of being set verbatim by
        # the base-class constructor.
        vision_config_alias = kwargs.pop("vision_config", None)
        if vison_config is None:
            vison_config = vision_config_alias

        self.vision_config = self._build_sub_config(vison_config, CLIPVisionConfig)
        self.lm_config = self._build_sub_config(lm_config, Qwen2Config)

        # Called last, as in the original, so the base class processes the
        # remaining keyword arguments after the fields above are set.
        super().__init__(**kwargs)

    @staticmethod
    def _build_sub_config(value, config_cls):
        """Return ``config_cls(**value)`` for mappings, else ``value`` unchanged."""
        # isinstance (rather than an exact type comparison) also accepts dict
        # subclasses such as OrderedDict.
        if isinstance(value, dict):
            return config_cls(**value)
        return value
|
|