jaronfei committed
Commit d3b2612 • 1 parent: b0abd68

first commit

Browse files:
- README.md +73 -0
- eval.py +257 -0
- llm_adapter/README.md +202 -0
- llm_adapter/adapter_config.json +31 -0
- llm_adapter/adapter_model.safetensors +3 -0
- projector/config.json +38 -0
- projector/configuration_ccam_projector.py +42 -0
- projector/model.safetensors +3 -0
- projector/modeling_ccam_projector.py +203 -0
- ref_results/output_w_sub.json +0 -0
- ref_results/output_wo_sub.json +0 -0
- videoccam.py +312 -0
- visual_encoder_adapter/README.md +202 -0
- visual_encoder_adapter/adapter_config.json +36 -0
- visual_encoder_adapter/adapter_model.safetensors +3 -0
README.md
CHANGED
@@ -1,3 +1,76 @@
---
license: mit
---

## Model Summary

Video-CCAM-4B is a lightweight Video-MLLM built on [Phi-3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [SigLIP SO400M](https://huggingface.co/google/siglip-so400m-patch14-384).

## Usage

Inference uses Hugging Face Transformers on NVIDIA GPUs. The requirements were tested with Python 3.10:
```
torch==2.1.0
torchvision==0.16.0
transformers==4.40.2
peft==0.10.0
pyarrow==13.0.0  # load parquet
decord==0.6.0    # load video
pysubs2==1.7.2   # load subtitle
```

### Sample Inference Code

```python
import torch

from eval import load_video
from videoccam import VideoCCAM

video_path = 'assets/example.mp4'
question = 'Can you please describe what happens in the video in detail?'

sample_config = dict(
    sample_type='uniform',
    num_frames=32
)

mllm = VideoCCAM(
    model_path='.',
    chat_template='<|user|>\n{input}<|end|>\n<|assistant|>\n',
    generation_args=dict(
        stop_tokens=['<|end|>', '<|endoftext|>'],
        max_new_tokens=512,
        do_sample=False,
        num_beams=5,
    ),
    llm_name_or_path='microsoft/Phi-3-mini-4k-instruct',              # you can replace this with a local directory if the model has already been downloaded
    visual_encoder_name_or_path='google/siglip-so400m-patch14-384',   # you can replace this with a local directory if the model has already been downloaded
    special_tokens=['<time>', '</time>'],
    visual_select_layer=-2,
    torch_dtype=torch.bfloat16,
    device_map='cuda:0'
)

frames = load_video(video_path, **sample_config)
# the prompt must contain exactly one <video> placeholder, which is replaced by the visual tokens
response = mllm.generate(texts=[f'<video>\n{question}'], videos=[frames])[0]

print(response)
```

### Video-MME Evaluation

Running the command below is expected to reproduce the results of 48.2 (without subtitles) and 52.9 (with subtitles). By default, the results are saved as `output_wo_sub.json` and `output_w_sub.json` in the current directory. Our reference results are provided in the `ref_results` directory.

```
python eval.py
```
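
For a quick sanity check of the saved predictions, a minimal scoring sketch is given below. It is an illustration only (not part of the repository; use the official Video-MME evaluation scripts for reported numbers) and assumes the predicted option is the first A/B/C/D letter found in each `response` field:

```python
import json
import re

with open('output_wo_sub.json') as f:
    outputs = json.load(f)

correct = total = 0
for video in outputs:
    for q in video['questions']:
        m = re.search(r'[ABCD]', q['response'])  # first option letter in the model response
        correct += int(m is not None and m.group(0) == q['answer'])
        total += 1

print(f'overall accuracy: {correct / total:.3f} ({correct}/{total})')
```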

## Acknowledgement

* [xtuner](https://github.com/InternLM/xtuner): Video-CCAM-4B is trained with the xtuner framework. Thanks for their excellent work!
* [Phi-3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct): A great small language model developed by Microsoft.
* [SigLIP SO400M](https://huggingface.co/google/siglip-so400m-patch14-384): An outstanding vision encoder developed by Google.

## License

The model is licensed under the MIT license.
eval.py
ADDED
@@ -0,0 +1,257 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
================================================
@author: Jaron
@time: 2024/06/23 12:59:38
@email: fjjth98@163.com
@description: Evaluate MLLM on Video-MME Benchmark
================================================
"""

import json
import torch
import pysubs2
import os.path as osp

from PIL import Image
from tqdm import tqdm
from typing import Any
from copy import deepcopy
from pandas import read_parquet
from decord import VideoReader, cpu
from torch.utils.data import Dataset, DataLoader, default_collate


def video_collate_fn(batch: Any) -> Any:
    """Collate function for dict inputs containing videos; supports a variable number of frames per sample.

    Args:
        batch (Any): list of samples, each a dict that may contain a 'video' key

    Returns:
        Any: collated batch, with 'video' kept as a list of frame lists
    """
    if isinstance(batch[0], dict) and 'video' in batch[0]:
        video = [b.pop('video') for b in batch]
        batch = default_collate(batch)
        batch['video'] = video
    else:
        batch = default_collate(batch)
    return batch


def uniform_indices(num_frames: int, total_frames: int) -> list[int]:
    """Get uniformly spaced frame indices.

    Args:
        num_frames (int): number of frames to sample
        total_frames (int): total number of frames in the video

    Returns:
        list[int]: output frame indices
    """
    if num_frames < total_frames:
        splits = torch.linspace(0, total_frames, num_frames + 1, dtype=int)
        indices = ((splits[:-1] + splits[1:]) // 2).tolist()
    else:
        indices = list(range(total_frames))

    return indices


def fps_indices(input_fps: float, total_frames: int, output_fps: float = None, max_num_frames: int = -1) -> list[int]:
    """Get frame indices according to the output fps.

    Args:
        input_fps (float): input fps
        total_frames (int): total number of frames
        output_fps (float, optional): output fps. Defaults to None, meaning output_fps == input_fps.
        max_num_frames (int, optional): maximum number of frames. Defaults to -1, meaning no limit.

    Returns:
        list[int]: output frame indices
    """
    delta = 1 if output_fps is None else input_fps / output_fps
    indices = torch.arange(0, total_frames, delta).round().to(int)
    indices = [e for e in indices if e < total_frames]
    if 0 < max_num_frames < len(indices):
        indices = indices[:max_num_frames]

    return indices


def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs) -> list[Image.Image] | tuple[list[Image.Image], str]:
    """Load a video using decord and optionally load its subtitles.

    Args:
        src_path (str): video path
        sample_type (str): 'uniform' or 'fps'
        sub_path (str): subtitle path (.srt)
        kwargs: for 'uniform', requires 'num_frames'; for 'fps', optionally 'output_fps' and 'max_num_frames'

    Returns:
        list[Image.Image] | tuple[list[Image.Image], str]: frame list, plus the subtitle text if sub_path is given
    """
    vr = VideoReader(src_path, ctx=cpu(0), num_threads=1)
    total_frames = len(vr)
    if sample_type == 'uniform':
        num_frames = kwargs.pop('num_frames')
        indices = uniform_indices(num_frames, total_frames)
    elif sample_type == 'fps':
        input_fps = float(vr.get_avg_fps())
        output_fps = kwargs.pop('output_fps', None)
        max_num_frames = kwargs.pop('max_num_frames', -1)
        indices = fps_indices(input_fps, total_frames, output_fps, max_num_frames)
    else:
        raise ValueError(f'Do not support {sample_type} sample type')
    frames = vr.get_batch(indices).asnumpy()  # (T, H, W, C), np.uint8
    frames = [Image.fromarray(frame) for frame in frames]

    if sub_path is None:
        return frames
    elif osp.exists(sub_path):
        subs = pysubs2.load(sub_path, encoding='utf-8')
        subtitles = []
        for idx in indices:
            sub_text = []
            cur_time = pysubs2.make_time(fps=float(vr.get_avg_fps()), frames=idx)
            for sub in subs:
                if sub.end < cur_time:
                    continue
                elif sub.start < cur_time:
                    sub_text.append(sub.text)
                else:
                    break
            sub_text = ' '.join(sub_text)
            if sub_text.strip():
                subtitles.append(sub_text)
        subtitles = '\n'.join(subtitles)
        return frames, subtitles
    else:
        return frames, ''


class VideoMMEDataset(Dataset):

    def __init__(self, dataset_path: str, sample_config: dict, use_subtitle: bool = False):
        super().__init__()
        self.dataset_path = dataset_path
        self.sample_config = sample_config
        self.use_subtitle = use_subtitle

        data_dict = {}
        index_keys = ['video_id', 'duration', 'domain', 'sub_category', 'videoID']
        value_keys = ['question_id', 'task_type', 'question', 'options', 'answer']
        df = read_parquet(osp.join(dataset_path, 'videomme', 'test-00000-of-00001.parquet'))
        df['options'] = df['options'].apply(list)
        for _, data in df.iterrows():
            key = tuple(data[k] for k in index_keys)
            value = data[value_keys].to_dict()
            if key in data_dict:
                data_dict[key].append(value)
            else:
                data_dict[key] = [value]
        self.data_list = [dict(zip(index_keys + ['questions'], list(k) + [v])) for k, v in data_dict.items()]

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx) -> dict:
        if self.use_subtitle:
            frames, subtitles = load_video(
                src_path=osp.join(self.dataset_path, 'video', self.data_list[idx]['videoID'] + '.mp4'),
                sub_path=osp.join(self.dataset_path, 'subtitle', self.data_list[idx]['videoID'] + '.srt'),
                **self.sample_config
            )
            text = ['\n'.join([
                "This video's subtitles are listed below:",
                subtitles,
                'Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.',
                i['question']
            ] + i['options']) for i in self.data_list[idx]['questions']]
        else:
            frames = load_video(
                src_path=osp.join(self.dataset_path, 'video', self.data_list[idx]['videoID'] + '.mp4'),
                **self.sample_config
            )
            text = ['\n'.join([
                'Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.',
                i['question']
            ] + i['options']) for i in self.data_list[idx]['questions']]

        return dict(
            video=frames,
            text=text
        )


if __name__ == '__main__':

    from videoccam import VideoCCAM, DEFAULT_VIDEO_TOKEN

    mllm = VideoCCAM(
        model_path='.',
        chat_template='<|user|>\n{input}<|end|>\n<|assistant|>\n',
        generation_args=dict(
            stop_tokens=['<|end|>', '<|endoftext|>'],
            max_new_tokens=512,
            do_sample=False
        ),
        llm_name_or_path='microsoft/Phi-3-mini-4k-instruct',
        visual_encoder_name_or_path='google/siglip-so400m-patch14-384',
        special_tokens=['<time>', '</time>'],
        visual_select_layer=-2,
        torch_dtype=torch.bfloat16,
        device_map='cuda:0'
    )
    mllm.eval()

    dataset = VideoMMEDataset(
        dataset_path='your/dataset/path',
        sample_config=dict(
            sample_type='uniform',
            num_frames=32
        )
    )

    with torch.inference_mode():
        for use_subtitle in (False, True):  # run both the no-subtitle and subtitle settings
            dataset.use_subtitle = use_subtitle
            dataloader = DataLoader(
                dataset,
                batch_size=4,
                num_workers=8,
                shuffle=False,
                pin_memory=True,
                collate_fn=video_collate_fn
            )
            results = []
            for data in tqdm(dataloader):
                # answer the first question of each video and cache the visual features
                response, pixel_values = mllm.generate(
                    texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][0]],
                    videos=data['video'],
                    return_pixel_values=True
                )
                response = [response]
                # reuse the cached visual features for the remaining questions
                for i in range(1, len(data['text'])):
                    response.append(mllm.generate(
                        texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][i]],
                        pixel_values=pixel_values
                    ))
                # transpose from (num_questions, batch) to (batch, num_questions)
                response = [[response[i][j] for i in range(len(response))] for j in range(len(response[0]))]
                results.extend(response)

            outputs = []
            for data, responses in zip(dataset.data_list, results):
                data = deepcopy(data)
                data.pop('videoID')
                for question, response in zip(data['questions'], responses):
                    question['response'] = response
                outputs.append(data)

            suffix = 'w_sub' if use_subtitle else 'wo_sub'
            with open(f'output_{suffix}.json', 'w') as f:
                json.dump(outputs, f, indent=4, ensure_ascii=False)
llm_adapter/README.md
ADDED
@@ -0,0 +1,202 @@
---
base_model: /group/40006/jaronfei/models/Phi-3-mini-4k-instruct
library_name: peft
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]

### Framework versions

- PEFT 0.11.1
llm_adapter/adapter_config.json
ADDED
@@ -0,0 +1,31 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "/group/40006/jaronfei/models/Phi-3-mini-4k-instruct",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 256,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 512,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "gate_up_proj",
    "down_proj",
    "o_proj",
    "qkv_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
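
This LoRA configuration (rank 512, alpha 256, targeting the Phi-3 attention and MLP projections) is consumed in `videoccam.py` through `PeftModel.from_pretrained`. A minimal standalone sketch, assuming the base model is pulled from the Hub rather than the local path recorded in `base_model_name_or_path`:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# base LLM; note that videoccam.py also resizes the token embeddings for the
# <time> special tokens before attaching the adapter, which is omitted here
llm = AutoModelForCausalLM.from_pretrained(
    'microsoft/Phi-3-mini-4k-instruct',
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
llm = PeftModel.from_pretrained(llm, 'llm_adapter')  # directory containing this config
```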
llm_adapter/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e10c0045b6d740a7b065e483419c6d468bb55e87d8369e1e6e2272d716481c9
size 1610648152
projector/config.json
ADDED
@@ -0,0 +1,38 @@
{
  "architectures": [
    "CCAMModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_ccam_projector.CCAMConfig",
    "AutoModel": "modeling_ccam_projector.CCAMModel"
  },
  "cross_attention_frequency": 1,
  "encoder_hidden_size": 1152,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1152,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "ccam_projector",
  "num_attention_heads": 18,
  "num_hidden_layers": 1,
  "num_query_tokens": 1024,
  "output_size": 3072,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "query_attn_mask_type": "full",
  "spatial_pos_embed_type": "none",
  "spatial_resolution": [
    1,
    1
  ],
  "temporal_pos_embed_type": "none",
  "temporal_resolution": 16,
  "torch_dtype": "float16",
  "transformers_version": "4.41.2",
  "visual_attn_mask_type": "ccam",
  "vocab_size": 30522
}
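
The `auto_map` entries above let the projector be loaded through the Transformers auto classes with `trust_remote_code=True`, which is how `videoccam.py` loads it. A minimal sketch (the `'projector'` path is illustrative):

```python
import torch
from transformers import AutoModel

# 'projector' is the subdirectory containing this config; trust_remote_code resolves
# CCAMConfig / CCAMModel through the auto_map above
projector = AutoModel.from_pretrained(
    'projector',
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
# forward(visual_embeds, split_sizes) expects (T, L, 1152) SigLIP patch features for all
# frames in the batch concatenated along T, and returns (B, 1024, 3072) query tokens
```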
projector/configuration_ccam_projector.py
ADDED
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
================================================
@author: Jaron
@time: 2024/02/20 16:37:16
@email: fjjth98@163.com
@description: different projector in Video-LLM
================================================
"""

from transformers.models.blip_2.configuration_blip_2 import Blip2QFormerConfig


class CCAMConfig(Blip2QFormerConfig):
    model_type = 'ccam_projector'
    _auto_class = 'AutoConfig'

    def __init__(
        self,
        spatial_pos_embed_type: str = 'learnable',     # ['none', 'learnable', 'cosine']
        spatial_resolution: tuple[int, int] = (1, 1),  # (H, W)
        temporal_pos_embed_type: str = 'learnable',
        temporal_resolution: int = 0,                  # T
        num_query_tokens: int = 512,
        visual_attn_mask_type: str = 'ccam',           # ['ccam', 'full']
        query_attn_mask_type: str = 'full',            # ['causal', 'full']
        num_hidden_layers=1,
        cross_attention_frequency=1,
        output_size=4096,           # llm dimension
        encoder_hidden_size=1024,   # visual dimension
        hidden_size=1024,
        vocab_size=30522,
        num_attention_heads=16,
        intermediate_size=4096,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        **kwargs
    ):
        super().__init__(
            vocab_size, hidden_size, num_hidden_layers, num_attention_heads, intermediate_size,
            hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, max_position_embeddings,
            initializer_range, layer_norm_eps, pad_token_id, position_embedding_type,
            cross_attention_frequency, encoder_hidden_size, **kwargs
        )
        self.spatial_pos_embed_type = spatial_pos_embed_type
        self.spatial_resolution = spatial_resolution
        self.temporal_pos_embed_type = temporal_pos_embed_type
        self.temporal_resolution = temporal_resolution
        self.num_query_tokens = num_query_tokens
        self.visual_attn_mask_type = visual_attn_mask_type
        self.query_attn_mask_type = query_attn_mask_type
        self.output_size = output_size
projector/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86d539c564feda72963eb5a2f2d712112a29afa8cc42f3738cffbe38c637c18a
size 47613856
projector/modeling_ccam_projector.py
ADDED
@@ -0,0 +1,203 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
================================================
@author: Jaron
@time: 2024/02/20 16:21:56
@email: fjjth98@163.com
@description: QFormer projector, convert image and video into fixed-length tokens
================================================
"""

import math
import torch
import torch.nn as nn
from torch.nn.functional import interpolate
from transformers.models.blip_2.modeling_blip_2 import Blip2QFormerModel, Blip2QFormerEncoder

from .configuration_ccam_projector import CCAMConfig


class SimpleQFormerOutput(nn.Module):
    # replace the last residual MLP with a plain MLP

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.output_size)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor = None) -> torch.Tensor:
        return self.dense(hidden_states)


class SimpleQFormerIdentity(nn.Module):
    # replace the first self-attention module with an identity, since it is not needed

    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        return hidden_states,


class CCAMModel(Blip2QFormerModel):
    _auto_class = 'AutoModel'
    config_class = CCAMConfig
    base_model_prefix = 'model'
    supports_gradient_checkpointing = True

    def __init__(self, config: CCAMConfig):
        super(Blip2QFormerModel, self).__init__(config)
        self.gradient_checkpointing = False
        self.config = config
        self.num_query_tokens = config.num_query_tokens
        self.visual_attn_mask_type = config.visual_attn_mask_type

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.encoder = Blip2QFormerEncoder(config)
        self.encoder.layer[0].attention = SimpleQFormerIdentity()   # replace the 1st attention module with identity
        self.encoder.layer[-1].output_query = SimpleQFormerOutput(config)

        # initialize query tokens
        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.hidden_size))

        # initialize position embeddings
        self.spatial_pos_embed = self._create_pos_embed(*config.spatial_resolution, type=config.spatial_pos_embed_type)    # (H, W, C)
        self.temporal_pos_embed = self._create_pos_embed(config.temporal_resolution, type=config.temporal_pos_embed_type)  # (T, C)

        # initialize query attention mask
        if config.query_attn_mask_type == 'full':
            self.query_attn_mask = None
        elif config.query_attn_mask_type == 'causal':
            query_attn_mask = torch.ones(self.num_query_tokens, self.num_query_tokens)
            q = torch.arange(self.num_query_tokens)
            query_attn_mask.masked_fill_(q > q[:, None], 0)
            self.query_attn_mask = query_attn_mask[None]
        else:
            raise NotImplementedError(f'Do not support {config.query_attn_mask_type} query_attn_mask')

        self.post_init()

    def _create_pos_embed(self, *size: int, type: str = 'none') -> torch.Tensor:
        C = self.config.encoder_hidden_size
        if type == 'none':
            pos_embed = None
        elif type == 'learnable':
            pos_embed = nn.Parameter(.02 * torch.randn(*size, C))
        elif type == 'cosine':
            total_len = 1
            for i in size:
                total_len *= i
            raw = torch.outer(torch.arange(total_len), torch.exp(torch.arange(0, C, 2) * (-math.log(10000.) / C)))
            pos_embed = nn.Parameter(torch.stack((raw.sin(), raw.cos()), dim=-1).view(*size, C), requires_grad=False)
        else:
            raise NotImplementedError(f'Do not support {type} position embeddings')
        return pos_embed

    def get_attn_mask(self, embeddings: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Get visual_attn_mask and query_attn_mask if needed.

        embeddings (torch.Tensor): (B, T, L, C)
        """
        B, T, L, _ = embeddings.size()
        device = embeddings.device

        # the visual attention mask only applies to videos
        if T > 1:
            if self.visual_attn_mask_type == 'ccam':
                base_attn_mask = torch.ones(T, T, device=device)
                t = torch.arange(T, device=device)
                base_attn_mask.masked_fill_(t > t[:, None], 0)
                visual_attn_mask = torch.cat((
                    torch.kron(
                        base_attn_mask,
                        torch.ones(self.num_query_tokens // T, L, device=device)
                    ),
                    torch.ones(self.num_query_tokens % T, T * L, device=device)
                ), dim=0)[None].expand(B, -1, -1)
            elif self.visual_attn_mask_type == 'full':
                visual_attn_mask = None
            else:
                raise NotImplementedError(f'Do not support {self.visual_attn_mask_type} attn_mask')
        else:
            visual_attn_mask = None

        if self.query_attn_mask is None:
            query_attn_mask = None
        else:
            query_attn_mask = self.query_attn_mask.expand(B, -1, -1)

        return visual_attn_mask, query_attn_mask

    def batch_forward_no_spatial(self, visual_embeds: torch.Tensor) -> torch.Tensor:
        """Batched forward without spatial position embeddings.

        Args:
            visual_embeds (torch.Tensor): (B, T, L, C)

        Returns:
            torch.Tensor: (B, Q, C)
        """
        B, T, _, C = visual_embeds.size()
        query_embeds = self.query_tokens.expand(B, -1, -1)
        visual_attn_mask, query_attn_mask = self.get_attn_mask(visual_embeds)

        # add temporal position embeddings
        if self.temporal_pos_embed is not None:
            if T == self.temporal_pos_embed.size(0):
                pos_embed = self.temporal_pos_embed
            elif T == 1:
                pos_embed = 0. * self.temporal_pos_embed[:1]  # for deepspeed
            else:
                pos_embed = interpolate(
                    self.temporal_pos_embed.T[None],  # (1, C, t)
                    size=(T,),
                    mode='linear',
                    align_corners=False
                )[0].T  # (T, C)
            visual_embeds = visual_embeds + pos_embed.view(1, T, 1, C)
        visual_embeds = visual_embeds.flatten(1, 2)

        return super().forward(
            query_embeds=query_embeds,
            attention_mask=query_attn_mask,
            encoder_hidden_states=visual_embeds,
            encoder_attention_mask=visual_attn_mask
        )[0]

    def forward(self, visual_embeds: torch.Tensor, split_sizes: list[int], unmasked_ids: torch.LongTensor = None):
        """
        visual_embeds (torch.Tensor): (T, L, C)
        split_sizes (list[int]): [t0, t1, ...] with sum_i ti == T
        unmasked_ids (torch.LongTensor): if provided, shape (T, L) with values in [0, H*W - 1]
        """
        _, L, C = visual_embeds.size()

        # add spatial position embeddings
        if self.spatial_pos_embed is not None:
            pos_embed = self.spatial_pos_embed.view(-1, C)  # (H*W, C)
            if unmasked_ids is None:
                pos_embed = pos_embed.view(1, L, C)  # if not provided, L must equal H*W
            else:
                pos_embed = pos_embed[unmasked_ids]  # (T, L, C)
            visual_embeds = visual_embeds + pos_embed

        # all inputs in this batch have the same number of frames
        if len(set(split_sizes)) == 1:
            visual_embeds = visual_embeds.view(len(split_sizes), split_sizes[0], L, C)
            output = self.batch_forward_no_spatial(visual_embeds)
        else:
            visual_embeds = visual_embeds.split(split_sizes, dim=0)
            # group visual_embeds according to the number of frames
            output, group_visual_embeds = [None] * len(split_sizes), {}
            for idx, (embed, t) in enumerate(zip(visual_embeds, split_sizes)):
                if t in group_visual_embeds:
                    group_visual_embeds[t][0].append(idx)
                    group_visual_embeds[t][1].append(embed)
                else:
                    group_visual_embeds[t] = [[idx], [embed]]
            for idx, embeds in group_visual_embeds.values():
                cur_output = self.batch_forward_no_spatial(torch.stack(embeds, dim=0))
                for i, j in enumerate(idx):
                    output[j] = cur_output[i]
            output = torch.stack(output, dim=0)

        return output
ref_results/output_w_sub.json
ADDED
The diff for this file is too large to render.
See raw diff
ref_results/output_wo_sub.json
ADDED
The diff for this file is too large to render.
See raw diff
videoccam.py
ADDED
@@ -0,0 +1,312 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
================================================
@author: Jaron
@time: 2024/06/23 09:52:24
@email: fjjth98@163.com
@description: Video-CCAM wrapper (LLM + SigLIP visual encoder + CCAM projector) for inference
================================================
"""

import torch
import os.path as osp
import torch.nn as nn

from PIL import Image
from peft import PeftModel
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, SiglipVisionModel, SiglipImageProcessor


IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = '<image>'
DEFAULT_VIDEO_TOKEN = '<video>'


class VideoCCAM(nn.Module):

    def __init__(
        self,
        model_path: str,
        chat_template: str,
        generation_args: dict,
        llm_name_or_path: str = None,
        visual_encoder_name_or_path: str = None,
        special_tokens: list[str] = None,
        visual_select_layer: int = -2,
        torch_dtype: torch.dtype = torch.float16,
        device_map: str = 'cuda:0'
    ):
        super().__init__()
        self.chat_template = chat_template
        self.generation_args = generation_args
        self.visual_select_layer = visual_select_layer
        self.torch_dtype = torch_dtype
        self.device_map = device_map

        if llm_name_or_path is None:
            llm_name_or_path = model_path
        if visual_encoder_name_or_path is None:
            visual_encoder_name_or_path = osp.join(model_path, 'visual_encoder')
            assert osp.exists(visual_encoder_name_or_path), f'{visual_encoder_name_or_path} does not exist, you have to specify `visual_encoder_name_or_path`'
        projector_path = osp.join(model_path, 'projector')
        assert osp.exists(projector_path), f'{projector_path} does not exist, you have to change `model_path`'

        self.llm = AutoModelForCausalLM.from_pretrained(
            llm_name_or_path,
            trust_remote_code=True,
            torch_dtype=torch_dtype,
            device_map=device_map
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            llm_name_or_path,
            trust_remote_code=True
        )
        print(f'Load LLM from {llm_name_or_path}')
        if special_tokens is not None:
            self.llm.resize_token_embeddings(self.llm.get_input_embeddings().weight.size(0) + len(special_tokens))
            self.llm.requires_grad_(False)
            self.llm.get_input_embeddings().weight[-len(special_tokens):].zero_()
            self.tokenizer.add_tokens(special_tokens, special_tokens=True)
            print(f'Add special_tokens {special_tokens} to LLM and tokenizer')
        if osp.exists(adapter_path := osp.join(model_path, 'llm_adapter')):
            self.llm = PeftModel.from_pretrained(self.llm, adapter_path)
            print(f'Load LLM adapter from {adapter_path}')
        self.generation_args['eos_token_id'] = self.tokenizer.convert_tokens_to_ids(self.generation_args.pop('stop_tokens'))

        self.visual_encoder = SiglipVisionModel.from_pretrained(
            visual_encoder_name_or_path,
            torch_dtype=torch_dtype,
            device_map=device_map
        )
        self.image_processor = SiglipImageProcessor.from_pretrained(visual_encoder_name_or_path)
        print(f'Load SigLIP visual encoder from {visual_encoder_name_or_path}')
        if osp.exists(adapter_path := osp.join(model_path, 'visual_encoder_adapter')):
            self.visual_encoder = PeftModel.from_pretrained(self.visual_encoder, adapter_path)
            print(f'Load visual_encoder adapter from {adapter_path}')

        self.projector = AutoModel.from_pretrained(
            projector_path,
            torch_dtype=torch_dtype,
            device_map=device_map,
            trust_remote_code=True
        )
        print(f'Load projector from {projector_path}')

    # Modified from https://github.com/InternLM/xtuner/blob/main/xtuner/model/utils.py#L138
    def prepare_inputs_labels_for_multimodal(
        self,
        input_ids: torch.LongTensor = None,
        position_ids: torch.LongTensor = None,
        attention_mask: torch.Tensor = None,
        past_key_values: list[torch.FloatTensor] = None,
        labels: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None
    ):
        if pixel_values is None:
            return {
                'input_ids': input_ids,
                'position_ids': position_ids,
                'attention_mask': attention_mask,
                'past_key_values': past_key_values,
                'inputs_embeds': None,
                'labels': labels
            }

        _labels = labels
        _position_ids = position_ids
        _attention_mask = attention_mask
        if attention_mask is None:
            if isinstance(input_ids, torch.Tensor):
                attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
            elif isinstance(input_ids, list):
                attention_mask = [torch.ones_like(i, dtype=torch.bool) for i in input_ids]
                _attention_mask = attention_mask
            else:
                raise ValueError(f'Do not support {type(input_ids)} type as input_ids')
        else:
            attention_mask = attention_mask.bool()
        if position_ids is None:
            position_ids = torch.arange(
                0, input_ids[0].shape[0], dtype=torch.long, device=input_ids[0].device)
        if labels is None:
            if isinstance(input_ids, torch.Tensor):
                labels = torch.full_like(input_ids, IGNORE_INDEX)
            elif isinstance(input_ids, list):
                labels = [torch.full_like(i, IGNORE_INDEX) for i in input_ids]
            else:
                raise ValueError(f'Do not support {type(input_ids)} type as input_ids')

        # remove the padding using attention_mask -- TODO: double check
        input_ids = [
            cur_input_ids[cur_attention_mask]
            for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
        ]
        labels = [
            cur_labels[cur_attention_mask]
            for cur_labels, cur_attention_mask in zip(labels, attention_mask)
        ]

        new_inputs_embeds = []
        new_labels = []
        cur_image_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
            if num_images == 0:
                cur_pixel_values = pixel_values[cur_image_idx]
                cur_inputs_embeds_1 = self.llm.get_input_embeddings()(cur_input_ids)
                cur_inputs_embeds = torch.cat(
                    [cur_inputs_embeds_1, cur_pixel_values[0:0]], dim=0)
                new_inputs_embeds.append(cur_inputs_embeds)
                new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue

            image_token_indices = [-1] + torch.where(
                cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [
                    cur_input_ids.shape[0]
                ]
            cur_input_ids_noim = []
            cur_labels = labels[batch_idx]
            cur_labels_noim = []
            for i in range(len(image_token_indices) - 1):
                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1:image_token_indices[i + 1]])
                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1:image_token_indices[i + 1]])
            split_sizes = [x.shape[0] for x in cur_labels_noim]
            cur_inputs_embeds = self.llm.get_input_embeddings()(
                torch.cat(cur_input_ids_noim))
            cur_inputs_embeds_no_im = torch.split(
                cur_inputs_embeds, split_sizes, dim=0)
            cur_new_inputs_embeds = []
            cur_new_labels = []

            for i in range(num_images + 1):
                cur_new_inputs_embeds.append(cur_inputs_embeds_no_im[i])
                cur_new_labels.append(cur_labels_noim[i])
                if i < num_images:
                    cur_pixel_values = pixel_values[cur_image_idx]
                    cur_image_idx += 1
                    cur_new_inputs_embeds.append(cur_pixel_values)
                    cur_new_labels.append(
                        torch.full((cur_pixel_values.shape[0], ),
                                   IGNORE_INDEX,
                                   device=cur_labels.device,
                                   dtype=cur_labels.dtype))

            cur_new_inputs_embeds = torch.cat(cur_new_inputs_embeds)
            cur_new_labels = torch.cat(cur_new_labels)

            new_inputs_embeds.append(cur_new_inputs_embeds)
            new_labels.append(cur_new_labels)

        # Combine them
        max_len = max(x.shape[0] for x in new_inputs_embeds)
        batch_size = len(new_inputs_embeds)

        new_inputs_embeds_padded = []
        new_labels_padded = torch.full((batch_size, max_len),
                                       IGNORE_INDEX,
                                       dtype=new_labels[0].dtype,
                                       device=new_labels[0].device)
        attention_mask = torch.zeros((batch_size, max_len),
                                     dtype=attention_mask[0].dtype,
                                     device=attention_mask[0].device)
        position_ids = torch.zeros((batch_size, max_len),
                                   dtype=position_ids.dtype,
                                   device=position_ids.device)

        for i, (cur_new_embed,
                cur_new_labels) in enumerate(zip(new_inputs_embeds, new_labels)):
            cur_len = cur_new_embed.shape[0]
            new_inputs_embeds_padded.append(
                torch.cat((cur_new_embed,
                           torch.zeros((max_len - cur_len, cur_new_embed.shape[1]),
                                       dtype=cur_new_embed.dtype,
                                       device=cur_new_embed.device)),
                          dim=0))
            if cur_len > 0:
                new_labels_padded[i, :cur_len] = cur_new_labels
                attention_mask[i, :cur_len] = True
                position_ids[i, :cur_len] = torch.arange(
                    0,
                    cur_len,
                    dtype=position_ids.dtype,
                    device=position_ids.device)

        new_inputs_embeds = torch.stack(new_inputs_embeds_padded, dim=0)

        if _labels is None:
            new_labels = None
        else:
            new_labels = new_labels_padded

        if _attention_mask is None:
            attention_mask = None
        elif isinstance(_attention_mask, list):
            attention_mask = attention_mask.to(dtype=_attention_mask[0].dtype)
        else:
            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

        if _position_ids is None:
            position_ids = None

        return {
            'input_ids': None,
            'position_ids': position_ids,
            'attention_mask': attention_mask,
            'past_key_values': past_key_values,
            'inputs_embeds': new_inputs_embeds,
            'labels': new_labels
        }

    def generate(
        self,
        texts: list[str],
        videos: list[list[Image.Image]] = None,
        pixel_values: torch.Tensor = None,
        return_pixel_values: bool = False
    ) -> list[str] | tuple[list[str], torch.Tensor]:
        """Generate responses for video and text inputs.

        Args:
            texts (list[str]): list of text inputs
            videos (list[list[Image.Image]], optional): list of frame lists. Defaults to None.
            pixel_values (torch.Tensor, optional): precomputed visual features. Defaults to None.
            return_pixel_values (bool, optional): whether to also return the visual features. Defaults to False.

        Returns:
            list[str] | tuple[list[str], torch.Tensor]: generated responses, optionally with the visual features
        """
        prediction = []
        # Get visual embeddings
        if pixel_values is None:
            frames, split_sizes = [], []
            for i in videos:
                frames += i
                split_sizes.append(len(i))
            pixel_values = self.image_processor(frames, return_tensors='pt')['pixel_values'].to(self.torch_dtype).to(self.device_map)
            pixel_values = self.visual_encoder(pixel_values, output_hidden_states=True).hidden_states[self.visual_select_layer]
            pixel_values = self.projector(pixel_values, split_sizes)

        for i, t in enumerate(texts):
            et = self.chat_template.format(input=t).replace(DEFAULT_VIDEO_TOKEN, DEFAULT_IMAGE_TOKEN).split(DEFAULT_IMAGE_TOKEN)
            assert len(et) == 2, f'Wrong input format for {t}'
            input_ids = [torch.tensor(self.tokenizer.encode(et[0]) + [IMAGE_TOKEN_INDEX] + self.tokenizer.encode(et[1], add_special_tokens=False), device=self.device_map)]
            mm_inputs = self.prepare_inputs_labels_for_multimodal(
                input_ids=input_ids,
                pixel_values=pixel_values[i:i+1]
            )
            generate_output = self.llm.generate(
                **mm_inputs,
                **self.generation_args
            )[0]
            prediction.append(self.tokenizer.decode(generate_output, skip_special_tokens=True))

        if return_pixel_values:
            return prediction, pixel_values
        else:
            return prediction
visual_encoder_adapter/README.md
ADDED
@@ -0,0 +1,202 @@
---
base_model: /group/40006/jaronfei/models/siglip-so400m-patch14-384
library_name: peft
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]

### Framework versions

- PEFT 0.11.1
visual_encoder_adapter/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
{
  "alpha_pattern": {},
  "auto_mapping": {
    "base_model_class": "SiglipVisionModel",
    "parent_library": "xtuner.model.modules.visual_encoder.factory"
  },
  "base_model_name_or_path": "/group/40006/jaronfei/models/siglip-so400m-patch14-384",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 16,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 64,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "k_proj",
    "fc2",
    "q_proj",
    "v_proj",
    "fc1",
    "out_proj"
  ],
  "task_type": null,
  "use_dora": false,
  "use_rslora": false
}
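
Like the LLM adapter, this LoRA (rank 64, alpha 16, on the SigLIP attention and MLP layers) is attached in `videoccam.py` via `PeftModel.from_pretrained`. A minimal sketch, assuming the base encoder is pulled from the Hub rather than the local path in `base_model_name_or_path`:

```python
import torch
from peft import PeftModel
from transformers import SiglipVisionModel

visual_encoder = SiglipVisionModel.from_pretrained(
    'google/siglip-so400m-patch14-384',
    torch_dtype=torch.bfloat16,
)
visual_encoder = PeftModel.from_pretrained(visual_encoder, 'visual_encoder_adapter')  # directory containing this config
```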
visual_encoder_adapter/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:89c005eb2005adfb28cd7e9888b3333e73422b5c0ea3f47dd6abdb21cef00328
size 71302368