XiaoyiYangRIT
committed
Commit · 7ea733c
1 Parent(s): 7aa5317
Update some files

- src/__init__.py +5 -0
- src/model_loader.py +62 -0
- src/prompt.py +7 -0
- src/video_utils.py +94 -0
src/__init__.py
ADDED
@@ -0,0 +1,5 @@
+# src/__init__.py
+# Empty file that marks the src folder as a Python package.
+# This allows module imports such as `from src import model_loader`.
+
+__version__ = "0.1.0"
src/model_loader.py
ADDED
@@ -0,0 +1,62 @@
+# src/model_loader.py
+import os
+import math
+import torch
+from transformers import AutoModel, AutoTokenizer, AutoConfig
+from huggingface_hub import snapshot_download
+
+MODEL_NAME = "OpenGVLab/InternVL3-14B"
+CACHE_DIR = "/data/internvl3_model"
+
+# === Automatically distribute model layers across multiple GPUs (recommended for InternVL3) ===
+def split_model(model_path):
+    device_map = {}
+    world_size = torch.cuda.device_count()
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    num_layers = config.llm_config.num_hidden_layers
+
+    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+    num_layers_per_gpu = [num_layers_per_gpu] * world_size
+    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+
+    layer_cnt = 0
+    for i, num_layer in enumerate(num_layers_per_gpu):
+        for _ in range(num_layer):
+            device_map[f'language_model.model.layers.{layer_cnt}'] = i
+            layer_cnt += 1
+
+    # Pin the fixed components to GPU 0
+    for key in [
+        'vision_model', 'mlp1',
+        'language_model.model.tok_embeddings',
+        'language_model.model.embed_tokens',
+        'language_model.output',
+        'language_model.model.norm',
+        'language_model.model.rotary_emb',
+        'language_model.lm_head',
+        f'language_model.model.layers.{num_layers - 1}'
+    ]:
+        device_map[key] = 0
+
+    return device_map
+
+# === Model loading function ===
+def load_model():
+    if not os.path.exists(CACHE_DIR):
+        print("⏬ First run: downloading model to persistent storage...")
+        snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)
+    else:
+        print("✅ Loading model from persistent cache.")
+
+    device_map = split_model(CACHE_DIR)
+    tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
+    model = AutoModel.from_pretrained(
+        CACHE_DIR,
+        torch_dtype=torch.bfloat16,
+        low_cpu_mem_usage=True,
+        use_flash_attn=False,  # or True, if FlashAttention is confirmed installed
+        trust_remote_code=True,
+        device_map=device_map
+    ).eval()
+
+    return tokenizer, model
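Note on split_model: dividing by world_size - 0.5 gives GPU 0 only half a share of LLM layers, since it also hosts the vision tower, the embeddings, and the output head. A minimal sketch of the arithmetic, assuming for illustration a 48-layer LLM on 2 GPUs (the real layer count comes from the checkpoint config):

import math

# Hypothetical figures for illustration; real values come from AutoConfig.
num_layers, world_size = 48, 2
per_gpu = [math.ceil(num_layers / (world_size - 0.5))] * world_size  # [32, 32]
per_gpu[0] = math.ceil(per_gpu[0] * 0.5)                             # [16, 32]
# Layers 0-15 land on GPU 0 and layers 16-47 on GPU 1; split_model then
# re-pins the final layer and the fixed components back to GPU 0.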
src/prompt.py
ADDED
@@ -0,0 +1,7 @@
+# src/prompt.py
+
+def build_video_prompt(num_frames: int) -> str:
+    """Build a single-turn AR video evaluation prompt for InternVL3."""
+    frame_descriptors = ''.join([f"Frame{i+1}: <image>\n" for i in range(num_frames)])
+    final_prompt = frame_descriptors + "Evaluate the quality of AR occlusion and rendering in the uploaded video."
+    return final_prompt
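For reference, build_video_prompt(2) returns the string below; the model's chat interface later binds each <image> placeholder to one frame's patch tensors:

Frame1: <image>
Frame2: <image>
Evaluate the quality of AR occlusion and rendering in the uploaded video.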
src/video_utils.py
ADDED
@@ -0,0 +1,94 @@
+# src/video_utils.py
+import numpy as np
+import torch
+from PIL import Image
+from decord import VideoReader, cpu
+import torchvision.transforms as T
+from torchvision.transforms.functional import InterpolationMode
+from src.prompt import build_video_prompt
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+# === Build the standard image preprocessing transform ===
+def build_transform(input_size=448):
+    return T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+    ])
+
+# === InternVL3 video frame sampling strategy ===
+def get_frame_indices(num_frames, total_frames):
+    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+    return indices
+
+# === Extract frames from the video and preprocess them into patch tensors ===
+def process_video_for_internvl3(video_path, num_segments=8, max_patch_per_frame=1, input_size=448):
+    vr = VideoReader(video_path, ctx=cpu(0))
+    total_frames = len(vr)
+    frame_indices = get_frame_indices(num_segments, total_frames)
+    transform = build_transform(input_size)
+
+    pixel_values_list, num_patches_list = [], []
+    for idx in frame_indices:
+        img = Image.fromarray(vr[idx].asnumpy()).convert("RGB")
+        patches = dynamic_preprocess(img, image_size=input_size, max_num=max_patch_per_frame)
+        patch_tensors = [transform(tile) for tile in patches]
+        patch_tensor = torch.stack(patch_tensors)
+        pixel_values_list.append(patch_tensor)
+        num_patches_list.append(patch_tensor.shape[0])
+
+    pixel_values = torch.cat(pixel_values_list, dim=0).to(torch.bfloat16).cuda()
+    prompt = build_video_prompt(len(num_patches_list))
+
+    return pixel_values, num_patches_list, prompt
+
+# === Tile an image into patch blocks ===
+def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # Enumerate candidate tiling ratios
+    target_ratios = set(
+        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    best_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+    target_width = image_size * best_ratio[0]
+    target_height = image_size * best_ratio[1]
+    blocks = best_ratio[0] * best_ratio[1]
+
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size
+        )
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+
+    return processed_images
+
+# === Find the tiling scheme closest to the original aspect ratio ===
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float('inf')
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect = ratio[0] / ratio[1]
+        diff = abs(aspect_ratio - target_aspect)
+        if diff < best_ratio_diff or (diff == best_ratio_diff and area > 0.5 * image_size**2 * ratio[0] * ratio[1]):
+            best_ratio_diff = diff
+            best_ratio = ratio
+    return best_ratio
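Taken together, a minimal end-to-end sketch of how these modules would be wired up. The model.chat(...) call follows the signature shown in the InternVL model-card examples (loaded via trust_remote_code) and should be verified against the remote code the checkpoint actually ships; "example.mp4" is a placeholder path:

from src.model_loader import load_model
from src.video_utils import process_video_for_internvl3

tokenizer, model = load_model()

# "example.mp4" is a placeholder; num_segments=8 matches the default above.
pixel_values, num_patches_list, prompt = process_video_for_internvl3("example.mp4")

# chat() as documented on the InternVL model cards (assumption: confirm
# against the downloaded modeling code before relying on it).
generation_config = dict(max_new_tokens=512, do_sample=False)
response = model.chat(tokenizer, pixel_values, prompt, generation_config,
                      num_patches_list=num_patches_list)
print(response)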