XiaoyiYangRIT committed
Commit 7ea733c · 1 Parent(s): 7aa5317

Update some files
Files changed (4)
  1. src/__init__.py +5 -0
  2. src/model_loader.py +62 -0
  3. src/prompt.py +7 -0
  4. src/video_utils.py +94 -0
src/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # src/__init__.py
+ # Marks the src folder as a Python package.
+ # This enables module imports such as: from src import model_loader
+
+ __version__ = "0.1.0"
src/model_loader.py ADDED
@@ -0,0 +1,62 @@
+ # src/model_loader.py
+ import os
+ import math
+ import torch
+ from transformers import AutoModel, AutoTokenizer, AutoConfig
+ from huggingface_hub import snapshot_download
+
+ MODEL_NAME = "OpenGVLab/InternVL3-14B"
+ CACHE_DIR = "/data/internvl3_model"
+
+ # === Automatically distribute model layers across GPUs (the approach recommended for InternVL3) ===
+ def split_model(model_path):
+     device_map = {}
+     world_size = torch.cuda.device_count()
+     config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+     num_layers = config.llm_config.num_hidden_layers
+
+     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+     num_layers_per_gpu = [num_layers_per_gpu] * world_size
+     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+
+     layer_cnt = 0
+     for i, num_layer in enumerate(num_layers_per_gpu):
+         for _ in range(num_layer):
+             device_map[f'language_model.model.layers.{layer_cnt}'] = i
+             layer_cnt += 1
+
+     # Keep the fixed components on GPU 0
+     for key in [
+         'vision_model', 'mlp1',
+         'language_model.model.tok_embeddings',
+         'language_model.model.embed_tokens',
+         'language_model.output',
+         'language_model.model.norm',
+         'language_model.model.rotary_emb',
+         'language_model.lm_head',
+         f'language_model.model.layers.{num_layers - 1}'
+     ]:
+         device_map[key] = 0
+
+     return device_map
+
+ # === Model loading function ===
+ def load_model():
+     if not os.path.exists(CACHE_DIR):
+         print("⏬ First run: downloading model to persistent storage...")
+         snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)
+     else:
+         print("✅ Found model in persistent cache.")
+
+     device_map = split_model(CACHE_DIR)
+     tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
+     model = AutoModel.from_pretrained(
+         CACHE_DIR,
+         torch_dtype=torch.bfloat16,
+         low_cpu_mem_usage=True,
+         use_flash_attn=False,  # or True if FlashAttention is installed
+         trust_remote_code=True,
+         device_map=device_map
+     ).eval()
+
+     return tokenizer, model
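
As a quick sanity check (my own illustration, not part of the commit), the loader can be exercised directly; hf_device_map is the attribute transformers attaches when a model is loaded with a device_map, and the __main__ guard is an addition:

if __name__ == "__main__":
    tokenizer, model = load_model()
    # Show where each module ended up; vision_model and the
    # first/last LLM layers should report device 0.
    print(model.hf_device_map)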
src/prompt.py ADDED
@@ -0,0 +1,7 @@
+ # src/prompt.py
+
+ def build_video_prompt(num_frames: int) -> str:
+     """Build a single-turn AR video evaluation prompt for InternVL3."""
+     frame_descriptors = ''.join([f"Frame{i+1}: <image>\n" for i in range(num_frames)])
+     final_prompt = frame_descriptors + "Evaluate the quality of AR occlusion and rendering in the uploaded video."
+     return final_prompt
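
For reference, build_video_prompt(2) deterministically expands to the following string, with one <image> placeholder per sampled frame:

Frame1: <image>
Frame2: <image>
Evaluate the quality of AR occlusion and rendering in the uploaded video.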
src/video_utils.py ADDED
@@ -0,0 +1,94 @@
+ # src/video_utils.py
+ import numpy as np
+ import torch
+ from PIL import Image
+ from decord import VideoReader, cpu
+ import torchvision.transforms as T
+ from torchvision.transforms.functional import InterpolationMode
+ from src.prompt import build_video_prompt
+
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
+ IMAGENET_STD = (0.229, 0.224, 0.225)
+
+ # === Build the standard image preprocessing transform ===
+ def build_transform(input_size=448):
+     return T.Compose([
+         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+         T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+         T.ToTensor(),
+         T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+     ])
+
+ # === InternVL3 video frame sampling strategy ===
+ def get_frame_indices(num_frames, total_frames):
+     indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+     return indices
+
+ # === Extract frames from a video and preprocess them into patch tensors ===
+ def process_video_for_internvl3(video_path, num_segments=8, max_patch_per_frame=1, input_size=448):
+     vr = VideoReader(video_path, ctx=cpu(0))
+     total_frames = len(vr)
+     frame_indices = get_frame_indices(num_segments, total_frames)
+     transform = build_transform(input_size)
+
+     pixel_values_list, num_patches_list = [], []
+     for idx in frame_indices:
+         img = Image.fromarray(vr[idx].asnumpy()).convert("RGB")
+         patches = dynamic_preprocess(img, image_size=input_size, max_num=max_patch_per_frame)
+         patch_tensors = [transform(tile) for tile in patches]
+         patch_tensor = torch.stack(patch_tensors)
+         pixel_values_list.append(patch_tensor)
+         num_patches_list.append(patch_tensor.shape[0])
+
+     pixel_values = torch.cat(pixel_values_list, dim=0).to(torch.bfloat16).cuda()
+     prompt = build_video_prompt(len(num_patches_list))
+
+     return pixel_values, num_patches_list, prompt
+
+ # === Slice an image into patch tiles ===
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True):
+     orig_width, orig_height = image.size
+     aspect_ratio = orig_width / orig_height
+
+     # Build the candidate tiling grids
+     target_ratios = set(
+         (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
+         if i * j <= max_num and i * j >= min_num
+     )
+     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+     best_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+     target_width = image_size * best_ratio[0]
+     target_height = image_size * best_ratio[1]
+     blocks = best_ratio[0] * best_ratio[1]
+
+     resized_img = image.resize((target_width, target_height))
+     processed_images = []
+     for i in range(blocks):
+         box = (
+             (i % (target_width // image_size)) * image_size,
+             (i // (target_width // image_size)) * image_size,
+             ((i % (target_width // image_size)) + 1) * image_size,
+             ((i // (target_width // image_size)) + 1) * image_size
+         )
+         split_img = resized_img.crop(box)
+         processed_images.append(split_img)
+
+     if use_thumbnail and len(processed_images) != 1:
+         thumbnail_img = image.resize((image_size, image_size))
+         processed_images.append(thumbnail_img)
+
+     return processed_images
+
+ # === Find the tiling that best matches the original aspect ratio ===
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+     best_ratio_diff = float('inf')
+     best_ratio = (1, 1)
+     area = width * height
+     for ratio in target_ratios:
+         target_aspect = ratio[0] / ratio[1]
+         diff = abs(aspect_ratio - target_aspect)
+         if diff < best_ratio_diff or (diff == best_ratio_diff and area > 0.5 * image_size**2 * ratio[0] * ratio[1]):
+             best_ratio_diff = diff
+             best_ratio = ratio
+     return best_ratio
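
Putting the four files together, a minimal end-to-end sketch (an illustration, not part of the commit; it assumes InternVL's model.chat(tokenizer, pixel_values, question, generation_config, num_patches_list=...) interface from the OpenGVLab model card, a hypothetical local file demo.mp4, and illustrative generation settings):

from src.model_loader import load_model
from src.video_utils import process_video_for_internvl3

tokenizer, model = load_model()
# Sample 8 frames, preprocess them, and build the matching <image> prompt.
pixel_values, num_patches_list, prompt = process_video_for_internvl3("demo.mp4", num_segments=8)
generation_config = dict(max_new_tokens=512, do_sample=False)
response = model.chat(tokenizer, pixel_values, prompt, generation_config,
                      num_patches_list=num_patches_list)
print(response)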