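# Batch inference with Qwen2.5-VL: for each sample in a perception dataset,
# build a video + image + text chat message, run generation, and save the
# model output alongside the reference answer.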
import json

import torch
from tqdm import tqdm
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

def read_json(file_path):
    """Load a JSON file into a Python object."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def write_json(file_path, data):
    """Write `data` to `file_path` as pretty-printed UTF-8 JSON."""
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

print(torch.cuda.device_count())  # number of visible GPUs
model_path = "/home/zbz5349/WorkSpace/aigeeks/Qwen2.5-VL/ckpt"

# Default: load the model on the available device(s) with automatic dtype.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     model_path, torch_dtype="auto", device_map="auto"
# )

# We recommend enabling flash_attention_2 for better acceleration and memory savings, especially in multi-image and video scenarios.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained(model_path)
print(model.device)

data = read_json('/home/zbz5349/WorkSpace/aigeeks/Qwen2.5-VL/LLaMA-Factory/data/Percption.json')
save_data = []
correct_num = 0  # placeholder: never updated below

# Process samples in the half-open window [begin, end) in steps of batch_size.
begin = 0
end = 1
batch_size = 1
for batch_idx in tqdm(range(begin, end, batch_size)):
    batch = data[batch_idx:batch_idx + batch_size]

    data_list = []  # chat messages for this batch
    save_list = []  # output records (message + answer + model result)
    sd_ans = []     # reference answers for this batch
    for idx, i in enumerate(batch):
        # Template for the record we save: the media/text fields and the
        # reference answer are filled in below; "result" is filled after
        # generation.
        save_ = {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "file:///path/to/video1.mp4",
                    "max_pixels": 360 * 420,
                    "fps": 1.0,
                },
                {"type": "image", "image": "file:///path/to/image2.jpg"},
                {"type": "text", "text": "Describe this video."},
            ],
            "answer": "None",
            "result": "None",
        }
        # Template for the chat message sent to the model; the placeholder
        # paths and prompt are overwritten with this sample's data.
        messages = {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": "file:///path/to/video1.mp4",
                    "max_pixels": 360 * 420,
                    "fps": 1.0,
                },
                {"type": "image", "image": "file:///path/to/image2.jpg"},
                {"type": "text", "text": "Describe this video."},
            ],
        }

        video_path = i['videos']
        image_path = i['images']
        question = i['messages'][0]['content']
        answer = i['messages'][1]['content']

        # Point both templates at this sample's actual media and prompt.
        messages['content'][0]['video'] = video_path
        messages['content'][1]['image'] = image_path
        messages['content'][2]['text'] = question

        save_['content'][0]['video'] = video_path
        save_['content'][1]['image'] = image_path
        save_['content'][2]['text'] = question
        save_['answer'] = answer

        sd_ans.append(answer)
        data_list.append(messages)
        save_list.append(save_)

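    # data_list is passed to apply_chat_template as a single conversation,
    # so the flattened prompt below is only well-formed when batch_size == 1.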
    text = processor.apply_chat_template(data_list, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info(data_list, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to(model.device)

    # Inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)
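    # Strip the prompt tokens so only the newly generated tokens are decoded.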
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # Attach each decoded output to its record and collect it.
    for idx, x in enumerate(output_text):
        save_list[idx]['result'] = x
        save_data.append(save_list[idx])
        
print("correct_num", correct_num)
write_json("infer_answer_.json",save_data)