yanziang committed
Commit 918b491 · verified · 1 Parent(s): 0020250

Update README.md

Files changed (1): README.md (+138 −3)
README.md CHANGED
@@ -1,3 +1,138 @@
- ---
- license: apache-2.0
- ---

---
language:
- en
library_name: transformers
license: apache-2.0
metrics:
- accuracy
tags:
- multimodal
pipeline_tag: video-text-to-text
base_model: Qwen/Qwen2.5-VL-7B-Instruct
---

# 💡 VideoChat-R1_5

[\[📂 GitHub\]](https://github.com/OpenGVLab/VideoChat-R1)
[\[📜 Tech Report\]](https://arxiv.org/pdf/2509.21100v1)

## 🚀 How to use the model

We provide a simple installation example below:
```
pip install transformers
```
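
Optionally, you can sanity-check the environment before running the example below (a minimal sketch; note that `attn_implementation="flash_attention_2"` additionally requires the `flash-attn` package and a CUDA GPU):

```python
import torch
import transformers

# Qwen2.5-VL support requires a recent transformers release.
print("transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())
```
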
Use the `qwen_vl_utils` provided in https://github.com/OpenGVLab/VideoChat-R1/blob/main/Videochat-R1.5/src_eval/my_vision_process.py; the example below relies on its `process_vision_info` helper.

Then you can use our model:
```python
import ast
import re

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model_path = "OpenGVLab/VideoChat-R1_5"
# default: load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto",
    attn_implementation="flash_attention_2"
)

# default processor
processor = AutoProcessor.from_pretrained(model_path)

video_path = "your_video.mp4"
question = "your question about the video"
num_perceptions = 3  # number of iterative perception rounds

QA_THINK_GLUE = """Answer the question: "[QUESTION]" according to the content of the video.

Output your think process within the <think> </think> tags.

Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. At the same time, in the <glue> </glue> tags, present the precise time period in seconds of the video clips on which you base your answer to this question in the format of [(s1, e1), (s2, e2), ...]. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""

QA_THINK = """Answer the question: "[QUESTION]" according to the content of the video.

Output your think process within the <think> </think> tags.

Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. For example: <think>...</think><answer>A</answer>.
"""


def inference(video_path, prompt, model, processor, max_new_tokens=2048, device="cuda:0", client=None, pred_glue=None):
    # `key_time` carries the time spans predicted in the previous round so the
    # video sampler can focus on those clips.
    messages = [
        {"role": "user", "content": [
            {"type": "video",
             "video": video_path,
             "key_time": pred_glue,
             "total_pixels": 128 * 12 * 28 * 28,
             "min_pixels": 128 * 28 * 28,
             },
            {"type": "text", "text": prompt},
        ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True, client=client)
    fps_inputs = video_kwargs['fps']

    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs, padding=True, return_tensors="pt")
    inputs = inputs.to(device)

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

    generated_ids = [output_ids[i][len(inputs.input_ids[i]):] for i in range(len(output_ids))]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]


# Iterative perception: the first rounds ask for <glue> time spans, which are fed
# back as `key_time`; the final round answers the question without requesting glue.
answers = []
pred_glue = None
for perception in range(num_perceptions):

    if perception == num_perceptions - 1:
        prompt = QA_THINK.replace("[QUESTION]", question)
    else:
        prompt = QA_THINK_GLUE.replace("[QUESTION]", question)

    ans = inference(video_path, prompt, model, processor, pred_glue=pred_glue)

    pattern_glue = r'<glue>(.*?)</glue>'
    match_glue = re.search(pattern_glue, ans, re.DOTALL)
    answers.append(ans)

    try:
        if match_glue:
            glue = match_glue.group(1)
            pred_glue = ast.literal_eval(glue)
    except Exception:
        pred_glue = None
    print(ans)
```
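
To consume the outputs downstream, you may want to split each tagged response into its parts. The helper below is only a sketch (`parse_response` is not part of the repository); it assumes responses follow the format requested by the prompts above:

```python
import ast
import re

def parse_response(response: str) -> dict:
    """Extract the <think>, <answer>, and <glue> fields from a tagged response (hypothetical helper)."""
    def grab(tag):
        m = re.search(rf"<{tag}>(.*?)</{tag}>", response, re.DOTALL)
        return m.group(1).strip() if m else None

    parsed = {"think": grab("think"), "answer": grab("answer"), "glue": None}
    glue = grab("glue")
    if glue:
        try:
            parsed["glue"] = ast.literal_eval(glue)  # e.g. [(5.2, 10.4)]
        except (ValueError, SyntaxError):
            pass
    return parsed

# The last round uses QA_THINK, so its <answer> is the final prediction.
final = parse_response(answers[-1])
print(final["answer"], final["glue"])
```
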

## 📄 Citation

If you find this project useful in your research, please consider citing:
```BibTeX
@article{li2025videochatr1,
  title={VideoChat-R1: Enhancing Spatio-Temporal Perception via Reinforcement Fine-Tuning},
  author={Li, Xinhao and Yan, Ziang and Meng, Desen and Dong, Lu and Zeng, Xiangyu and He, Yinan and Wang, Yali and Qiao, Yu and Wang, Yi and Wang, Limin},
  journal={arXiv preprint arXiv:2504.06958},
  year={2025}
}

@article{yan2025videochatr15,
  title={VideoChat-R1.5: Visual Test-Time Scaling to Reinforce Multimodal Reasoning by Iterative Perception},
  author={Yan, Ziang and Li, Xinhao and He, Yinan and Yue, Zhengrong and Zeng, Xiangyu and Wang, Yali and Qiao, Yu and Wang, Limin and Wang, Yi},
  journal={arXiv preprint arXiv:2509.21100},
  year={2025}
}
```