Update README.md
Browse files
README.md
CHANGED
|
@@ -29,20 +29,7 @@ For example, DSE-QWen2-2b-MRL-V1 achieves **85.8** nDCG@5 on [ViDoRE](https://hu
|
|
| 29 |
|
| 30 |
|
| 31 |
## Note:
|
| 32 |
-
|
| 33 |
-
1. Clone the latest transformers repository: `git clone https://github.com/huggingface/transformers.git`
|
| 34 |
-
2. Fix a bug in `transformers/models/qwen2_vl/modeling_qwen2_vl.py` around line 1774
|
| 35 |
-
```
|
| 36 |
-
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
|
| 37 |
-
# change the if statement below to if cache_position is not None and cache_position[0] != 0:
|
| 38 |
-
if cache_position[0] != 0:
|
| 39 |
-
pixel_values = None
|
| 40 |
-
pixel_values_videos = None
|
| 41 |
-
```
|
| 42 |
-
3. Install the latest transformers from source: `pip install -e .`
|
| 43 |
-
4. `pip install qwen-vl-utils`
|
| 44 |
-
|
| 45 |
-
> The Qwen vision encoder may consume a large amount of GPU memory if the input image is large. Adjust `'resized_height': 680, 'resized_width': 680` (see below) to fit VRAM based on available GPU resources.
|
| 46 |
|
| 47 |
## How to Use the Model
|
| 48 |
|
|
@@ -96,7 +83,8 @@ query_texts = [
|
|
| 96 |
]
|
| 97 |
query_image_inputs, query_video_inputs = process_vision_info(query_messages)
|
| 98 |
query_inputs = processor(text=query_texts, images=query_image_inputs, videos=query_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
|
| 99 |
-
|
|
|
|
| 100 |
with torch.no_grad():
|
| 101 |
output = model(**query_inputs, return_dict=True, output_hidden_states=True)
|
| 102 |
query_embeddings = get_embedding(output.hidden_states[-1], 1536) # adjust dimensionality for efficiency trade-off, e.g. 512
|
|
@@ -138,7 +126,8 @@ doc_texts = [
|
|
| 138 |
]
|
| 139 |
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
|
| 140 |
doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
|
| 141 |
-
|
|
|
|
| 142 |
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
|
| 143 |
with torch.no_grad():
|
| 144 |
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
|
|
@@ -184,7 +173,8 @@ doc_texts = [
|
|
| 184 |
]
|
| 185 |
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
|
| 186 |
doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
|
| 187 |
-
|
|
|
|
| 188 |
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
|
| 189 |
with torch.no_grad():
|
| 190 |
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
## Note:
|
| 32 |
+
The Qwen vision encoder may consume a large amount of GPU memory if the input image is large. Adjust `'resized_height': 680, 'resized_width': 680` (see below) to fit VRAM based on available GPU resources.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
## How to Use the Model
|
| 35 |
|
|
|
|
| 83 |
]
|
| 84 |
query_image_inputs, query_video_inputs = process_vision_info(query_messages)
|
| 85 |
query_inputs = processor(text=query_texts, images=query_image_inputs, videos=query_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
|
| 86 |
+
cache_position = torch.arange(0, len(query_texts))
|
| 87 |
+
query_inputs = model.prepare_inputs_for_generation(**query_inputs, cache_position=cache_position, use_cache=False)
|
| 88 |
with torch.no_grad():
|
| 89 |
output = model(**query_inputs, return_dict=True, output_hidden_states=True)
|
| 90 |
query_embeddings = get_embedding(output.hidden_states[-1], 1536) # adjust dimensionality for efficiency trade-off, e.g. 512
|
|
|
|
| 126 |
]
|
| 127 |
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
|
| 128 |
doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
|
| 129 |
+
cache_position = torch.arange(0, len(doc_texts))
|
| 130 |
+
doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, cache_position=cache_position, use_cache=False)
|
| 131 |
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
|
| 132 |
with torch.no_grad():
|
| 133 |
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
|
|
|
|
| 173 |
]
|
| 174 |
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
|
| 175 |
doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs, padding='longest', return_tensors='pt').to('cuda:0')
|
| 176 |
+
cache_position = torch.arange(0, len(doc_texts))
|
| 177 |
+
doc_inputs = model.prepare_inputs_for_generation(**doc_inputs, cache_position=cache_position, use_cache=False)
|
| 178 |
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
|
| 179 |
with torch.no_grad():
|
| 180 |
output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
|