---
base_model: microsoft/Phi-3-vision-128k-instruct
library_name: peft
---
# Model Details
- **Developed by:** Jian Chen
- **Model type:** MLLM-based encoder
- **Finetuned from model:** [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)
## Model Sources
- **GitHub:** [SV-RAG](https://github.com/puar-playground/SV-RAG)
- **Paper:** [SV-RAG: LoRA-Contextualizing Adaptation of Large Multimodal Models for Long Document Understanding](https://arxiv.org/abs/2411.01106)
## Uses
A demo script is provided in the [SV-RAG GitHub repository](https://github.com/puar-playground/SV-RAG/blob/main/test_retrieval.py).
Alternatively, the code below gives a more detailed breakdown of the computation. It relies on a customized [`colpali_engine`](https://github.com/puar-playground/SV-RAG/tree/main/colpali_engine), which is available in the same repository.
```
import os
from typing import List

import torch
from PIL import Image

from colpali_engine.models import ColPhi, ColPhiProcessor


# In the SV-RAG repository this class subclasses a shared BaseRetriever
# interface; the base class is omitted here so the example is self-contained.
class ColPhiRetriever:
    """Retriever class using ColPhi for multimodal retrieval."""

    def __init__(self, model_name="puar-playground/Col-Phi-3-V",
                 device="cuda" if torch.cuda.is_available() else "cpu"):
        """
        Initializes the ColPhi model.

        Args:
            model_name (str): The model identifier.
            device (str): Device to run the model on ('cuda' or 'cpu').
        """
        # The customized colpali_engine expects this pinned transformers version.
        os.system('pip install transformers==4.47.1')
        self.multimodel = True
        self.device = device
        self.model = ColPhi.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map=device,
        ).eval()
        self.processor = ColPhiProcessor.from_pretrained(model_name)

    @staticmethod
    def pad_and_cat_tensors(tensor_list):
        # Find the maximum length of the second dimension across all tensors
        max_x = max(tensor.size(1) for tensor in tensor_list)
        # Pad every tensor to the same size in the second dimension
        padded_tensors = []
        for tensor in tensor_list:
            padding_size = max_x - tensor.size(1)
            # Pad with zeros on the right of the second dimension
            padded_tensor = torch.nn.functional.pad(tensor, (0, 0, 0, padding_size))
            padded_tensors.append(padded_tensor)
        # Concatenate the padded tensors along the first (batch) dimension
        return torch.cat(padded_tensors, dim=0)

    def process_text(self, query_list: List[str], batch_size: int = 2):
        """
        Processes a list of text queries into embeddings using ColPhi in batches.

        Args:
            query_list (List[str]): List of query texts.
            batch_size (int): Number of queries processed per batch.

        Returns:
            torch.Tensor: Concatenated embeddings for all queries.
        """
        all_embeddings = []
        for i in range(0, len(query_list), batch_size):
            batch_queries = query_list[i:i + batch_size]
            # Convert queries to model-compatible inputs
            batch_inputs = self.processor.process_queries(batch_queries).to(self.model.device)
            with torch.no_grad():
                batch_embeddings = self.model(**batch_inputs)
            all_embeddings.append(batch_embeddings.to("cpu"))
        # Pad to a common length and concatenate all batches into a single tensor
        return self.pad_and_cat_tensors(all_embeddings)

    def process_image(self, image_dir_list: List[str], batch_size: int = 1):
        """Processes a list of image paths into embeddings using ColPhi."""
        all_embeddings = []
        with torch.no_grad():
            for i in range(0, len(image_dir_list), batch_size):
                batch_img_dirs = image_dir_list[i:i + batch_size]
                image_list = [Image.open(img_dir) for img_dir in batch_img_dirs]
                # Process the batch of images into model inputs (e.g., 'pixel_values')
                batch_features = self.processor.process_images(image_list)
                batch_images = {k: v.to(self.model.device) for k, v in batch_features.items()}
                # Forward pass, then move embeddings to CPU
                embeddings = self.model(**batch_images).to("cpu")
                all_embeddings.append(embeddings)
        # Concatenate all processed batches into a single tensor
        return torch.cat(all_embeddings, dim=0)

    def compute_similarity(self, text_embeddings, image_embeddings):
        """Computes late-interaction similarity scores between text and image embeddings."""
        return self.processor.score_multi_vector(text_embeddings, image_embeddings)

    def retrieve(self, query_list: List[str], image_list: List[str]):
        """Ranks the candidate images for each query by similarity score."""
        with torch.no_grad():
            text_embeddings = self.process_text(query_list)
            image_embeddings = self.process_image(image_list)
            similarity_score = self.compute_similarity(text_embeddings, image_embeddings)
        values, top_indices = similarity_score.sort(descending=True)
        return values, top_indices
```
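A minimal usage sketch, assuming the class above is defined and document pages have already been rendered to images; the query string and the paths `page_1.png` / `page_2.png` are hypothetical placeholders.
```
retriever = ColPhiRetriever()

queries = ["What is the total revenue reported in 2023?"]       # example query text
page_images = ["page_1.png", "page_2.png"]                      # hypothetical paths to rendered pages

# `values` holds sorted similarity scores; `top_indices` gives the page ranking per query.
values, top_indices = retriever.retrieve(queries, page_images)
print(top_indices[0].tolist())  # pages ranked from most to least relevant for the first query
```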
## Citation
```
@article{chen2024lora,
title={LoRA-Contextualizing Adaptation of Large Multimodal Models for Long Document Understanding},
author={Chen, Jian and Zhang, Ruiyi and Zhou, Yufan and Yu, Tong and Dernoncourt, Franck and Gu, Jiuxiang and Rossi, Ryan A and Chen, Changyou and Sun, Tong},
journal={arXiv preprint arXiv:2411.01106},
year={2024}
}
```