Bia
/

CORAL

Safetensors

qwen2_5_vl

Model card Files Files and versions

xet

Community

Bia committed 14 days ago

Commit

e064cb5

verified ·

1 Parent(s): 85baf14

Update README.md

Browse files

Files changed (1) hide show

README.md +21 -13

README.md CHANGED Viewed

@@ -33,11 +33,16 @@ We introduce CORAL, a multi-modal embedding model built upon Qwen2.5-3B-Instruct
 CORAL is short for Contrastive Reconstruction for Multimodal Retrieval. The loss function of CORAL consists of three components: Contrastive Learning Loss, Vision Reconstruction Loss, and Masked Language Modeling Loss. During training, we reconstruct both the query and its corresponding positive sample.
 <p align="center">
-  <img src="https://merit-2025.github.io/static/images/part3/method.png" alt="CORAL Structure" style="width: 100%; max-width: 500px;">
 </p>
 <p align="center"><b>Overview of CORAL</b></p>
 ## Usage
@@ -46,6 +51,7 @@ CORAL is short for Contrastive Reconstruction for Multimodal Retrieval. The loss
 We provide the CORAL checkpoint on Hugging Face. You can load the model using the following code:
 ```python
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
@@ -54,6 +60,7 @@ from qwen_vl_utils import process_vision_info
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "Bia/CORAL", torch_dtype="auto", device_map="auto"
 )
 processor = AutoProcessor.from_pretrained("Bia/CORAL")
 ## Prepare Inputs
@@ -64,12 +71,12 @@ query = [
             {"type": "text", "text": "Find a product of backpack that have the same brand with <Product 1> \n "},
             {
                 "type": "image",
-                "image": "images/product_1.jpg",
             },
             {"type": "text", "text": "\n Ransel MOSSDOOM Polyester dengan Ruang Komputer dan Penyimpanan Besar, Ukuran $30 \times 12 \times 38$ cm , Berat 0.32 kg. </Product 1> and the same fashion style with <Product 2> "},
             {
                 "type": "image",
-                "image": "images/product_2.jpg",
             },
             {"type": "text", "text": "\n Elegant Pink Flats with Low Heel and Buckle Closure for Stylish Party Wear </Product 2> with a quilted texture and a chain strap."}
             ],
@@ -83,7 +90,7 @@ candidate = [
             {"type": "text", "text": "Represent the given product: "},
             {
                 "type": "image",
-                "image": "images/product_3.jpg",
             },
             {"type": "text", "text": "\n MOSSDOOM Elegant Pink PU Leather Handbag with Chain Strap and Large Capacity, Compact Size $18 \times 9.5 \times 15 \mathrm{~cm}$."},
         ],
@@ -120,19 +127,20 @@ candidate_inputs = processor(
 # Encode Embeddings
-query_outputs = model(**query_inputs, return_dict=True, output_hidden_states=True)
-query_embedding = query_outputs.hidden_states[-1][:,-1,:]
-query_embedding = torch.nn.functional.normalize(query_embedding, dim=-1)
-print(query_embedding.shape)  # torch.Size([1, 2048])
-candidate_outputs = model(**inputs, return_dict=True, output_hidden_states=True)
-candidate_embedding = candidate_outputs.hidden_states[-1][:,-1,:]
-candidate_embedding = torch.nn.functional.normalize(candidate_embedding, dim=-1)
-print(candidate_embedding.shape)  # torch.Size([1, 2048])
 # Compute Similarity
 similarity = torch.matmul(query_embedding, candidate_embedding.T)
-print(similarity)  # tensor([[0.7650]], device='cuda:0')
 ```
 ## Evaluation

 CORAL is short for Contrastive Reconstruction for Multimodal Retrieval. The loss function of CORAL consists of three components: Contrastive Learning Loss, Vision Reconstruction Loss, and Masked Language Modeling Loss. During training, we reconstruct both the query and its corresponding positive sample.
 <p align="center">
+  <img src="https://merit-2025.github.io/static/images/part3/method.png" alt="CORAL Overview" style="width: 100%; max-width: 600px;">
 </p>
 <p align="center"><b>Overview of CORAL</b></p>
+<p align="center">
+  <img src="images/example.jpg" alt="Example" style="width: 100%; max-width: 600px;">
+</p>
+<p align="center"><b>Example Query and Ground Truth</b></p>
 ## Usage
 We provide the CORAL checkpoint on Hugging Face. You can load the model using the following code:
 ```python
+import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "Bia/CORAL", torch_dtype="auto", device_map="auto"
 )
 processor = AutoProcessor.from_pretrained("Bia/CORAL")
 ## Prepare Inputs
             {"type": "text", "text": "Find a product of backpack that have the same brand with <Product 1> \n "},
             {
                 "type": "image",
+                "image": "CORAL/images/product_1.png",
             },
             {"type": "text", "text": "\n Ransel MOSSDOOM Polyester dengan Ruang Komputer dan Penyimpanan Besar, Ukuran $30 \times 12 \times 38$ cm , Berat 0.32 kg. </Product 1> and the same fashion style with <Product 2> "},
             {
                 "type": "image",
+                "image": "CORAL/images/product_2.png",
             },
             {"type": "text", "text": "\n Elegant Pink Flats with Low Heel and Buckle Closure for Stylish Party Wear </Product 2> with a quilted texture and a chain strap."}
             ],
             {"type": "text", "text": "Represent the given product: "},
             {
                 "type": "image",
+                "image": "CORAL/images/product_3.png",
             },
             {"type": "text", "text": "\n MOSSDOOM Elegant Pink PU Leather Handbag with Chain Strap and Large Capacity, Compact Size $18 \times 9.5 \times 15 \mathrm{~cm}$."},
         ],
 # Encode Embeddings
+with torch.inference_mode():
+    query_outputs = model(**query_inputs, return_dict=True, output_hidden_states=True)
+    query_embedding = query_outputs.hidden_states[-1][:,-1,:]
+    query_embedding = torch.nn.functional.normalize(query_embedding, dim=-1)
+    print(query_embedding.shape)  # torch.Size([1, 2048])
+    candidate_outputs = model(**candidate_inputs, return_dict=True, output_hidden_states=True)
+    candidate_embedding = candidate_outputs.hidden_states[-1][:,-1,:]
+    candidate_embedding = torch.nn.functional.normalize(candidate_embedding, dim=-1)
+    print(candidate_embedding.shape)  # torch.Size([1, 2048])
 # Compute Similarity
 similarity = torch.matmul(query_embedding, candidate_embedding.T)
+print(similarity)  # tensor([[0.6992]], device='cuda:0', dtype=torch.bfloat16)
 ```
 ## Evaluation