koukandre committed
Commit
64bc3d2
1 Parent(s): 94d4306

Update README.md

Files changed (1): README.md (+11, -14)
README.md CHANGED
@@ -50,20 +50,17 @@ This dual capability makes it an excellent tool for multimodal retrieval-augment
  ```python
  !pip install transformers einops timm pillow
  from transformers import AutoModel
- from numpy.linalg import norm
-
- cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))

  # Initialize the model
  model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)

  # New meaningful sentences
- sentences = ['Bridge close-shot', 'Bridge in far away']
+ sentences = ['A blue cat', 'A red cat']

  # Public image URLs
  image_urls = [
- 'https://fastly.picsum.photos/id/74/4288/2848.jpg?hmac=q02MzzHG23nkhJYRXR-_RgKTr6fpfwRgcXgE0EKvNB8',
- 'https://fastly.picsum.photos/id/84/1280/848.jpg?hmac=YFRYDI4UsfbeTzI8ZakNOR98wVU7a-9a2tGF542539s'
+ 'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
+ 'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
  ]

  # Encode text and images
@@ -71,11 +68,11 @@ text_embeddings = model.encode_text(sentences)
  image_embeddings = model.encode_image(image_urls) # also accepts PIL.image, local filenames, dataURI

  # Compute similarities
- print(cos_sim(text_embeddings[0], text_embeddings[1])) # text embedding similarity
- print(cos_sim(text_embeddings[0], image_embeddings[0])) # text-image cross-modal similarity
- print(cos_sim(text_embeddings[0], image_embeddings[1])) # text-image cross-modal similarity
- print(cos_sim(text_embeddings[1], image_embeddings[0])) # text-image cross-modal similarity
- print(cos_sim(text_embeddings[1], image_embeddings[1])) # text-image cross-modal similarity
+ print(text_embeddings[0] @ text_embeddings[1].T) # text embedding similarity
+ print(text_embeddings[0] @ image_embeddings[0].T) # text-image cross-modal similarity
+ print(text_embeddings[0] @ image_embeddings[1].T) # text-image cross-modal similarity
+ print(text_embeddings[1] @ image_embeddings[0].T) # text-image cross-modal similarity
+ print(text_embeddings[1] @ image_embeddings[1].T) # text-image cross-modal similarity
  ```

  3. JavaScript developers can use Jina CLIP via the [Transformers.js](https://huggingface.co/docs/transformers.js) library. Note that to use this model, you need to install Transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source using `npm install xenova/transformers.js#v3`.
@@ -92,7 +89,7 @@ const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patc
  const vision_model = await CLIPVisionModelWithProjection.from_pretrained('jinaai/jina-clip-v1');

  // Run tokenization
- const texts = ['Bridge close-shot', 'Bridge in far away'];
+ const texts = ['A blue cat', 'A red cat'];
  const text_inputs = tokenizer(texts, { padding: true, truncation: true });

  // Compute text embeddings
@@ -100,8 +97,8 @@ const { text_embeds } = await text_model(text_inputs);

  // Read images and run processor
  const urls = [
- 'https://fastly.picsum.photos/id/74/4288/2848.jpg?hmac=q02MzzHG23nkhJYRXR-_RgKTr6fpfwRgcXgE0EKvNB8',
- 'https://fastly.picsum.photos/id/84/1280/848.jpg?hmac=YFRYDI4UsfbeTzI8ZakNOR98wVU7a-9a2tGF542539s',
+ 'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
+ 'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
  ];
  const image = await Promise.all(urls.map(url => RawImage.read(url)));
  const image_inputs = await processor(image);
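For reference, the new Python example assembles from the hunks above into a single runnable snippet; every line below comes from the updated README (the `!pip install` line assumes a notebook-style environment):

```python
!pip install transformers einops timm pillow
from transformers import AutoModel

# Initialize the model
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)

# New meaningful sentences
sentences = ['A blue cat', 'A red cat']

# Public image URLs
image_urls = [
    'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
    'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
]

# Encode text and images
text_embeddings = model.encode_text(sentences)
image_embeddings = model.encode_image(image_urls)  # also accepts PIL.image, local filenames, dataURI

# Compute similarities
print(text_embeddings[0] @ text_embeddings[1].T)   # text embedding similarity
print(text_embeddings[0] @ image_embeddings[0].T)  # text-image cross-modal similarity
print(text_embeddings[0] @ image_embeddings[1].T)  # text-image cross-modal similarity
print(text_embeddings[1] @ image_embeddings[0].T)  # text-image cross-modal similarity
print(text_embeddings[1] @ image_embeddings[1].T)  # text-image cross-modal similarity
```

As the inline comment notes, `encode_image` also accepts PIL images, local file paths, and data URIs, so the URL list can be swapped for local assets without other changes.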
 
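One caveat on the similarity computation: the updated example prints raw dot products (`a @ b.T`), which match cosine similarity only if the embeddings returned by `encode_text` / `encode_image` are L2-normalized. If explicitly normalized scores are wanted regardless of embedding scale, the removed helper can be reintroduced; a minimal NumPy sketch, reusing `text_embeddings` and `image_embeddings` from the snippet above:

```python
import numpy as np

# Cosine similarity with explicit L2 normalization, mirroring the removed
# cos_sim lambda; unlike a raw dot product, the result is scale-invariant.
def cos_sim(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return (a @ b.T) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cos_sim(text_embeddings[0], image_embeddings[0]))  # text-image cross-modal similarity
```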